--- a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py @@ -15,6 +15,7 @@ from auto_archiver.core import Extractor, Enricher, Metadata, Media from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin from auto_archiver.modules.antibot_extractor_enricher.dropins.default import DefaultDropin from auto_archiver.utils.misc import random_str from auto_archiver.utils.url import is_relevant_url +from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted class AntibotExtractorEnricher(Extractor, Enricher): @@ -97,9 +98,18 @@ class AntibotExtractorEnricher(Extractor, Enricher): sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit, for now we separate here but can have dropins deciding this in the future dropin = self._get_suitable_dropin(url, sb) if not dropin.open_page(url): - # TODO: could we detect deleted videos? - logger.warning("Failed to open drop-in page") + # Check for deletion indicators + page_title = sb.get_title() + html_source = sb.get_page_source() + deletion_info = detect_deletion( + html_content=html_source, + page_title=page_title, + url=url + ) + if deletion_info: + flag_as_deleted(to_enrich, deletion_info) + return to_enrich + logger.warning("Failed to open drop-in page (not detected as deleted)") return False if self.detect_auth_wall and (dropin.hit_auth_wall() and self._hit_auth_wall(sb)): @@ -109,7 +119,18 @@ class AntibotExtractorEnricher(Extractor, Enricher): sb.wait_for_ready_state_complete() sb.sleep(1) # margin for the page to load completely - to_enrich.set_title(sb.get_title()) + page_title = sb.get_title() + html_source = sb.get_page_source() + + # Check if the page indicates content was deleted + deletion_info = detect_deletion( + html_content=html_source, + page_title=page_title, + url=url + ) + if deletion_info: + flag_as_deleted(to_enrich, deletion_info) + + to_enrich.set_title(page_title) self._enrich_html_source_code(sb, to_enrich) self._enrich_full_page_screenshot(sb, to_enrich) --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -19,6 +19,7 @@ from auto_archiver.utils.custom_logger import logger from auto_archiver.core.extractor import Extractor from auto_archiver.core import Metadata, Media from auto_archiver.utils import get_datetime_from_str from auto_archiver.utils.misc import ydl_entry_to_filename +from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted from .dropin import GenericDropin @@ -481,6 +482,15 @@ class GenericExtractor(Extractor): raise SkipYtdlp() # don't download since it can be a live stream data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False) + + # Check for deletion indicators in video data + deletion_info = detect_deletion( + video_data=data, + url=url + ) + if deletion_info: + result = Metadata() + flag_as_deleted(result, deletion_info) + return result result = _helper_for_successful_extract_info(data, info_extractor, url, ydl) @@ -505,6 +515,12 @@ class GenericExtractor(Extractor): try: result = self.get_metadata_for_post(info_extractor, url, ydl) except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e: + # Check if the error indicates deletion + deletion_info = detect_deletion(error_message=str(post_e), url=url) + if deletion_info: + result = Metadata() + flag_as_deleted(result, deletion_info) + return result + if "NSFW tweet requires authentication." in str(post_e): logger.warning(str(post_e)) return False --- a/src/auto_archiver/modules/generic_extractor/twitter.py +++ b/src/auto_archiver/modules/generic_extractor/twitter.py @@ -7,6 +7,7 @@ from slugify import slugify from auto_archiver.core.metadata import Metadata, Media from auto_archiver.utils import url as UrlUtil, get_datetime_from_str from auto_archiver.core.extractor import Extractor +from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted from auto_archiver.modules.generic_extractor.dropin import GenericDropin, InfoExtractor @@ -36,9 +37,18 @@ class Twitter(GenericDropin): def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata: result = Metadata() try: if not tweet.get("user") or not tweet.get("created_at"): - raise ValueError("Error retreiving post. Are you sure it exists?") + # Check for deletion indicators + deletion_info = detect_deletion( + video_data=tweet, + url=url, + error_message="Missing user or created_at fields" + ) + if deletion_info: + flag_as_deleted(result, deletion_info) + return result + + raise ValueError("Error retrieving post. Are you sure it exists?") timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y") except (ValueError, KeyError) as ex: logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")