From a89d0193e47278723a37f353d72ef996ceaa5cf7 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 8 Jan 2026 15:02:00 +0000 Subject: [PATCH] removes patch file --- deletion-detection.patch | 129 --------------------------------------- 1 file changed, 129 deletions(-) delete mode 100644 deletion-detection.patch diff --git a/deletion-detection.patch b/deletion-detection.patch deleted file mode 100644 index b5993d3..0000000 --- a/deletion-detection.patch +++ /dev/null @@ -1,129 +0,0 @@ ---- a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py -+++ b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py -@@ -15,6 +15,7 @@ from auto_archiver.core import Extractor, Enricher, Metadata, Media - from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin - from auto_archiver.modules.antibot_extractor_enricher.dropins.default import DefaultDropin - from auto_archiver.utils.misc import random_str - from auto_archiver.utils.url import is_relevant_url -+from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted - - - class AntibotExtractorEnricher(Extractor, Enricher): -@@ -97,9 +98,18 @@ class AntibotExtractorEnricher(Extractor, Enricher): - sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit, for now we separate here but can have dropins deciding this in the future - - dropin = self._get_suitable_dropin(url, sb) - if not dropin.open_page(url): -- # TODO: could we detect deleted videos? -- logger.warning("Failed to open drop-in page") -+ # Check for deletion indicators -+ page_title = sb.get_title() -+ html_source = sb.get_page_source() -+ deletion_info = detect_deletion( -+ html_content=html_source, -+ page_title=page_title, -+ url=url -+ ) -+ if deletion_info: -+ flag_as_deleted(to_enrich, deletion_info) -+ return to_enrich -+ logger.warning("Failed to open drop-in page (not detected as deleted)") - return False - - if self.detect_auth_wall and (dropin.hit_auth_wall() and self._hit_auth_wall(sb)): -@@ -109,7 +119,18 @@ class AntibotExtractorEnricher(Extractor, Enricher): - sb.wait_for_ready_state_complete() - sb.sleep(1) # margin for the page to load completely - -- to_enrich.set_title(sb.get_title()) -+ page_title = sb.get_title() -+ html_source = sb.get_page_source() -+ -+ # Check if the page indicates content was deleted -+ deletion_info = detect_deletion( -+ html_content=html_source, -+ page_title=page_title, -+ url=url -+ ) -+ if deletion_info: -+ flag_as_deleted(to_enrich, deletion_info) -+ -+ to_enrich.set_title(page_title) - self._enrich_html_source_code(sb, to_enrich) - - self._enrich_full_page_screenshot(sb, to_enrich) ---- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py -+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py -@@ -19,6 +19,7 @@ from auto_archiver.utils.custom_logger import logger - - from auto_archiver.core.extractor import Extractor - from auto_archiver.core import Metadata, Media - from auto_archiver.utils import get_datetime_from_str - from auto_archiver.utils.misc import ydl_entry_to_filename -+from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted - from .dropin import GenericDropin - - -@@ -481,6 +482,15 @@ class GenericExtractor(Extractor): - raise SkipYtdlp() - - # don't download since it can be a live stream - data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False) -+ -+ # Check for deletion indicators in video data -+ deletion_info = detect_deletion( -+ video_data=data, -+ url=url -+ ) -+ if deletion_info: -+ result = Metadata() -+ flag_as_deleted(result, deletion_info) -+ return result - - result = _helper_for_successful_extract_info(data, info_extractor, url, ydl) - -@@ -505,6 +515,12 @@ class GenericExtractor(Extractor): - try: - result = self.get_metadata_for_post(info_extractor, url, ydl) - except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e: -+ # Check if the error indicates deletion -+ deletion_info = detect_deletion(error_message=str(post_e), url=url) -+ if deletion_info: -+ result = Metadata() -+ flag_as_deleted(result, deletion_info) -+ return result -+ - if "NSFW tweet requires authentication." in str(post_e): - logger.warning(str(post_e)) - return False ---- a/src/auto_archiver/modules/generic_extractor/twitter.py -+++ b/src/auto_archiver/modules/generic_extractor/twitter.py -@@ -7,6 +7,7 @@ from slugify import slugify - - from auto_archiver.core.metadata import Metadata, Media - from auto_archiver.utils import url as UrlUtil, get_datetime_from_str - from auto_archiver.core.extractor import Extractor -+from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted - from auto_archiver.modules.generic_extractor.dropin import GenericDropin, InfoExtractor - - -@@ -36,9 +37,18 @@ class Twitter(GenericDropin): - def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata: - result = Metadata() - try: - if not tweet.get("user") or not tweet.get("created_at"): -- raise ValueError("Error retreiving post. Are you sure it exists?") -+ # Check for deletion indicators -+ deletion_info = detect_deletion( -+ video_data=tweet, -+ url=url, -+ error_message="Missing user or created_at fields" -+ ) -+ if deletion_info: -+ flag_as_deleted(result, deletion_info) -+ return result -+ -+ raise ValueError("Error retrieving post. Are you sure it exists?") - timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y") - except (ValueError, KeyError) as ex: - logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")