diff --git a/deletion-detection.patch b/deletion-detection.patch new file mode 100644 index 0000000..b5993d3 --- /dev/null +++ b/deletion-detection.patch @@ -0,0 +1,129 @@ +--- a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py ++++ b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py +@@ -15,6 +15,7 @@ from auto_archiver.core import Extractor, Enricher, Metadata, Media + from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin + from auto_archiver.modules.antibot_extractor_enricher.dropins.default import DefaultDropin + from auto_archiver.utils.misc import random_str + from auto_archiver.utils.url import is_relevant_url ++from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted + + + class AntibotExtractorEnricher(Extractor, Enricher): +@@ -97,9 +98,18 @@ class AntibotExtractorEnricher(Extractor, Enricher): + sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit, for now we separate here but can have dropins deciding this in the future + + dropin = self._get_suitable_dropin(url, sb) + if not dropin.open_page(url): +- # TODO: could we detect deleted videos? 
+- logger.warning("Failed to open drop-in page") ++ # Check for deletion indicators ++ page_title = sb.get_title() ++ html_source = sb.get_page_source() ++ deletion_info = detect_deletion( ++ html_content=html_source, ++ page_title=page_title, ++ url=url ++ ) ++ if deletion_info: ++ flag_as_deleted(to_enrich, deletion_info) ++ return to_enrich ++ logger.warning("Failed to open drop-in page (not detected as deleted)") + return False + + if self.detect_auth_wall and (dropin.hit_auth_wall() and self._hit_auth_wall(sb)): +@@ -109,7 +119,18 @@ class AntibotExtractorEnricher(Extractor, Enricher): + sb.wait_for_ready_state_complete() + sb.sleep(1) # margin for the page to load completely + +- to_enrich.set_title(sb.get_title()) ++ page_title = sb.get_title() ++ html_source = sb.get_page_source() ++ ++ # Check if the page indicates content was deleted ++ deletion_info = detect_deletion( ++ html_content=html_source, ++ page_title=page_title, ++ url=url ++ ) ++ if deletion_info: ++ flag_as_deleted(to_enrich, deletion_info) ++ ++ to_enrich.set_title(page_title) + self._enrich_html_source_code(sb, to_enrich) + + self._enrich_full_page_screenshot(sb, to_enrich) +--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py ++++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py +@@ -19,6 +19,7 @@ from auto_archiver.utils.custom_logger import logger + + from auto_archiver.core.extractor import Extractor + from auto_archiver.core import Metadata, Media + from auto_archiver.utils import get_datetime_from_str + from auto_archiver.utils.misc import ydl_entry_to_filename ++from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted + from .dropin import GenericDropin + + +@@ -481,6 +482,15 @@ class GenericExtractor(Extractor): + raise SkipYtdlp() + + # don't download since it can be a live stream + data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False) ++ ++ # Check for deletion indicators in video data ++ 
deletion_info = detect_deletion( ++ video_data=data, ++ url=url ++ ) ++ if deletion_info: ++ result = Metadata() ++ flag_as_deleted(result, deletion_info) ++ return result + + result = _helper_for_successful_extract_info(data, info_extractor, url, ydl) + +@@ -505,6 +515,12 @@ class GenericExtractor(Extractor): + try: + result = self.get_metadata_for_post(info_extractor, url, ydl) + except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e: ++ # Check if the error indicates deletion ++ deletion_info = detect_deletion(error_message=str(post_e), url=url) ++ if deletion_info: ++ result = Metadata() ++ flag_as_deleted(result, deletion_info) ++ return result ++ + if "NSFW tweet requires authentication." in str(post_e): + logger.warning(str(post_e)) + return False +--- a/src/auto_archiver/modules/generic_extractor/twitter.py ++++ b/src/auto_archiver/modules/generic_extractor/twitter.py +@@ -7,6 +7,7 @@ from slugify import slugify + + from auto_archiver.core.metadata import Metadata, Media + from auto_archiver.utils import url as UrlUtil, get_datetime_from_str + from auto_archiver.core.extractor import Extractor ++from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted + from auto_archiver.modules.generic_extractor.dropin import GenericDropin, InfoExtractor + + +@@ -36,9 +37,18 @@ class Twitter(GenericDropin): + def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata: + result = Metadata() + try: + if not tweet.get("user") or not tweet.get("created_at"): +- raise ValueError("Error retreiving post. Are you sure it exists?") ++ # Check for deletion indicators ++ deletion_info = detect_deletion( ++ video_data=tweet, ++ url=url, ++ error_message="Missing user or created_at fields" ++ ) ++ if deletion_info: ++ flag_as_deleted(result, deletion_info) ++ return result ++ ++ raise ValueError("Error retrieving post. 
Are you sure it exists?") + timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y") + except (ValueError, KeyError) as ex: + logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}") diff --git a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py index ba1dfda..6d2881a 100644 --- a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py +++ b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py @@ -16,6 +16,7 @@ from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin from auto_archiver.modules.antibot_extractor_enricher.dropins.default import DefaultDropin from auto_archiver.utils.misc import random_str from auto_archiver.utils.url import is_relevant_url +from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted class AntibotExtractorEnricher(Extractor, Enricher): @@ -98,8 +99,18 @@ class AntibotExtractorEnricher(Extractor, Enricher): dropin = self._get_suitable_dropin(url, sb) if not dropin.open_page(url): - # TODO: could we detect deleted videos? 
- logger.warning("Failed to open drop-in page") + # Check for deletion indicators + page_title = sb.get_title() + html_source = sb.get_page_source() + deletion_info = detect_deletion( + html_content=html_source, + page_title=page_title, + url=url + ) + if deletion_info: + flag_as_deleted(to_enrich, deletion_info) + return to_enrich + logger.warning("Failed to open drop-in page (not detected as deleted)") return False if self.detect_auth_wall and (dropin.hit_auth_wall() and self._hit_auth_wall(sb)): @@ -109,7 +120,19 @@ class AntibotExtractorEnricher(Extractor, Enricher): sb.wait_for_ready_state_complete() sb.sleep(1) # margin for the page to load completely - to_enrich.set_title(sb.get_title()) + page_title = sb.get_title() + html_source = sb.get_page_source() + + # Check if the page indicates content was deleted + deletion_info = detect_deletion( + html_content=html_source, + page_title=page_title, + url=url + ) + if deletion_info: + flag_as_deleted(to_enrich, deletion_info) + + to_enrich.set_title(page_title) self._enrich_html_source_code(sb, to_enrich) self._enrich_full_page_screenshot(sb, to_enrich) diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index dae9381..66f1c87 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -21,6 +21,7 @@ from auto_archiver.core.extractor import Extractor from auto_archiver.core import Metadata, Media from auto_archiver.utils import get_datetime_from_str from auto_archiver.utils.misc import ydl_entry_to_filename +from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted from .dropin import GenericDropin @@ -484,6 +485,16 @@ class GenericExtractor(Extractor): # don't download since it can be a live stream data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False) + # Check for deletion indicators 
in video data + deletion_info = detect_deletion( + video_data=data, + url=url + ) + if deletion_info: + result = Metadata() + flag_as_deleted(result, deletion_info) + return result + result = _helper_for_successful_extract_info(data, info_extractor, url, ydl) except MaxDownloadsReached: @@ -503,6 +514,13 @@ class GenericExtractor(Extractor): try: result = self.get_metadata_for_post(info_extractor, url, ydl) except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e: + # Check if the error indicates deletion + deletion_info = detect_deletion(error_message=str(post_e), url=url) + if deletion_info: + result = Metadata() + flag_as_deleted(result, deletion_info) + return result + if "NSFW tweet requires authentication." in str(post_e): logger.warning(str(post_e)) return False diff --git a/src/auto_archiver/modules/generic_extractor/twitter.py b/src/auto_archiver/modules/generic_extractor/twitter.py index 5153f1c..2308add 100644 --- a/src/auto_archiver/modules/generic_extractor/twitter.py +++ b/src/auto_archiver/modules/generic_extractor/twitter.py @@ -7,6 +7,7 @@ from slugify import slugify from auto_archiver.core.metadata import Metadata, Media from auto_archiver.utils import url as UrlUtil, get_datetime_from_str from auto_archiver.core.extractor import Extractor +from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted from auto_archiver.modules.generic_extractor.dropin import GenericDropin, InfoExtractor @@ -37,7 +38,17 @@ class Twitter(GenericDropin): result = Metadata() try: if not tweet.get("user") or not tweet.get("created_at"): - raise ValueError("Error retreiving post. Are you sure it exists?") + # Check for deletion indicators + deletion_info = detect_deletion( + video_data=tweet, + url=url, + error_message="Missing user or created_at fields" + ) + if deletion_info: + flag_as_deleted(result, deletion_info) + return result + + raise ValueError("Error retrieving post. 
Are you sure it exists?") timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y") except (ValueError, KeyError) as ex: logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}") diff --git a/src/auto_archiver/utils/deletion_detection.py b/src/auto_archiver/utils/deletion_detection.py new file mode 100644 index 0000000..e72c759 --- /dev/null +++ b/src/auto_archiver/utils/deletion_detection.py @@ -0,0 +1,299 @@ +""" +Deletion Detection Utilities + +Provides comprehensive detection of deleted, missing, or unavailable content +across various social media platforms. Critical for evidence preservation in +conflict documentation and human rights investigations. + +This module helps investigators identify when content has been removed, +allowing them to: +- Document that evidence existed but was deleted +- Track patterns of content removal +- Preserve metadata about missing content +""" + +from typing import Optional, Dict, List +from auto_archiver.utils.custom_logger import logger + + +class DeletionIndicators: + """ + Platform-specific indicators that content has been deleted or is unavailable. + + Covers multiple platforms including Twitter/X, Facebook, Instagram, TikTok, + YouTube, Reddit, VK, and Telegram. Used by conflict investigators to detect when + evidence has been removed. 
+ """ + + # Twitter/X deletion indicators + TWITTER = [ + "Hmm...this page doesn't exist", + "Try searching for something else", + "This Tweet is unavailable", + "This account doesn't exist", + "This Tweet has been deleted", + "This account has been suspended", + "Sorry, that page doesn't exist", + "The Tweet you're looking for isn't available", + ] + + # Facebook deletion indicators + FACEBOOK = [ + "This content isn't available", + "Sorry, this content isn't available", + "This content is no longer available", + "The link you followed may be broken", + "Page Not Found", + "Content Not Found", + "This content is no longer on Facebook", + ] + + # Instagram deletion indicators + INSTAGRAM = [ + "Sorry, this page isn't available", + "The link you followed may be broken", + "Media not found or unavailable", + "This post is no longer available", + "This account is private", + ] + + # TikTok deletion indicators + TIKTOK = [ + "Couldn't find this account", + "This video is no longer available", + "This video is currently unavailable", + "Video not found", + "This video may have been deleted", + ] + + # YouTube deletion indicators + YOUTUBE = [ + "This video isn't available anymore", + "Video unavailable", + "This video has been removed", + "This video is no longer available", + "This video is private", + "This video has been removed by the uploader", + "This video has been deleted", + ] + + # Reddit deletion indicators + REDDIT = [ + "this post has been removed", + "this comment has been removed", + "[removed]", + "[deleted]", + "page not found", + "there doesn't seem to be anything here", + ] + + # VK deletion indicators + VK = [ + "Post deleted", + "Page not found", + "Content unavailable", + "Access denied", + ] + + # Telegram deletion indicators + TELEGRAM = [ + "Message not found", + "Deleted message", + "Channel is private", + ] + + # Generic indicators (work across platforms) + GENERIC = [ + "404", + "not found", + "unavailable", + "doesn't exist", + "has been 
removed", + "no longer available", + "content removed", + "access denied", + "page not found", + ] + + @classmethod + def all_indicators(cls) -> List[str]: + """Returns all deletion indicators from all platforms.""" + return ( + cls.TWITTER + cls.FACEBOOK + cls.INSTAGRAM + cls.TIKTOK + + cls.YOUTUBE + cls.REDDIT + cls.VK + cls.TELEGRAM + cls.GENERIC + ) + + @classmethod + def for_url(cls, url: str) -> List[str]: + """Returns platform-specific indicators based on URL domain.""" + url_lower = url.lower() + + if "twitter.com" in url_lower or "x.com" in url_lower: + return cls.TWITTER + cls.GENERIC + elif "facebook.com" in url_lower or "fb.com" in url_lower: + return cls.FACEBOOK + cls.GENERIC + elif "instagram.com" in url_lower: + return cls.INSTAGRAM + cls.GENERIC + elif "tiktok.com" in url_lower: + return cls.TIKTOK + cls.GENERIC + elif "youtube.com" in url_lower or "youtu.be" in url_lower: + return cls.YOUTUBE + cls.GENERIC + elif "reddit.com" in url_lower: + return cls.REDDIT + cls.GENERIC + elif "vk.com" in url_lower: + return cls.VK + cls.GENERIC + elif "t.me" in url_lower: + return cls.TELEGRAM + cls.GENERIC + else: + return cls.GENERIC + + +def detect_deletion( + html_content: str = None, + page_title: str = None, + error_message: str = None, + url: str = None, + video_data: dict = None +) -> Optional[Dict[str, any]]: + """ + Comprehensive deletion detection across multiple signals. + + Checks HTML content, page titles, error messages, and video metadata for + indicators that content has been deleted or is unavailable. + + Args: + html_content: Raw HTML source of the page + page_title: Browser page title + error_message: Any error message from the extractor + url: The URL being archived (for platform-specific detection) + video_data: Video metadata from yt-dlp or other extractors + + Returns: + Dictionary with deletion details if detected, None otherwise. 
+ Format: { + "is_deleted": True, + "indicator": "specific text that was found", + "source": "html_content|page_title|error_message|video_metadata|video_metadata_<key>", + "platform": "twitter|facebook|etc" + } + """ + + # Determine indicators to check based on URL + if url: + indicators = DeletionIndicators.for_url(url) + platform = _extract_platform(url) + else: + indicators = DeletionIndicators.all_indicators() + platform = "unknown" + + # Check HTML content + if html_content: + for indicator in indicators: + if indicator.lower() in html_content.lower(): + logger.info(f"Deletion detected in HTML: '{indicator}' found for {url}") + return { + "is_deleted": True, + "indicator": indicator, + "source": "html_content", + "platform": platform + } + + # Check page title + if page_title: + for indicator in indicators: + if indicator.lower() in page_title.lower(): + logger.info(f"Deletion detected in page title: '{indicator}' found for {url}") + return { + "is_deleted": True, + "indicator": indicator, + "source": "page_title", + "platform": platform + } + + # Check error messages + if error_message: + for indicator in indicators: + if indicator.lower() in str(error_message).lower(): + logger.info(f"Deletion detected in error: '{indicator}' found for {url}") + return { + "is_deleted": True, + "indicator": indicator, + "source": "error_message", + "platform": platform + } + + # Check video metadata (from yt-dlp) + if video_data: + # Check if yt-dlp flagged it as unavailable + if video_data.get("availability") in ["unavailable", "private", "deleted"]: + logger.info(f"Deletion detected in metadata: availability={video_data.get('availability')}") + return { + "is_deleted": True, + "indicator": f"availability: {video_data.get('availability')}", + "source": "video_metadata", + "platform": platform + } + + # Check description/title for deletion indicators + for key in ["title", "description", "fulltitle"]: + if key in video_data: + for indicator in indicators: + if indicator.lower() in str(video_data[key]).lower(): + 
logger.info(f"Deletion detected in {key}: '{indicator}'") + return { + "is_deleted": True, + "indicator": indicator, + "source": f"video_metadata_{key}", + "platform": platform + } + + return None + + +def _extract_platform(url: str) -> str: + """Extracts platform name from URL.""" + url_lower = url.lower() + + if "twitter.com" in url_lower or "x.com" in url_lower: + return "twitter" + elif "facebook.com" in url_lower or "fb.com" in url_lower: + return "facebook" + elif "instagram.com" in url_lower: + return "instagram" + elif "tiktok.com" in url_lower: + return "tiktok" + elif "youtube.com" in url_lower or "youtu.be" in url_lower: + return "youtube" + elif "reddit.com" in url_lower: + return "reddit" + elif "vk.com" in url_lower: + return "vk" + elif "t.me" in url_lower: + return "telegram" + else: + return "unknown" + + +def flag_as_deleted(metadata, deletion_info: Dict[str, any]) -> None: + """ + Flags metadata object as deleted/unavailable. + + Adds detailed deletion information to the metadata object so investigators + know exactly why and how the deletion was detected. + + Args: + metadata: Metadata object to update + deletion_info: Dictionary from detect_deletion() + """ + metadata.set("deletion_detected", True) + metadata.set("deletion_indicator", deletion_info.get("indicator")) + metadata.set("deletion_source", deletion_info.get("source")) + metadata.set("deletion_platform", deletion_info.get("platform")) + metadata.status = "deleted_or_unavailable" + + logger.warning( + f"Content marked as deleted/unavailable: " + f"platform={deletion_info.get('platform')}, " + f"indicator='{deletion_info.get('indicator')}', " + f"source={deletion_info.get('source')}" + ) diff --git a/tests/test_deletion_detection.py b/tests/test_deletion_detection.py new file mode 100644 index 0000000..70186dc --- /dev/null +++ b/tests/test_deletion_detection.py @@ -0,0 +1,169 @@ +""" +Tests for deletion detection utilities. 
+ +These tests verify that the auto-archiver can detect when content +has been deleted or is unavailable across various platforms. +Critical for evidence preservation in conflict documentation. +""" + +import pytest +from auto_archiver.utils.deletion_detection import ( + detect_deletion, + flag_as_deleted, + DeletionIndicators +) +from auto_archiver.core.metadata import Metadata + + +class TestDeletionIndicators: + """Test the deletion indicator lists for various platforms.""" + + def test_twitter_indicators(self): + """Verify Twitter deletion indicators are comprehensive.""" + assert "Hmm...this page doesn't exist" in DeletionIndicators.TWITTER + assert "Try searching for something else" in DeletionIndicators.TWITTER + assert "This Tweet is unavailable" in DeletionIndicators.TWITTER + + def test_platform_specific_indicators(self): + """Test that platform-specific indicators are returned based on URL.""" + twitter_indicators = DeletionIndicators.for_url("https://twitter.com/user/status/123") + assert any("page doesn't exist" in ind.lower() for ind in twitter_indicators) + + instagram_indicators = DeletionIndicators.for_url("https://instagram.com/p/ABC123") + assert any("page isn't available" in ind.lower() for ind in instagram_indicators) + + +class TestDetectDeletion: + """Test the detect_deletion function with various inputs.""" + + def test_detect_deletion_in_html_twitter(self): + """Test detection of Twitter's deleted post page.""" + html = "Hmm...this page doesn't exist. Try searching for something else." 
+ url = "https://twitter.com/user/status/123" + + result = detect_deletion(html_content=html, url=url) + + assert result is not None + assert result["is_deleted"] is True + assert result["platform"] == "twitter" + assert result["source"] == "html_content" + assert "page doesn't exist" in result["indicator"].lower() + + def test_detect_deletion_in_page_title(self): + """Test detection via page title.""" + title = "Page Not Found" + url = "https://facebook.com/post/123" + + result = detect_deletion(page_title=title, url=url) + + assert result is not None + assert result["is_deleted"] is True + assert result["source"] == "page_title" + + def test_detect_deletion_in_error_message(self): + """Test detection via error messages.""" + error = "yt_dlp.utils.DownloadError: This video is no longer available" + url = "https://youtube.com/watch?v=abc123" + + result = detect_deletion(error_message=error, url=url) + + assert result is not None + assert result["is_deleted"] is True + assert result["platform"] == "youtube" + assert result["source"] == "error_message" + + def test_detect_deletion_in_video_metadata(self): + """Test detection via yt-dlp video metadata.""" + video_data = { + "availability": "unavailable", + "title": "Private video" + } + url = "https://youtube.com/watch?v=test123" + + result = detect_deletion(video_data=video_data, url=url) + + assert result is not None + assert result["is_deleted"] is True + assert result["source"] == "video_metadata" + assert "availability" in result["indicator"] + + def test_no_deletion_detected(self): + """Test that normal content is not flagged as deleted.""" + html = "

Welcome to my page

This is normal content.

" + title = "My Normal Page" + url = "https://example.com/page" + + result = detect_deletion( + html_content=html, + page_title=title, + url=url + ) + + assert result is None + + def test_instagram_media_not_found(self): + """Test Instagram-specific deletion message.""" + error = "Media not found or unavailable" + url = "https://instagram.com/p/ABC123" + + result = detect_deletion(error_message=error, url=url) + + assert result is not None + assert result["platform"] == "instagram" + assert "not found" in result["indicator"].lower() + + def test_reddit_removed_content(self): + """Test Reddit [removed] and [deleted] markers.""" + html = "
[removed]
" + url = "https://reddit.com/r/test/comments/abc123" + + result = detect_deletion(html_content=html, url=url) + + assert result is not None + assert result["platform"] == "reddit" + + +class TestFlagAsDeleted: + """Test the flag_as_deleted function.""" + + def test_flag_metadata_as_deleted(self): + """Verify that metadata is properly flagged with deletion info.""" + metadata = Metadata() + deletion_info = { + "is_deleted": True, + "indicator": "This Tweet is unavailable", + "source": "html_content", + "platform": "twitter" + } + + flag_as_deleted(metadata, deletion_info) + + assert metadata.get("deletion_detected") is True + assert metadata.get("deletion_indicator") == "This Tweet is unavailable" + assert metadata.get("deletion_source") == "html_content" + assert metadata.get("deletion_platform") == "twitter" + assert metadata.status == "deleted_or_unavailable" + + def test_metadata_contains_deletion_context(self): + """Verify investigators have full context about the deletion.""" + metadata = Metadata() + deletion_info = { + "is_deleted": True, + "indicator": "Video has been removed by the uploader", + "source": "error_message", + "platform": "youtube" + } + + flag_as_deleted(metadata, deletion_info) + + # This metadata can now be stored so investigators know: + # - The content existed but was deleted + # - Exactly what message indicated deletion + # - Which platform it was from + # - When it was checked (via _processed_at) + assert "deletion_indicator" in metadata.metadata + assert "uploader" in metadata.get("deletion_indicator") + + +if __name__ == "__main__": + pytest.main([__file__, "-v"])