""" Deletion Detection Utilities Provides comprehensive detection of deleted, missing, or unavailable content across various social media platforms. Critical for evidence preservation in conflict documentation and human rights investigations. This module helps investigators identify when content has been removed, allowing them to: - Document that evidence existed but was deleted - Track patterns of content removal - Preserve metadata about missing content """ from typing import Optional, Dict, List from auto_archiver.utils.custom_logger import logger class DeletionIndicators: """ Platform-specific indicators that content has been deleted or is unavailable. Covers multiple platforms including Twitter/X, Facebook, Instagram, TikTok, YouTube, Reddit, and VK. Used by conflict investigators to detect when evidence has been removed. """ # Twitter/X deletion indicators TWITTER = [ "Hmm...this page doesn't exist", "Try searching for something else", "This Tweet is unavailable", "This account doesn't exist", "This Tweet has been deleted", "This account has been suspended", "Sorry, that page doesn't exist", "The Tweet you're looking for isn't available", ] # Facebook deletion indicators FACEBOOK = [ "This content isn't available", "Sorry, this content isn't available", "This content is no longer available", "The link you followed may be broken", "Page Not Found", "Content Not Found", "This content is no longer on Facebook", ] # Instagram deletion indicators INSTAGRAM = [ "Sorry, this page isn't available", "The link you followed may be broken", "Media not found or unavailable", "This post is no longer available", "This account is private", ] # TikTok deletion indicators TIKTOK = [ "Couldn't find this account", "This video is no longer available", "This video is currently unavailable", "Video not found", "This video may have been deleted", ] # YouTube deletion indicators YOUTUBE = [ "This video isn't available anymore", "Video unavailable", "This video has been removed", "This video is no longer available", "This video is private", "This video has been removed by the uploader", "This video has been deleted", ] # Reddit deletion indicators REDDIT = [ "this post has been removed", "this comment has been removed", "[removed]", "[deleted]", "page not found", "there doesn't seem to be anything here", ] # VK deletion indicators VK = [ "Post deleted", "Page not found", "Content unavailable", "Access denied", ] # Telegram deletion indicators TELEGRAM = [ "Message not found", "Deleted message", "Channel is private", ] # Generic indicators (work across platforms) GENERIC = [ "404", "not found", "unavailable", "doesn't exist", "has been removed", "no longer available", "content removed", "access denied", "page not found", ] @classmethod def all_indicators(cls) -> List[str]: """Returns all deletion indicators from all platforms.""" return ( cls.TWITTER + cls.FACEBOOK + cls.INSTAGRAM + cls.TIKTOK + cls.YOUTUBE + cls.REDDIT + cls.VK + cls.TELEGRAM + cls.GENERIC ) @classmethod def for_url(cls, url: str) -> List[str]: """Returns platform-specific indicators based on URL domain.""" url_lower = url.lower() if "twitter.com" in url_lower or "x.com" in url_lower: return cls.TWITTER + cls.GENERIC elif "facebook.com" in url_lower or "fb.com" in url_lower: return cls.FACEBOOK + cls.GENERIC elif "instagram.com" in url_lower: return cls.INSTAGRAM + cls.GENERIC elif "tiktok.com" in url_lower: return cls.TIKTOK + cls.GENERIC elif "youtube.com" in url_lower or "youtu.be" in url_lower: return cls.YOUTUBE + cls.GENERIC elif "reddit.com" in url_lower: return cls.REDDIT + cls.GENERIC elif "vk.com" in url_lower: return cls.VK + cls.GENERIC elif "t.me" in url_lower: return cls.TELEGRAM + cls.GENERIC else: return cls.GENERIC def detect_deletion( html_content: str = None, page_title: str = None, error_message: str = None, url: str = None, video_data: dict = None ) -> Optional[Dict[str, any]]: """ Comprehensive deletion detection across multiple signals. Checks HTML content, page titles, error messages, and video metadata for indicators that content has been deleted or is unavailable. Args: html_content: Raw HTML source of the page page_title: Browser page title error_message: Any error message from the extractor url: The URL being archived (for platform-specific detection) video_data: Video metadata from yt-dlp or other extractors Returns: Dictionary with deletion details if detected, None otherwise. Format: { "is_deleted": True, "indicator": "specific text that was found", "source": "html|title|error|metadata", "platform": "twitter|facebook|etc" } """ # Determine indicators to check based on URL if url: indicators = DeletionIndicators.for_url(url) platform = _extract_platform(url) else: indicators = DeletionIndicators.all_indicators() platform = "unknown" # Check HTML content if html_content: for indicator in indicators: if indicator.lower() in html_content.lower(): logger.info(f"Deletion detected in HTML: '{indicator}' found for {url}") return { "is_deleted": True, "indicator": indicator, "source": "html_content", "platform": platform } # Check page title if page_title: for indicator in indicators: if indicator.lower() in page_title.lower(): logger.info(f"Deletion detected in page title: '{indicator}' found for {url}") return { "is_deleted": True, "indicator": indicator, "source": "page_title", "platform": platform } # Check error messages if error_message: for indicator in indicators: if indicator.lower() in str(error_message).lower(): logger.info(f"Deletion detected in error: '{indicator}' found for {url}") return { "is_deleted": True, "indicator": indicator, "source": "error_message", "platform": platform } # Check video metadata (from yt-dlp) if video_data: # Check if yt-dlp flagged it as unavailable if video_data.get("availability") in ["unavailable", "private", "deleted"]: logger.info(f"Deletion detected in metadata: availability={video_data.get('availability')}") return { "is_deleted": True, "indicator": f"availability: {video_data.get('availability')}", "source": "video_metadata", "platform": platform } # Check description/title for deletion indicators for key in ["title", "description", "fulltitle"]: if key in video_data: for indicator in indicators: if indicator.lower() in str(video_data[key]).lower(): logger.info(f"Deletion detected in {key}: '{indicator}'") return { "is_deleted": True, "indicator": indicator, "source": f"video_metadata_{key}", "platform": platform } return None def _extract_platform(url: str) -> str: """Extracts platform name from URL.""" url_lower = url.lower() if "twitter.com" in url_lower or "x.com" in url_lower: return "twitter" elif "facebook.com" in url_lower or "fb.com" in url_lower: return "facebook" elif "instagram.com" in url_lower: return "instagram" elif "tiktok.com" in url_lower: return "tiktok" elif "youtube.com" in url_lower or "youtu.be" in url_lower: return "youtube" elif "reddit.com" in url_lower: return "reddit" elif "vk.com" in url_lower: return "vk" elif "t.me" in url_lower: return "telegram" else: return "unknown" def flag_as_deleted(metadata, deletion_info: Dict[str, any]) -> None: """ Flags metadata object as deleted/unavailable. Adds detailed deletion information to the metadata object so investigators know exactly why and how the deletion was detected. Args: metadata: Metadata object to update deletion_info: Dictionary from detect_deletion() """ metadata.set("deletion_detected", True) metadata.set("deletion_indicator", deletion_info.get("indicator")) metadata.set("deletion_source", deletion_info.get("source")) metadata.set("deletion_platform", deletion_info.get("platform")) metadata.status = "deleted_or_unavailable" logger.warning( f"Content marked as deleted/unavailable: " f"platform={deletion_info.get('platform')}, " f"indicator='{deletion_info.get('indicator')}', " f"source={deletion_info.get('source')}" )