Files
auto-archiver/src/auto_archiver/utils/deletion_detection.py
m4cd4r4 d02e7e0f02 Add comprehensive deletion detection for removed/unavailable content
Implements issue #335: improve detection of deleted/missing posts

## Changes

### New Deletion Detection System
- Created `deletion_detection.py` utility module with platform-specific
  indicators for Twitter, Facebook, Instagram, TikTok, YouTube, Reddit,
  VK, and Telegram
- Detects deletion via HTML content, page titles, error messages, and
  video metadata
- Stores detailed deletion context (indicator, source, platform) in
  metadata for investigators

### Integration Points
- **Antibot Extractor**: Checks HTML and page titles after page load;
  resolves TODO about detecting deleted videos
- **Generic Extractor**: Checks yt-dlp video data and error messages
  for deletion indicators
- **Twitter Dropin**: Enhanced detection when user/created_at fields
  are missing

### Test Coverage
- Comprehensive test suite covering all platforms
- Tests for HTML, title, error message, and metadata detection
- Validates that normal content is not falsely flagged

## Impact for Conflict Documentation

This fix is critical for evidence preservation in war-torn regions:
- Investigators can now document that evidence existed but was deleted
- Prevents wasted archival attempts on deleted content
- Tracks patterns of content removal
- Preserves metadata about what was deleted and when

Twitter example: Detects "Hmm...this page doesn't exist. Try searching
for something else" and flags content as deleted_or_unavailable.
2025-12-17 18:40:58 +08:00

300 lines
9.8 KiB
Python

"""
Deletion Detection Utilities
Provides comprehensive detection of deleted, missing, or unavailable content
across various social media platforms. Critical for evidence preservation in
conflict documentation and human rights investigations.
This module helps investigators identify when content has been removed,
allowing them to:
- Document that evidence existed but was deleted
- Track patterns of content removal
- Preserve metadata about missing content
"""
from typing import Optional, Dict, List
from auto_archiver.utils.custom_logger import logger
class DeletionIndicators:
"""
Platform-specific indicators that content has been deleted or is unavailable.
Covers multiple platforms including Twitter/X, Facebook, Instagram, TikTok,
YouTube, Reddit, and VK. Used by conflict investigators to detect when
evidence has been removed.
"""
# Twitter/X deletion indicators
TWITTER = [
"Hmm...this page doesn't exist",
"Try searching for something else",
"This Tweet is unavailable",
"This account doesn't exist",
"This Tweet has been deleted",
"This account has been suspended",
"Sorry, that page doesn't exist",
"The Tweet you're looking for isn't available",
]
# Facebook deletion indicators
FACEBOOK = [
"This content isn't available",
"Sorry, this content isn't available",
"This content is no longer available",
"The link you followed may be broken",
"Page Not Found",
"Content Not Found",
"This content is no longer on Facebook",
]
# Instagram deletion indicators
INSTAGRAM = [
"Sorry, this page isn't available",
"The link you followed may be broken",
"Media not found or unavailable",
"This post is no longer available",
"This account is private",
]
# TikTok deletion indicators
TIKTOK = [
"Couldn't find this account",
"This video is no longer available",
"This video is currently unavailable",
"Video not found",
"This video may have been deleted",
]
# YouTube deletion indicators
YOUTUBE = [
"This video isn't available anymore",
"Video unavailable",
"This video has been removed",
"This video is no longer available",
"This video is private",
"This video has been removed by the uploader",
"This video has been deleted",
]
# Reddit deletion indicators
REDDIT = [
"this post has been removed",
"this comment has been removed",
"[removed]",
"[deleted]",
"page not found",
"there doesn't seem to be anything here",
]
# VK deletion indicators
VK = [
"Post deleted",
"Page not found",
"Content unavailable",
"Access denied",
]
# Telegram deletion indicators
TELEGRAM = [
"Message not found",
"Deleted message",
"Channel is private",
]
# Generic indicators (work across platforms)
GENERIC = [
"404",
"not found",
"unavailable",
"doesn't exist",
"has been removed",
"no longer available",
"content removed",
"access denied",
"page not found",
]
@classmethod
def all_indicators(cls) -> List[str]:
"""Returns all deletion indicators from all platforms."""
return (
cls.TWITTER + cls.FACEBOOK + cls.INSTAGRAM + cls.TIKTOK +
cls.YOUTUBE + cls.REDDIT + cls.VK + cls.TELEGRAM + cls.GENERIC
)
@classmethod
def for_url(cls, url: str) -> List[str]:
"""Returns platform-specific indicators based on URL domain."""
url_lower = url.lower()
if "twitter.com" in url_lower or "x.com" in url_lower:
return cls.TWITTER + cls.GENERIC
elif "facebook.com" in url_lower or "fb.com" in url_lower:
return cls.FACEBOOK + cls.GENERIC
elif "instagram.com" in url_lower:
return cls.INSTAGRAM + cls.GENERIC
elif "tiktok.com" in url_lower:
return cls.TIKTOK + cls.GENERIC
elif "youtube.com" in url_lower or "youtu.be" in url_lower:
return cls.YOUTUBE + cls.GENERIC
elif "reddit.com" in url_lower:
return cls.REDDIT + cls.GENERIC
elif "vk.com" in url_lower:
return cls.VK + cls.GENERIC
elif "t.me" in url_lower:
return cls.TELEGRAM + cls.GENERIC
else:
return cls.GENERIC
def detect_deletion(
html_content: str = None,
page_title: str = None,
error_message: str = None,
url: str = None,
video_data: dict = None
) -> Optional[Dict[str, any]]:
"""
Comprehensive deletion detection across multiple signals.
Checks HTML content, page titles, error messages, and video metadata for
indicators that content has been deleted or is unavailable.
Args:
html_content: Raw HTML source of the page
page_title: Browser page title
error_message: Any error message from the extractor
url: The URL being archived (for platform-specific detection)
video_data: Video metadata from yt-dlp or other extractors
Returns:
Dictionary with deletion details if detected, None otherwise.
Format: {
"is_deleted": True,
"indicator": "specific text that was found",
"source": "html|title|error|metadata",
"platform": "twitter|facebook|etc"
}
"""
# Determine indicators to check based on URL
if url:
indicators = DeletionIndicators.for_url(url)
platform = _extract_platform(url)
else:
indicators = DeletionIndicators.all_indicators()
platform = "unknown"
# Check HTML content
if html_content:
for indicator in indicators:
if indicator.lower() in html_content.lower():
logger.info(f"Deletion detected in HTML: '{indicator}' found for {url}")
return {
"is_deleted": True,
"indicator": indicator,
"source": "html_content",
"platform": platform
}
# Check page title
if page_title:
for indicator in indicators:
if indicator.lower() in page_title.lower():
logger.info(f"Deletion detected in page title: '{indicator}' found for {url}")
return {
"is_deleted": True,
"indicator": indicator,
"source": "page_title",
"platform": platform
}
# Check error messages
if error_message:
for indicator in indicators:
if indicator.lower() in str(error_message).lower():
logger.info(f"Deletion detected in error: '{indicator}' found for {url}")
return {
"is_deleted": True,
"indicator": indicator,
"source": "error_message",
"platform": platform
}
# Check video metadata (from yt-dlp)
if video_data:
# Check if yt-dlp flagged it as unavailable
if video_data.get("availability") in ["unavailable", "private", "deleted"]:
logger.info(f"Deletion detected in metadata: availability={video_data.get('availability')}")
return {
"is_deleted": True,
"indicator": f"availability: {video_data.get('availability')}",
"source": "video_metadata",
"platform": platform
}
# Check description/title for deletion indicators
for key in ["title", "description", "fulltitle"]:
if key in video_data:
for indicator in indicators:
if indicator.lower() in str(video_data[key]).lower():
logger.info(f"Deletion detected in {key}: '{indicator}'")
return {
"is_deleted": True,
"indicator": indicator,
"source": f"video_metadata_{key}",
"platform": platform
}
return None
def _extract_platform(url: str) -> str:
"""Extracts platform name from URL."""
url_lower = url.lower()
if "twitter.com" in url_lower or "x.com" in url_lower:
return "twitter"
elif "facebook.com" in url_lower or "fb.com" in url_lower:
return "facebook"
elif "instagram.com" in url_lower:
return "instagram"
elif "tiktok.com" in url_lower:
return "tiktok"
elif "youtube.com" in url_lower or "youtu.be" in url_lower:
return "youtube"
elif "reddit.com" in url_lower:
return "reddit"
elif "vk.com" in url_lower:
return "vk"
elif "t.me" in url_lower:
return "telegram"
else:
return "unknown"
def flag_as_deleted(metadata, deletion_info: Dict[str, any]) -> None:
"""
Flags metadata object as deleted/unavailable.
Adds detailed deletion information to the metadata object so investigators
know exactly why and how the deletion was detected.
Args:
metadata: Metadata object to update
deletion_info: Dictionary from detect_deletion()
"""
metadata.set("deletion_detected", True)
metadata.set("deletion_indicator", deletion_info.get("indicator"))
metadata.set("deletion_source", deletion_info.get("source"))
metadata.set("deletion_platform", deletion_info.get("platform"))
metadata.status = "deleted_or_unavailable"
logger.warning(
f"Content marked as deleted/unavailable: "
f"platform={deletion_info.get('platform')}, "
f"indicator='{deletion_info.get('indicator')}', "
f"source={deletion_info.get('source')}"
)