mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-08 03:18:28 +03:00
Add comprehensive deletion detection for removed/unavailable content
Implements issue #335: improve detection of deleted/missing posts ## Changes ### New Deletion Detection System - Created `deletion_detection.py` utility module with platform-specific indicators for Twitter, Facebook, Instagram, TikTok, YouTube, Reddit, VK, and Telegram - Detects deletion via HTML content, page titles, error messages, and video metadata - Stores detailed deletion context (indicator, source, platform) in metadata for investigators ### Integration Points - **Antibot Extractor**: Checks HTML and page titles after page load; resolves TODO about detecting deleted videos - **Generic Extractor**: Checks yt-dlp video data and error messages for deletion indicators - **Twitter Dropin**: Enhanced detection when user/created_at fields are missing ### Test Coverage - Comprehensive test suite covering all platforms - Tests for HTML, title, error message, and metadata detection - Validates that normal content is not falsely flagged ## Impact for Conflict Documentation This fix is critical for evidence preservation in war-torn regions: - Investigators can now document that evidence existed but was deleted - Prevents wasted archival attempts on deleted content - Tracks patterns of content removal - Preserves metadata about what was deleted and when Twitter example: Detects "Hmm...this page doesn't exist. Try searching for something else" and flags content as deleted_or_unavailable.
This commit is contained in:
@@ -16,6 +16,7 @@ from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
|
||||
from auto_archiver.modules.antibot_extractor_enricher.dropins.default import DefaultDropin
|
||||
from auto_archiver.utils.misc import random_str
|
||||
from auto_archiver.utils.url import is_relevant_url
|
||||
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
|
||||
|
||||
|
||||
class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
@@ -98,8 +99,18 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
|
||||
dropin = self._get_suitable_dropin(url, sb)
|
||||
if not dropin.open_page(url):
|
||||
# TODO: could we detect deleted videos?
|
||||
logger.warning("Failed to open drop-in page")
|
||||
# Check for deletion indicators
|
||||
page_title = sb.get_title()
|
||||
html_source = sb.get_page_source()
|
||||
deletion_info = detect_deletion(
|
||||
html_content=html_source,
|
||||
page_title=page_title,
|
||||
url=url
|
||||
)
|
||||
if deletion_info:
|
||||
flag_as_deleted(to_enrich, deletion_info)
|
||||
return to_enrich
|
||||
logger.warning("Failed to open drop-in page (not detected as deleted)")
|
||||
return False
|
||||
|
||||
if self.detect_auth_wall and (dropin.hit_auth_wall() and self._hit_auth_wall(sb)):
|
||||
@@ -109,7 +120,19 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
sb.wait_for_ready_state_complete()
|
||||
sb.sleep(1) # margin for the page to load completely
|
||||
|
||||
to_enrich.set_title(sb.get_title())
|
||||
page_title = sb.get_title()
|
||||
html_source = sb.get_page_source()
|
||||
|
||||
# Check if the page indicates content was deleted
|
||||
deletion_info = detect_deletion(
|
||||
html_content=html_source,
|
||||
page_title=page_title,
|
||||
url=url
|
||||
)
|
||||
if deletion_info:
|
||||
flag_as_deleted(to_enrich, deletion_info)
|
||||
|
||||
to_enrich.set_title(page_title)
|
||||
self._enrich_html_source_code(sb, to_enrich)
|
||||
|
||||
self._enrich_full_page_screenshot(sb, to_enrich)
|
||||
|
||||
@@ -21,6 +21,7 @@ from auto_archiver.core.extractor import Extractor
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.utils import get_datetime_from_str
|
||||
from auto_archiver.utils.misc import ydl_entry_to_filename
|
||||
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
|
||||
from .dropin import GenericDropin
|
||||
|
||||
|
||||
@@ -484,6 +485,16 @@ class GenericExtractor(Extractor):
|
||||
# don't download since it can be a live stream
|
||||
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
|
||||
|
||||
# Check for deletion indicators in video data
|
||||
deletion_info = detect_deletion(
|
||||
video_data=data,
|
||||
url=url
|
||||
)
|
||||
if deletion_info:
|
||||
result = Metadata()
|
||||
flag_as_deleted(result, deletion_info)
|
||||
return result
|
||||
|
||||
result = _helper_for_successful_extract_info(data, info_extractor, url, ydl)
|
||||
|
||||
except MaxDownloadsReached:
|
||||
@@ -503,6 +514,13 @@ class GenericExtractor(Extractor):
|
||||
try:
|
||||
result = self.get_metadata_for_post(info_extractor, url, ydl)
|
||||
except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
|
||||
# Check if the error indicates deletion
|
||||
deletion_info = detect_deletion(error_message=str(post_e), url=url)
|
||||
if deletion_info:
|
||||
result = Metadata()
|
||||
flag_as_deleted(result, deletion_info)
|
||||
return result
|
||||
|
||||
if "NSFW tweet requires authentication." in str(post_e):
|
||||
logger.warning(str(post_e))
|
||||
return False
|
||||
|
||||
@@ -7,6 +7,7 @@ from slugify import slugify
|
||||
from auto_archiver.core.metadata import Metadata, Media
|
||||
from auto_archiver.utils import url as UrlUtil, get_datetime_from_str
|
||||
from auto_archiver.core.extractor import Extractor
|
||||
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
|
||||
from auto_archiver.modules.generic_extractor.dropin import GenericDropin, InfoExtractor
|
||||
|
||||
|
||||
@@ -37,7 +38,17 @@ class Twitter(GenericDropin):
|
||||
result = Metadata()
|
||||
try:
|
||||
if not tweet.get("user") or not tweet.get("created_at"):
|
||||
raise ValueError("Error retreiving post. Are you sure it exists?")
|
||||
# Check for deletion indicators
|
||||
deletion_info = detect_deletion(
|
||||
video_data=tweet,
|
||||
url=url,
|
||||
error_message="Missing user or created_at fields"
|
||||
)
|
||||
if deletion_info:
|
||||
flag_as_deleted(result, deletion_info)
|
||||
return result
|
||||
|
||||
raise ValueError("Error retrieving post. Are you sure it exists?")
|
||||
timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
|
||||
except (ValueError, KeyError) as ex:
|
||||
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
|
||||
|
||||
299
src/auto_archiver/utils/deletion_detection.py
Normal file
299
src/auto_archiver/utils/deletion_detection.py
Normal file
@@ -0,0 +1,299 @@
|
||||
"""
|
||||
Deletion Detection Utilities
|
||||
|
||||
Provides comprehensive detection of deleted, missing, or unavailable content
|
||||
across various social media platforms. Critical for evidence preservation in
|
||||
conflict documentation and human rights investigations.
|
||||
|
||||
This module helps investigators identify when content has been removed,
|
||||
allowing them to:
|
||||
- Document that evidence existed but was deleted
|
||||
- Track patterns of content removal
|
||||
- Preserve metadata about missing content
|
||||
"""
|
||||
|
||||
from typing import Optional, Dict, List
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
|
||||
class DeletionIndicators:
|
||||
"""
|
||||
Platform-specific indicators that content has been deleted or is unavailable.
|
||||
|
||||
Covers multiple platforms including Twitter/X, Facebook, Instagram, TikTok,
|
||||
YouTube, Reddit, and VK. Used by conflict investigators to detect when
|
||||
evidence has been removed.
|
||||
"""
|
||||
|
||||
# Twitter/X deletion indicators
|
||||
TWITTER = [
|
||||
"Hmm...this page doesn't exist",
|
||||
"Try searching for something else",
|
||||
"This Tweet is unavailable",
|
||||
"This account doesn't exist",
|
||||
"This Tweet has been deleted",
|
||||
"This account has been suspended",
|
||||
"Sorry, that page doesn't exist",
|
||||
"The Tweet you're looking for isn't available",
|
||||
]
|
||||
|
||||
# Facebook deletion indicators
|
||||
FACEBOOK = [
|
||||
"This content isn't available",
|
||||
"Sorry, this content isn't available",
|
||||
"This content is no longer available",
|
||||
"The link you followed may be broken",
|
||||
"Page Not Found",
|
||||
"Content Not Found",
|
||||
"This content is no longer on Facebook",
|
||||
]
|
||||
|
||||
# Instagram deletion indicators
|
||||
INSTAGRAM = [
|
||||
"Sorry, this page isn't available",
|
||||
"The link you followed may be broken",
|
||||
"Media not found or unavailable",
|
||||
"This post is no longer available",
|
||||
"This account is private",
|
||||
]
|
||||
|
||||
# TikTok deletion indicators
|
||||
TIKTOK = [
|
||||
"Couldn't find this account",
|
||||
"This video is no longer available",
|
||||
"This video is currently unavailable",
|
||||
"Video not found",
|
||||
"This video may have been deleted",
|
||||
]
|
||||
|
||||
# YouTube deletion indicators
|
||||
YOUTUBE = [
|
||||
"This video isn't available anymore",
|
||||
"Video unavailable",
|
||||
"This video has been removed",
|
||||
"This video is no longer available",
|
||||
"This video is private",
|
||||
"This video has been removed by the uploader",
|
||||
"This video has been deleted",
|
||||
]
|
||||
|
||||
# Reddit deletion indicators
|
||||
REDDIT = [
|
||||
"this post has been removed",
|
||||
"this comment has been removed",
|
||||
"[removed]",
|
||||
"[deleted]",
|
||||
"page not found",
|
||||
"there doesn't seem to be anything here",
|
||||
]
|
||||
|
||||
# VK deletion indicators
|
||||
VK = [
|
||||
"Post deleted",
|
||||
"Page not found",
|
||||
"Content unavailable",
|
||||
"Access denied",
|
||||
]
|
||||
|
||||
# Telegram deletion indicators
|
||||
TELEGRAM = [
|
||||
"Message not found",
|
||||
"Deleted message",
|
||||
"Channel is private",
|
||||
]
|
||||
|
||||
# Generic indicators (work across platforms)
|
||||
GENERIC = [
|
||||
"404",
|
||||
"not found",
|
||||
"unavailable",
|
||||
"doesn't exist",
|
||||
"has been removed",
|
||||
"no longer available",
|
||||
"content removed",
|
||||
"access denied",
|
||||
"page not found",
|
||||
]
|
||||
|
||||
@classmethod
|
||||
def all_indicators(cls) -> List[str]:
|
||||
"""Returns all deletion indicators from all platforms."""
|
||||
return (
|
||||
cls.TWITTER + cls.FACEBOOK + cls.INSTAGRAM + cls.TIKTOK +
|
||||
cls.YOUTUBE + cls.REDDIT + cls.VK + cls.TELEGRAM + cls.GENERIC
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def for_url(cls, url: str) -> List[str]:
|
||||
"""Returns platform-specific indicators based on URL domain."""
|
||||
url_lower = url.lower()
|
||||
|
||||
if "twitter.com" in url_lower or "x.com" in url_lower:
|
||||
return cls.TWITTER + cls.GENERIC
|
||||
elif "facebook.com" in url_lower or "fb.com" in url_lower:
|
||||
return cls.FACEBOOK + cls.GENERIC
|
||||
elif "instagram.com" in url_lower:
|
||||
return cls.INSTAGRAM + cls.GENERIC
|
||||
elif "tiktok.com" in url_lower:
|
||||
return cls.TIKTOK + cls.GENERIC
|
||||
elif "youtube.com" in url_lower or "youtu.be" in url_lower:
|
||||
return cls.YOUTUBE + cls.GENERIC
|
||||
elif "reddit.com" in url_lower:
|
||||
return cls.REDDIT + cls.GENERIC
|
||||
elif "vk.com" in url_lower:
|
||||
return cls.VK + cls.GENERIC
|
||||
elif "t.me" in url_lower:
|
||||
return cls.TELEGRAM + cls.GENERIC
|
||||
else:
|
||||
return cls.GENERIC
|
||||
|
||||
|
||||
def detect_deletion(
|
||||
html_content: str = None,
|
||||
page_title: str = None,
|
||||
error_message: str = None,
|
||||
url: str = None,
|
||||
video_data: dict = None
|
||||
) -> Optional[Dict[str, any]]:
|
||||
"""
|
||||
Comprehensive deletion detection across multiple signals.
|
||||
|
||||
Checks HTML content, page titles, error messages, and video metadata for
|
||||
indicators that content has been deleted or is unavailable.
|
||||
|
||||
Args:
|
||||
html_content: Raw HTML source of the page
|
||||
page_title: Browser page title
|
||||
error_message: Any error message from the extractor
|
||||
url: The URL being archived (for platform-specific detection)
|
||||
video_data: Video metadata from yt-dlp or other extractors
|
||||
|
||||
Returns:
|
||||
Dictionary with deletion details if detected, None otherwise.
|
||||
Format: {
|
||||
"is_deleted": True,
|
||||
"indicator": "specific text that was found",
|
||||
"source": "html|title|error|metadata",
|
||||
"platform": "twitter|facebook|etc"
|
||||
}
|
||||
"""
|
||||
|
||||
# Determine indicators to check based on URL
|
||||
if url:
|
||||
indicators = DeletionIndicators.for_url(url)
|
||||
platform = _extract_platform(url)
|
||||
else:
|
||||
indicators = DeletionIndicators.all_indicators()
|
||||
platform = "unknown"
|
||||
|
||||
# Check HTML content
|
||||
if html_content:
|
||||
for indicator in indicators:
|
||||
if indicator.lower() in html_content.lower():
|
||||
logger.info(f"Deletion detected in HTML: '{indicator}' found for {url}")
|
||||
return {
|
||||
"is_deleted": True,
|
||||
"indicator": indicator,
|
||||
"source": "html_content",
|
||||
"platform": platform
|
||||
}
|
||||
|
||||
# Check page title
|
||||
if page_title:
|
||||
for indicator in indicators:
|
||||
if indicator.lower() in page_title.lower():
|
||||
logger.info(f"Deletion detected in page title: '{indicator}' found for {url}")
|
||||
return {
|
||||
"is_deleted": True,
|
||||
"indicator": indicator,
|
||||
"source": "page_title",
|
||||
"platform": platform
|
||||
}
|
||||
|
||||
# Check error messages
|
||||
if error_message:
|
||||
for indicator in indicators:
|
||||
if indicator.lower() in str(error_message).lower():
|
||||
logger.info(f"Deletion detected in error: '{indicator}' found for {url}")
|
||||
return {
|
||||
"is_deleted": True,
|
||||
"indicator": indicator,
|
||||
"source": "error_message",
|
||||
"platform": platform
|
||||
}
|
||||
|
||||
# Check video metadata (from yt-dlp)
|
||||
if video_data:
|
||||
# Check if yt-dlp flagged it as unavailable
|
||||
if video_data.get("availability") in ["unavailable", "private", "deleted"]:
|
||||
logger.info(f"Deletion detected in metadata: availability={video_data.get('availability')}")
|
||||
return {
|
||||
"is_deleted": True,
|
||||
"indicator": f"availability: {video_data.get('availability')}",
|
||||
"source": "video_metadata",
|
||||
"platform": platform
|
||||
}
|
||||
|
||||
# Check description/title for deletion indicators
|
||||
for key in ["title", "description", "fulltitle"]:
|
||||
if key in video_data:
|
||||
for indicator in indicators:
|
||||
if indicator.lower() in str(video_data[key]).lower():
|
||||
logger.info(f"Deletion detected in {key}: '{indicator}'")
|
||||
return {
|
||||
"is_deleted": True,
|
||||
"indicator": indicator,
|
||||
"source": f"video_metadata_{key}",
|
||||
"platform": platform
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _extract_platform(url: str) -> str:
|
||||
"""Extracts platform name from URL."""
|
||||
url_lower = url.lower()
|
||||
|
||||
if "twitter.com" in url_lower or "x.com" in url_lower:
|
||||
return "twitter"
|
||||
elif "facebook.com" in url_lower or "fb.com" in url_lower:
|
||||
return "facebook"
|
||||
elif "instagram.com" in url_lower:
|
||||
return "instagram"
|
||||
elif "tiktok.com" in url_lower:
|
||||
return "tiktok"
|
||||
elif "youtube.com" in url_lower or "youtu.be" in url_lower:
|
||||
return "youtube"
|
||||
elif "reddit.com" in url_lower:
|
||||
return "reddit"
|
||||
elif "vk.com" in url_lower:
|
||||
return "vk"
|
||||
elif "t.me" in url_lower:
|
||||
return "telegram"
|
||||
else:
|
||||
return "unknown"
|
||||
|
||||
|
||||
def flag_as_deleted(metadata, deletion_info: Dict[str, any]) -> None:
|
||||
"""
|
||||
Flags metadata object as deleted/unavailable.
|
||||
|
||||
Adds detailed deletion information to the metadata object so investigators
|
||||
know exactly why and how the deletion was detected.
|
||||
|
||||
Args:
|
||||
metadata: Metadata object to update
|
||||
deletion_info: Dictionary from detect_deletion()
|
||||
"""
|
||||
metadata.set("deletion_detected", True)
|
||||
metadata.set("deletion_indicator", deletion_info.get("indicator"))
|
||||
metadata.set("deletion_source", deletion_info.get("source"))
|
||||
metadata.set("deletion_platform", deletion_info.get("platform"))
|
||||
metadata.status = "deleted_or_unavailable"
|
||||
|
||||
logger.warning(
|
||||
f"Content marked as deleted/unavailable: "
|
||||
f"platform={deletion_info.get('platform')}, "
|
||||
f"indicator='{deletion_info.get('indicator')}', "
|
||||
f"source={deletion_info.get('source')}"
|
||||
)
|
||||
Reference in New Issue
Block a user