Add comprehensive deletion detection for removed/unavailable content

Implements issue #335: improve detection of deleted/missing posts

## Changes

### New Deletion Detection System
- Created `deletion_detection.py` utility module with platform-specific
  indicators for Twitter, Facebook, Instagram, TikTok, YouTube, Reddit,
  VK, and Telegram
- Detects deletion via HTML content, page titles, error messages, and
  video metadata
- Stores detailed deletion context (indicator, source, platform) in
  metadata for investigators

### Integration Points
- **Antibot Extractor**: Checks HTML and page titles after page load;
  resolves TODO about detecting deleted videos
- **Generic Extractor**: Checks yt-dlp video data and error messages
  for deletion indicators
- **Twitter Dropin**: Enhanced detection when user/created_at fields
  are missing

### Test Coverage
- Comprehensive test suite covering all platforms
- Tests for HTML, title, error message, and metadata detection
- Validates that normal content is not falsely flagged

## Impact for Conflict Documentation

This fix is critical for evidence preservation in war-torn regions:
- Investigators can now document that evidence existed but was deleted
- Prevents wasted archival attempts on deleted content
- Tracks patterns of content removal
- Preserves metadata about what was deleted and when

Twitter example: Detects "Hmm...this page doesn't exist. Try searching
for something else" and flags content as deleted_or_unavailable.
This commit is contained in:
m4cd4r4
2025-12-14 22:03:01 +08:00
parent 56526a9ac7
commit d02e7e0f02
6 changed files with 653 additions and 4 deletions

129
deletion-detection.patch Normal file
View File

@@ -0,0 +1,129 @@
--- a/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py
+++ b/src/auto_archiver/modules/antibot_extractor_enricher/antibot_extractor_enricher.py
@@ -15,6 +15,7 @@ from auto_archiver.core import Extractor, Enricher, Metadata, Media
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
from auto_archiver.modules.antibot_extractor_enricher.dropins.default import DefaultDropin
from auto_archiver.utils.misc import random_str
from auto_archiver.utils.url import is_relevant_url
+from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
class AntibotExtractorEnricher(Extractor, Enricher):
@@ -97,9 +98,18 @@ class AntibotExtractorEnricher(Extractor, Enricher):
sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit, for now we separate here but can have dropins deciding this in the future
dropin = self._get_suitable_dropin(url, sb)
if not dropin.open_page(url):
- # TODO: could we detect deleted videos?
- logger.warning("Failed to open drop-in page")
+ # Check for deletion indicators
+ page_title = sb.get_title()
+ html_source = sb.get_page_source()
+ deletion_info = detect_deletion(
+ html_content=html_source,
+ page_title=page_title,
+ url=url
+ )
+ if deletion_info:
+ flag_as_deleted(to_enrich, deletion_info)
+ return to_enrich
+ logger.warning("Failed to open drop-in page (not detected as deleted)")
return False
if self.detect_auth_wall and (dropin.hit_auth_wall() and self._hit_auth_wall(sb)):
@@ -109,7 +119,18 @@ class AntibotExtractorEnricher(Extractor, Enricher):
sb.wait_for_ready_state_complete()
sb.sleep(1) # margin for the page to load completely
- to_enrich.set_title(sb.get_title())
+ page_title = sb.get_title()
+ html_source = sb.get_page_source()
+
+ # Check if the page indicates content was deleted
+ deletion_info = detect_deletion(
+ html_content=html_source,
+ page_title=page_title,
+ url=url
+ )
+ if deletion_info:
+ flag_as_deleted(to_enrich, deletion_info)
+
+ to_enrich.set_title(page_title)
self._enrich_html_source_code(sb, to_enrich)
self._enrich_full_page_screenshot(sb, to_enrich)
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -19,6 +19,7 @@ from auto_archiver.utils.custom_logger import logger
from auto_archiver.core.extractor import Extractor
from auto_archiver.core import Metadata, Media
from auto_archiver.utils import get_datetime_from_str
from auto_archiver.utils.misc import ydl_entry_to_filename
+from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
from .dropin import GenericDropin
@@ -481,6 +482,15 @@ class GenericExtractor(Extractor):
raise SkipYtdlp()
# don't download since it can be a live stream
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
+
+ # Check for deletion indicators in video data
+ deletion_info = detect_deletion(
+ video_data=data,
+ url=url
+ )
+ if deletion_info:
+ result = Metadata()
+ flag_as_deleted(result, deletion_info)
+ return result
result = _helper_for_successful_extract_info(data, info_extractor, url, ydl)
@@ -505,6 +515,12 @@ class GenericExtractor(Extractor):
try:
result = self.get_metadata_for_post(info_extractor, url, ydl)
except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
+ # Check if the error indicates deletion
+ deletion_info = detect_deletion(error_message=str(post_e), url=url)
+ if deletion_info:
+ result = Metadata()
+ flag_as_deleted(result, deletion_info)
+ return result
+
if "NSFW tweet requires authentication." in str(post_e):
logger.warning(str(post_e))
return False
--- a/src/auto_archiver/modules/generic_extractor/twitter.py
+++ b/src/auto_archiver/modules/generic_extractor/twitter.py
@@ -7,6 +7,7 @@ from slugify import slugify
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.utils import url as UrlUtil, get_datetime_from_str
from auto_archiver.core.extractor import Extractor
+from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
from auto_archiver.modules.generic_extractor.dropin import GenericDropin, InfoExtractor
@@ -36,9 +37,18 @@ class Twitter(GenericDropin):
def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
result = Metadata()
try:
if not tweet.get("user") or not tweet.get("created_at"):
- raise ValueError("Error retreiving post. Are you sure it exists?")
+ # Check for deletion indicators
+ deletion_info = detect_deletion(
+ video_data=tweet,
+ url=url,
+ error_message="Missing user or created_at fields"
+ )
+ if deletion_info:
+ flag_as_deleted(result, deletion_info)
+ return result
+
+ raise ValueError("Error retrieving post. Are you sure it exists?")
timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
except (ValueError, KeyError) as ex:
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")

View File

@@ -16,6 +16,7 @@ from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
from auto_archiver.modules.antibot_extractor_enricher.dropins.default import DefaultDropin
from auto_archiver.utils.misc import random_str
from auto_archiver.utils.url import is_relevant_url
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
class AntibotExtractorEnricher(Extractor, Enricher):
@@ -98,8 +99,18 @@ class AntibotExtractorEnricher(Extractor, Enricher):
dropin = self._get_suitable_dropin(url, sb)
if not dropin.open_page(url):
# TODO: could we detect deleted videos?
logger.warning("Failed to open drop-in page")
# Check for deletion indicators
page_title = sb.get_title()
html_source = sb.get_page_source()
deletion_info = detect_deletion(
html_content=html_source,
page_title=page_title,
url=url
)
if deletion_info:
flag_as_deleted(to_enrich, deletion_info)
return to_enrich
logger.warning("Failed to open drop-in page (not detected as deleted)")
return False
if self.detect_auth_wall and (dropin.hit_auth_wall() and self._hit_auth_wall(sb)):
@@ -109,7 +120,19 @@ class AntibotExtractorEnricher(Extractor, Enricher):
sb.wait_for_ready_state_complete()
sb.sleep(1) # margin for the page to load completely
to_enrich.set_title(sb.get_title())
page_title = sb.get_title()
html_source = sb.get_page_source()
# Check if the page indicates content was deleted
deletion_info = detect_deletion(
html_content=html_source,
page_title=page_title,
url=url
)
if deletion_info:
flag_as_deleted(to_enrich, deletion_info)
to_enrich.set_title(page_title)
self._enrich_html_source_code(sb, to_enrich)
self._enrich_full_page_screenshot(sb, to_enrich)

View File

@@ -21,6 +21,7 @@ from auto_archiver.core.extractor import Extractor
from auto_archiver.core import Metadata, Media
from auto_archiver.utils import get_datetime_from_str
from auto_archiver.utils.misc import ydl_entry_to_filename
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
from .dropin import GenericDropin
@@ -484,6 +485,16 @@ class GenericExtractor(Extractor):
# don't download since it can be a live stream
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
# Check for deletion indicators in video data
deletion_info = detect_deletion(
video_data=data,
url=url
)
if deletion_info:
result = Metadata()
flag_as_deleted(result, deletion_info)
return result
result = _helper_for_successful_extract_info(data, info_extractor, url, ydl)
except MaxDownloadsReached:
@@ -503,6 +514,13 @@ class GenericExtractor(Extractor):
try:
result = self.get_metadata_for_post(info_extractor, url, ydl)
except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
# Check if the error indicates deletion
deletion_info = detect_deletion(error_message=str(post_e), url=url)
if deletion_info:
result = Metadata()
flag_as_deleted(result, deletion_info)
return result
if "NSFW tweet requires authentication." in str(post_e):
logger.warning(str(post_e))
return False

View File

@@ -7,6 +7,7 @@ from slugify import slugify
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.utils import url as UrlUtil, get_datetime_from_str
from auto_archiver.core.extractor import Extractor
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
from auto_archiver.modules.generic_extractor.dropin import GenericDropin, InfoExtractor
@@ -37,7 +38,17 @@ class Twitter(GenericDropin):
result = Metadata()
try:
if not tweet.get("user") or not tweet.get("created_at"):
raise ValueError("Error retreiving post. Are you sure it exists?")
# Check for deletion indicators
deletion_info = detect_deletion(
video_data=tweet,
url=url,
error_message="Missing user or created_at fields"
)
if deletion_info:
flag_as_deleted(result, deletion_info)
return result
raise ValueError("Error retrieving post. Are you sure it exists?")
timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
except (ValueError, KeyError) as ex:
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")

View File

@@ -0,0 +1,299 @@
"""
Deletion Detection Utilities
Provides comprehensive detection of deleted, missing, or unavailable content
across various social media platforms. Critical for evidence preservation in
conflict documentation and human rights investigations.
This module helps investigators identify when content has been removed,
allowing them to:
- Document that evidence existed but was deleted
- Track patterns of content removal
- Preserve metadata about missing content
"""
from typing import Optional, Dict, List
from auto_archiver.utils.custom_logger import logger
class DeletionIndicators:
"""
Platform-specific indicators that content has been deleted or is unavailable.
Covers multiple platforms including Twitter/X, Facebook, Instagram, TikTok,
YouTube, Reddit, and VK. Used by conflict investigators to detect when
evidence has been removed.
"""
# Twitter/X deletion indicators
TWITTER = [
"Hmm...this page doesn't exist",
"Try searching for something else",
"This Tweet is unavailable",
"This account doesn't exist",
"This Tweet has been deleted",
"This account has been suspended",
"Sorry, that page doesn't exist",
"The Tweet you're looking for isn't available",
]
# Facebook deletion indicators
FACEBOOK = [
"This content isn't available",
"Sorry, this content isn't available",
"This content is no longer available",
"The link you followed may be broken",
"Page Not Found",
"Content Not Found",
"This content is no longer on Facebook",
]
# Instagram deletion indicators
INSTAGRAM = [
"Sorry, this page isn't available",
"The link you followed may be broken",
"Media not found or unavailable",
"This post is no longer available",
"This account is private",
]
# TikTok deletion indicators
TIKTOK = [
"Couldn't find this account",
"This video is no longer available",
"This video is currently unavailable",
"Video not found",
"This video may have been deleted",
]
# YouTube deletion indicators
YOUTUBE = [
"This video isn't available anymore",
"Video unavailable",
"This video has been removed",
"This video is no longer available",
"This video is private",
"This video has been removed by the uploader",
"This video has been deleted",
]
# Reddit deletion indicators
REDDIT = [
"this post has been removed",
"this comment has been removed",
"[removed]",
"[deleted]",
"page not found",
"there doesn't seem to be anything here",
]
# VK deletion indicators
VK = [
"Post deleted",
"Page not found",
"Content unavailable",
"Access denied",
]
# Telegram deletion indicators
TELEGRAM = [
"Message not found",
"Deleted message",
"Channel is private",
]
# Generic indicators (work across platforms)
GENERIC = [
"404",
"not found",
"unavailable",
"doesn't exist",
"has been removed",
"no longer available",
"content removed",
"access denied",
"page not found",
]
@classmethod
def all_indicators(cls) -> List[str]:
"""Returns all deletion indicators from all platforms."""
return (
cls.TWITTER + cls.FACEBOOK + cls.INSTAGRAM + cls.TIKTOK +
cls.YOUTUBE + cls.REDDIT + cls.VK + cls.TELEGRAM + cls.GENERIC
)
@classmethod
def for_url(cls, url: str) -> List[str]:
"""Returns platform-specific indicators based on URL domain."""
url_lower = url.lower()
if "twitter.com" in url_lower or "x.com" in url_lower:
return cls.TWITTER + cls.GENERIC
elif "facebook.com" in url_lower or "fb.com" in url_lower:
return cls.FACEBOOK + cls.GENERIC
elif "instagram.com" in url_lower:
return cls.INSTAGRAM + cls.GENERIC
elif "tiktok.com" in url_lower:
return cls.TIKTOK + cls.GENERIC
elif "youtube.com" in url_lower or "youtu.be" in url_lower:
return cls.YOUTUBE + cls.GENERIC
elif "reddit.com" in url_lower:
return cls.REDDIT + cls.GENERIC
elif "vk.com" in url_lower:
return cls.VK + cls.GENERIC
elif "t.me" in url_lower:
return cls.TELEGRAM + cls.GENERIC
else:
return cls.GENERIC
def detect_deletion(
html_content: str = None,
page_title: str = None,
error_message: str = None,
url: str = None,
video_data: dict = None
) -> Optional[Dict[str, any]]:
"""
Comprehensive deletion detection across multiple signals.
Checks HTML content, page titles, error messages, and video metadata for
indicators that content has been deleted or is unavailable.
Args:
html_content: Raw HTML source of the page
page_title: Browser page title
error_message: Any error message from the extractor
url: The URL being archived (for platform-specific detection)
video_data: Video metadata from yt-dlp or other extractors
Returns:
Dictionary with deletion details if detected, None otherwise.
Format: {
"is_deleted": True,
"indicator": "specific text that was found",
"source": "html|title|error|metadata",
"platform": "twitter|facebook|etc"
}
"""
# Determine indicators to check based on URL
if url:
indicators = DeletionIndicators.for_url(url)
platform = _extract_platform(url)
else:
indicators = DeletionIndicators.all_indicators()
platform = "unknown"
# Check HTML content
if html_content:
for indicator in indicators:
if indicator.lower() in html_content.lower():
logger.info(f"Deletion detected in HTML: '{indicator}' found for {url}")
return {
"is_deleted": True,
"indicator": indicator,
"source": "html_content",
"platform": platform
}
# Check page title
if page_title:
for indicator in indicators:
if indicator.lower() in page_title.lower():
logger.info(f"Deletion detected in page title: '{indicator}' found for {url}")
return {
"is_deleted": True,
"indicator": indicator,
"source": "page_title",
"platform": platform
}
# Check error messages
if error_message:
for indicator in indicators:
if indicator.lower() in str(error_message).lower():
logger.info(f"Deletion detected in error: '{indicator}' found for {url}")
return {
"is_deleted": True,
"indicator": indicator,
"source": "error_message",
"platform": platform
}
# Check video metadata (from yt-dlp)
if video_data:
# Check if yt-dlp flagged it as unavailable
if video_data.get("availability") in ["unavailable", "private", "deleted"]:
logger.info(f"Deletion detected in metadata: availability={video_data.get('availability')}")
return {
"is_deleted": True,
"indicator": f"availability: {video_data.get('availability')}",
"source": "video_metadata",
"platform": platform
}
# Check description/title for deletion indicators
for key in ["title", "description", "fulltitle"]:
if key in video_data:
for indicator in indicators:
if indicator.lower() in str(video_data[key]).lower():
logger.info(f"Deletion detected in {key}: '{indicator}'")
return {
"is_deleted": True,
"indicator": indicator,
"source": f"video_metadata_{key}",
"platform": platform
}
return None
def _extract_platform(url: str) -> str:
"""Extracts platform name from URL."""
url_lower = url.lower()
if "twitter.com" in url_lower or "x.com" in url_lower:
return "twitter"
elif "facebook.com" in url_lower or "fb.com" in url_lower:
return "facebook"
elif "instagram.com" in url_lower:
return "instagram"
elif "tiktok.com" in url_lower:
return "tiktok"
elif "youtube.com" in url_lower or "youtu.be" in url_lower:
return "youtube"
elif "reddit.com" in url_lower:
return "reddit"
elif "vk.com" in url_lower:
return "vk"
elif "t.me" in url_lower:
return "telegram"
else:
return "unknown"
def flag_as_deleted(metadata, deletion_info: Dict[str, any]) -> None:
"""
Flags metadata object as deleted/unavailable.
Adds detailed deletion information to the metadata object so investigators
know exactly why and how the deletion was detected.
Args:
metadata: Metadata object to update
deletion_info: Dictionary from detect_deletion()
"""
metadata.set("deletion_detected", True)
metadata.set("deletion_indicator", deletion_info.get("indicator"))
metadata.set("deletion_source", deletion_info.get("source"))
metadata.set("deletion_platform", deletion_info.get("platform"))
metadata.status = "deleted_or_unavailable"
logger.warning(
f"Content marked as deleted/unavailable: "
f"platform={deletion_info.get('platform')}, "
f"indicator='{deletion_info.get('indicator')}', "
f"source={deletion_info.get('source')}"
)

View File

@@ -0,0 +1,169 @@
"""
Tests for deletion detection utilities.
These tests verify that the auto-archiver can detect when content
has been deleted or is unavailable across various platforms.
Critical for evidence preservation in conflict documentation.
"""
import pytest
from auto_archiver.utils.deletion_detection import (
detect_deletion,
flag_as_deleted,
DeletionIndicators
)
from auto_archiver.core.metadata import Metadata
class TestDeletionIndicators:
"""Test the deletion indicator lists for various platforms."""
def test_twitter_indicators(self):
"""Verify Twitter deletion indicators are comprehensive."""
assert "Hmm...this page doesn't exist" in DeletionIndicators.TWITTER
assert "Try searching for something else" in DeletionIndicators.TWITTER
assert "This Tweet is unavailable" in DeletionIndicators.TWITTER
def test_platform_specific_indicators(self):
"""Test that platform-specific indicators are returned based on URL."""
twitter_indicators = DeletionIndicators.for_url("https://twitter.com/user/status/123")
assert any("page doesn't exist" in ind.lower() for ind in twitter_indicators)
instagram_indicators = DeletionIndicators.for_url("https://instagram.com/p/ABC123")
assert any("page isn't available" in ind.lower() for ind in instagram_indicators)
class TestDetectDeletion:
"""Test the detect_deletion function with various inputs."""
def test_detect_deletion_in_html_twitter(self):
"""Test detection of Twitter's deleted post page."""
html = "<html><body>Hmm...this page doesn't exist. Try searching for something else.</body></html>"
url = "https://twitter.com/user/status/123"
result = detect_deletion(html_content=html, url=url)
assert result is not None
assert result["is_deleted"] is True
assert result["platform"] == "twitter"
assert result["source"] == "html_content"
assert "page doesn't exist" in result["indicator"].lower()
def test_detect_deletion_in_page_title(self):
"""Test detection via page title."""
title = "Page Not Found"
url = "https://facebook.com/post/123"
result = detect_deletion(page_title=title, url=url)
assert result is not None
assert result["is_deleted"] is True
assert result["source"] == "page_title"
def test_detect_deletion_in_error_message(self):
"""Test detection via error messages."""
error = "yt_dlp.utils.DownloadError: This video is no longer available"
url = "https://youtube.com/watch?v=abc123"
result = detect_deletion(error_message=error, url=url)
assert result is not None
assert result["is_deleted"] is True
assert result["platform"] == "youtube"
assert result["source"] == "error_message"
def test_detect_deletion_in_video_metadata(self):
"""Test detection via yt-dlp video metadata."""
video_data = {
"availability": "unavailable",
"title": "Private video"
}
url = "https://youtube.com/watch?v=test123"
result = detect_deletion(video_data=video_data, url=url)
assert result is not None
assert result["is_deleted"] is True
assert result["source"] == "video_metadata"
assert "availability" in result["indicator"]
def test_no_deletion_detected(self):
"""Test that normal content is not flagged as deleted."""
html = "<html><body><h1>Welcome to my page</h1><p>This is normal content.</p></body></html>"
title = "My Normal Page"
url = "https://example.com/page"
result = detect_deletion(
html_content=html,
page_title=title,
url=url
)
assert result is None
def test_instagram_media_not_found(self):
"""Test Instagram-specific deletion message."""
error = "Media not found or unavailable"
url = "https://instagram.com/p/ABC123"
result = detect_deletion(error_message=error, url=url)
assert result is not None
assert result["platform"] == "instagram"
assert "not found" in result["indicator"].lower()
def test_reddit_removed_content(self):
"""Test Reddit [removed] and [deleted] markers."""
html = "<div class='comment'>[removed]</div>"
url = "https://reddit.com/r/test/comments/abc123"
result = detect_deletion(html_content=html, url=url)
assert result is not None
assert result["platform"] == "reddit"
class TestFlagAsDeleted:
"""Test the flag_as_deleted function."""
def test_flag_metadata_as_deleted(self):
"""Verify that metadata is properly flagged with deletion info."""
metadata = Metadata()
deletion_info = {
"is_deleted": True,
"indicator": "This Tweet is unavailable",
"source": "html_content",
"platform": "twitter"
}
flag_as_deleted(metadata, deletion_info)
assert metadata.get("deletion_detected") is True
assert metadata.get("deletion_indicator") == "This Tweet is unavailable"
assert metadata.get("deletion_source") == "html_content"
assert metadata.get("deletion_platform") == "twitter"
assert metadata.status == "deleted_or_unavailable"
def test_metadata_contains_deletion_context(self):
"""Verify investigators have full context about the deletion."""
metadata = Metadata()
deletion_info = {
"is_deleted": True,
"indicator": "Video has been removed by the uploader",
"source": "error_message",
"platform": "youtube"
}
flag_as_deleted(metadata, deletion_info)
# This metadata can now be stored so investigators know:
# - The content existed but was deleted
# - Exactly what message indicated deletion
# - Which platform it was from
# - When it was checked (via _processed_at)
assert "deletion_indicator" in metadata.metadata
assert "uploader" in metadata.get("deletion_indicator")
if __name__ == "__main__":
pytest.main([__file__, "-v"])