Add comprehensive deletion detection for removed/unavailable content

Implements issue #335: improve detection of deleted/missing posts

## Changes

### New Deletion Detection System
- Created `deletion_detection.py` utility module with platform-specific
  indicators for Twitter, Facebook, Instagram, TikTok, YouTube, Reddit,
  VK, and Telegram
- Detects deletion via HTML content, page titles, error messages, and
  video metadata
- Stores detailed deletion context (indicator, source, platform) in
  metadata for investigators

### Integration Points
- **Antibot Extractor**: Checks the page HTML and title for deletion
  indicators both when the drop-in fails to open the page and after a
  successful page load; resolves the TODO about detecting deleted videos
- **Generic Extractor**: Checks yt-dlp video data and error messages
  for deletion indicators
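- **Twitter Dropin**: When the `user` or `created_at` field is missing,
  checks the tweet data for deletion indicators before falling back to
  the existing "Error retrieving post" `ValueError`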

### Test Coverage
- Comprehensive test suite covering all platforms
- Tests for HTML, title, error message, and metadata detection
- Validates that normal content is not falsely flagged

## Impact for Conflict Documentation

This fix is critical for evidence preservation in war-torn regions:
- Investigators can now document that evidence existed but was deleted
- Prevents wasted archival attempts on deleted content
- Tracks patterns of content removal
- Preserves metadata about what was deleted and when

Twitter example: detects the message "Hmm...this page doesn't exist. Try
searching for something else" and flags the content as
`deleted_or_unavailable`.
This commit is contained in:
m4cd4r4
2025-12-14 22:03:01 +08:00
parent 56526a9ac7
commit d02e7e0f02
6 changed files with 653 additions and 4 deletions

View File

@@ -16,6 +16,7 @@ from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
from auto_archiver.modules.antibot_extractor_enricher.dropins.default import DefaultDropin
from auto_archiver.utils.misc import random_str
from auto_archiver.utils.url import is_relevant_url
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
class AntibotExtractorEnricher(Extractor, Enricher):
@@ -98,8 +99,18 @@ class AntibotExtractorEnricher(Extractor, Enricher):
dropin = self._get_suitable_dropin(url, sb)
if not dropin.open_page(url):
# TODO: could we detect deleted videos?
logger.warning("Failed to open drop-in page")
# Check for deletion indicators
page_title = sb.get_title()
html_source = sb.get_page_source()
deletion_info = detect_deletion(
html_content=html_source,
page_title=page_title,
url=url
)
if deletion_info:
flag_as_deleted(to_enrich, deletion_info)
return to_enrich
logger.warning("Failed to open drop-in page (not detected as deleted)")
return False
if self.detect_auth_wall and (dropin.hit_auth_wall() and self._hit_auth_wall(sb)):
@@ -109,7 +120,19 @@ class AntibotExtractorEnricher(Extractor, Enricher):
sb.wait_for_ready_state_complete()
sb.sleep(1) # margin for the page to load completely
to_enrich.set_title(sb.get_title())
page_title = sb.get_title()
html_source = sb.get_page_source()
# Check if the page indicates content was deleted
deletion_info = detect_deletion(
html_content=html_source,
page_title=page_title,
url=url
)
if deletion_info:
flag_as_deleted(to_enrich, deletion_info)
to_enrich.set_title(page_title)
self._enrich_html_source_code(sb, to_enrich)
self._enrich_full_page_screenshot(sb, to_enrich)

View File

@@ -21,6 +21,7 @@ from auto_archiver.core.extractor import Extractor
from auto_archiver.core import Metadata, Media
from auto_archiver.utils import get_datetime_from_str
from auto_archiver.utils.misc import ydl_entry_to_filename
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
from .dropin import GenericDropin
@@ -484,6 +485,16 @@ class GenericExtractor(Extractor):
# don't download since it can be a live stream
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
# Check for deletion indicators in video data
deletion_info = detect_deletion(
video_data=data,
url=url
)
if deletion_info:
result = Metadata()
flag_as_deleted(result, deletion_info)
return result
result = _helper_for_successful_extract_info(data, info_extractor, url, ydl)
except MaxDownloadsReached:
@@ -503,6 +514,13 @@ class GenericExtractor(Extractor):
try:
result = self.get_metadata_for_post(info_extractor, url, ydl)
except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
# Check if the error indicates deletion
deletion_info = detect_deletion(error_message=str(post_e), url=url)
if deletion_info:
result = Metadata()
flag_as_deleted(result, deletion_info)
return result
if "NSFW tweet requires authentication." in str(post_e):
logger.warning(str(post_e))
return False

View File

@@ -7,6 +7,7 @@ from slugify import slugify
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.utils import url as UrlUtil, get_datetime_from_str
from auto_archiver.core.extractor import Extractor
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
from auto_archiver.modules.generic_extractor.dropin import GenericDropin, InfoExtractor
@@ -37,7 +38,17 @@ class Twitter(GenericDropin):
result = Metadata()
try:
if not tweet.get("user") or not tweet.get("created_at"):
raise ValueError("Error retreiving post. Are you sure it exists?")
# Check for deletion indicators
deletion_info = detect_deletion(
video_data=tweet,
url=url,
error_message="Missing user or created_at fields"
)
if deletion_info:
flag_as_deleted(result, deletion_info)
return result
raise ValueError("Error retrieving post. Are you sure it exists?")
timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
except (ValueError, KeyError) as ex:
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")