mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-08 03:18:28 +03:00
Add comprehensive deletion detection for removed/unavailable content
Implements issue #335: improve detection of deleted/missing posts

## Changes

### New Deletion Detection System

- Created `deletion_detection.py` utility module with platform-specific indicators for Twitter, Facebook, Instagram, TikTok, YouTube, Reddit, VK, and Telegram
- Detects deletion via HTML content, page titles, error messages, and video metadata
- Stores detailed deletion context (indicator, source, platform) in metadata for investigators

### Integration Points

- **Antibot Extractor**: Checks HTML and page titles after page load; resolves TODO about detecting deleted videos
- **Generic Extractor**: Checks yt-dlp video data and error messages for deletion indicators
- **Twitter Dropin**: Enhanced detection when user/created_at fields are missing

### Test Coverage

- Comprehensive test suite covering all platforms
- Tests for HTML, title, error message, and metadata detection
- Validates that normal content is not falsely flagged

## Impact for Conflict Documentation

This fix is critical for evidence preservation in war-torn regions:

- Investigators can now document that evidence existed but was deleted
- Prevents wasted archival attempts on deleted content
- Tracks patterns of content removal
- Preserves metadata about what was deleted and when

Twitter example: Detects "Hmm...this page doesn't exist. Try searching for something else" and flags content as deleted_or_unavailable.
This commit is contained in:
@@ -16,6 +16,7 @@ from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
|
||||
from auto_archiver.modules.antibot_extractor_enricher.dropins.default import DefaultDropin
|
||||
from auto_archiver.utils.misc import random_str
|
||||
from auto_archiver.utils.url import is_relevant_url
|
||||
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
|
||||
|
||||
|
||||
class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
@@ -98,8 +99,18 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
|
||||
dropin = self._get_suitable_dropin(url, sb)
|
||||
if not dropin.open_page(url):
|
||||
# TODO: could we detect deleted videos?
|
||||
logger.warning("Failed to open drop-in page")
|
||||
# Check for deletion indicators
|
||||
page_title = sb.get_title()
|
||||
html_source = sb.get_page_source()
|
||||
deletion_info = detect_deletion(
|
||||
html_content=html_source,
|
||||
page_title=page_title,
|
||||
url=url
|
||||
)
|
||||
if deletion_info:
|
||||
flag_as_deleted(to_enrich, deletion_info)
|
||||
return to_enrich
|
||||
logger.warning("Failed to open drop-in page (not detected as deleted)")
|
||||
return False
|
||||
|
||||
if self.detect_auth_wall and (dropin.hit_auth_wall() and self._hit_auth_wall(sb)):
|
||||
@@ -109,7 +120,19 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
sb.wait_for_ready_state_complete()
|
||||
sb.sleep(1) # margin for the page to load completely
|
||||
|
||||
to_enrich.set_title(sb.get_title())
|
||||
page_title = sb.get_title()
|
||||
html_source = sb.get_page_source()
|
||||
|
||||
# Check if the page indicates content was deleted
|
||||
deletion_info = detect_deletion(
|
||||
html_content=html_source,
|
||||
page_title=page_title,
|
||||
url=url
|
||||
)
|
||||
if deletion_info:
|
||||
flag_as_deleted(to_enrich, deletion_info)
|
||||
|
||||
to_enrich.set_title(page_title)
|
||||
self._enrich_html_source_code(sb, to_enrich)
|
||||
|
||||
self._enrich_full_page_screenshot(sb, to_enrich)
|
||||
|
||||
@@ -21,6 +21,7 @@ from auto_archiver.core.extractor import Extractor
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.utils import get_datetime_from_str
|
||||
from auto_archiver.utils.misc import ydl_entry_to_filename
|
||||
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
|
||||
from .dropin import GenericDropin
|
||||
|
||||
|
||||
@@ -484,6 +485,16 @@ class GenericExtractor(Extractor):
|
||||
# don't download since it can be a live stream
|
||||
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
|
||||
|
||||
# Check for deletion indicators in video data
|
||||
deletion_info = detect_deletion(
|
||||
video_data=data,
|
||||
url=url
|
||||
)
|
||||
if deletion_info:
|
||||
result = Metadata()
|
||||
flag_as_deleted(result, deletion_info)
|
||||
return result
|
||||
|
||||
result = _helper_for_successful_extract_info(data, info_extractor, url, ydl)
|
||||
|
||||
except MaxDownloadsReached:
|
||||
@@ -503,6 +514,13 @@ class GenericExtractor(Extractor):
|
||||
try:
|
||||
result = self.get_metadata_for_post(info_extractor, url, ydl)
|
||||
except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
|
||||
# Check if the error indicates deletion
|
||||
deletion_info = detect_deletion(error_message=str(post_e), url=url)
|
||||
if deletion_info:
|
||||
result = Metadata()
|
||||
flag_as_deleted(result, deletion_info)
|
||||
return result
|
||||
|
||||
if "NSFW tweet requires authentication." in str(post_e):
|
||||
logger.warning(str(post_e))
|
||||
return False
|
||||
|
||||
@@ -7,6 +7,7 @@ from slugify import slugify
|
||||
from auto_archiver.core.metadata import Metadata, Media
|
||||
from auto_archiver.utils import url as UrlUtil, get_datetime_from_str
|
||||
from auto_archiver.core.extractor import Extractor
|
||||
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
|
||||
from auto_archiver.modules.generic_extractor.dropin import GenericDropin, InfoExtractor
|
||||
|
||||
|
||||
@@ -37,7 +38,17 @@ class Twitter(GenericDropin):
|
||||
result = Metadata()
|
||||
try:
|
||||
if not tweet.get("user") or not tweet.get("created_at"):
|
||||
raise ValueError("Error retreiving post. Are you sure it exists?")
|
||||
# Check for deletion indicators
|
||||
deletion_info = detect_deletion(
|
||||
video_data=tweet,
|
||||
url=url,
|
||||
error_message="Missing user or created_at fields"
|
||||
)
|
||||
if deletion_info:
|
||||
flag_as_deleted(result, deletion_info)
|
||||
return result
|
||||
|
||||
raise ValueError("Error retrieving post. Are you sure it exists?")
|
||||
timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
|
||||
except (ValueError, KeyError) as ex:
|
||||
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
|
||||
|
||||
Reference in New Issue
Block a user