updates new utils file and test

This commit is contained in:
msramalho
2026-01-08 14:54:06 +00:00
parent 68f672a4fa
commit a936921c4e
2 changed files with 55 additions and 103 deletions

View File

@@ -1,28 +1,23 @@
""" """
Deletion Detection Utilities Deletion Detection Utilities
Provides comprehensive detection of deleted, missing, or unavailable content Provides best-effort detection of deleted, missing, or unavailable content
across various social media platforms. Critical for evidence preservation in across various social media platforms based on presence of expected keywords.
conflict documentation and human rights investigations.
This module helps investigators identify when content has been removed, This module helps identify removed content, making it possible to:
allowing them to: - Document content that existed but was deleted
- Document that evidence existed but was deleted
- Track patterns of content removal - Track patterns of content removal
- Preserve metadata about missing content - Preserve metadata about missing content
""" """
from typing import Optional, Dict, List from typing import Optional, Dict, List
from auto_archiver.utils.custom_logger import logger from auto_archiver.utils.custom_logger import logger
from urllib.parse import urlparse
class DeletionIndicators: class DeletionIndicators:
""" """
Platform-specific indicators that content has been deleted or is unavailable. Platform-specific indicators that content has been deleted or is unavailable, alongside generic indicators.
Covers multiple platforms including Twitter/X, Facebook, Instagram, TikTok,
YouTube, Reddit, and VK. Used by conflict investigators to detect when
evidence has been removed.
""" """
# Twitter/X deletion indicators # Twitter/X deletion indicators
@@ -104,10 +99,6 @@ class DeletionIndicators:
# Generic indicators (work across platforms) # Generic indicators (work across platforms)
GENERIC = [ GENERIC = [
"404",
"not found",
"unavailable",
"doesn't exist",
"has been removed", "has been removed",
"no longer available", "no longer available",
"content removed", "content removed",
@@ -119,33 +110,33 @@ class DeletionIndicators:
def all_indicators(cls) -> List[str]: def all_indicators(cls) -> List[str]:
"""Returns all deletion indicators from all platforms.""" """Returns all deletion indicators from all platforms."""
return ( return (
cls.TWITTER + cls.FACEBOOK + cls.INSTAGRAM + cls.TIKTOK + cls.TWITTER
cls.YOUTUBE + cls.REDDIT + cls.VK + cls.TELEGRAM + cls.GENERIC + cls.FACEBOOK
+ cls.INSTAGRAM
+ cls.TIKTOK
+ cls.YOUTUBE
+ cls.REDDIT
+ cls.VK
+ cls.TELEGRAM
+ cls.GENERIC
) )
@classmethod
def for_url(cls, url: str) -> List[str]:
    """Return the deletion indicators that apply to ``url``.

    The platform is resolved with ``_extract_platform``. The result is that
    platform's indicators followed by the generic ones; URLs whose platform
    is not recognised get the generic indicators only.

    Args:
        url: URL of the content being checked.

    Returns:
        A newly built list of indicator strings, platform-specific entries
        first. The list is never one of the class-level constants, so
        callers may modify it freely.
    """
    platform_specific = {
        "twitter": cls.TWITTER,
        "facebook": cls.FACEBOOK,
        "instagram": cls.INSTAGRAM,
        "tiktok": cls.TIKTOK,
        "youtube": cls.YOUTUBE,
        "reddit": cls.REDDIT,
        "vk": cls.VK,
        "telegram": cls.TELEGRAM,
    }
    # Build a fresh list in every case: returning cls.GENERIC itself for an
    # unrecognised platform would hand out the shared class-level list, and
    # a caller mutating it would silently change the indicators for everyone.
    return [*platform_specific.get(_extract_platform(url), []), *cls.GENERIC]
def detect_deletion( def detect_deletion(
@@ -153,10 +144,10 @@ def detect_deletion(
page_title: str = None, page_title: str = None,
error_message: str = None, error_message: str = None,
url: str = None, url: str = None,
video_data: dict = None video_data: dict = None,
) -> Optional[Dict[str, any]]: ) -> Optional[Dict[str, any]]:
""" """
Comprehensive deletion detection across multiple signals. Best-effort deletion detection across multiple signals.
Checks HTML content, page titles, error messages, and video metadata for Checks HTML content, page titles, error messages, and video metadata for
indicators that content has been deleted or is unavailable. indicators that content has been deleted or is unavailable.
@@ -191,36 +182,21 @@ def detect_deletion(
for indicator in indicators: for indicator in indicators:
if indicator.lower() in html_content.lower(): if indicator.lower() in html_content.lower():
logger.info(f"Deletion detected in HTML: '{indicator}' found for {url}") logger.info(f"Deletion detected in HTML: '{indicator}' found for {url}")
return { return {"is_deleted": True, "indicator": indicator, "source": "html_content", "platform": platform}
"is_deleted": True,
"indicator": indicator,
"source": "html_content",
"platform": platform
}
# Check page title # Check page title
if page_title: if page_title:
for indicator in indicators: for indicator in indicators:
if indicator.lower() in page_title.lower(): if indicator.lower() in page_title.lower():
logger.info(f"Deletion detected in page title: '{indicator}' found for {url}") logger.info(f"Deletion detected in page title: '{indicator}' found for {url}")
return { return {"is_deleted": True, "indicator": indicator, "source": "page_title", "platform": platform}
"is_deleted": True,
"indicator": indicator,
"source": "page_title",
"platform": platform
}
# Check error messages # Check error messages
if error_message: if error_message:
for indicator in indicators: for indicator in indicators:
if indicator.lower() in str(error_message).lower(): if indicator.lower() in str(error_message).lower():
logger.info(f"Deletion detected in error: '{indicator}' found for {url}") logger.info(f"Deletion detected in error: '{indicator}' found for {url}")
return { return {"is_deleted": True, "indicator": indicator, "source": "error_message", "platform": platform}
"is_deleted": True,
"indicator": indicator,
"source": "error_message",
"platform": platform
}
# Check video metadata (from yt-dlp) # Check video metadata (from yt-dlp)
if video_data: if video_data:
@@ -231,7 +207,7 @@ def detect_deletion(
"is_deleted": True, "is_deleted": True,
"indicator": f"availability: {video_data.get('availability')}", "indicator": f"availability: {video_data.get('availability')}",
"source": "video_metadata", "source": "video_metadata",
"platform": platform "platform": platform,
} }
# Check description/title for deletion indicators # Check description/title for deletion indicators
@@ -244,7 +220,7 @@ def detect_deletion(
"is_deleted": True, "is_deleted": True,
"indicator": indicator, "indicator": indicator,
"source": f"video_metadata_{key}", "source": f"video_metadata_{key}",
"platform": platform "platform": platform,
} }
return None return None
@@ -252,34 +228,32 @@ def detect_deletion(
def _extract_platform(url: str) -> str: def _extract_platform(url: str) -> str:
"""Extracts platform name from URL.""" """Extracts platform name from URL."""
url_lower = url.lower() parsed = urlparse(url)
domain = parsed.netloc
if "twitter.com" in url_lower or "x.com" in url_lower: if "twitter.com" in domain or "x.com" in domain:
return "twitter" return "twitter"
elif "facebook.com" in url_lower or "fb.com" in url_lower: elif "facebook.com" in domain or "fb.com" in domain:
return "facebook" return "facebook"
elif "instagram.com" in url_lower: elif "instagram.com" in domain:
return "instagram" return "instagram"
elif "tiktok.com" in url_lower: elif "tiktok.com" in domain:
return "tiktok" return "tiktok"
elif "youtube.com" in url_lower or "youtu.be" in url_lower: elif "youtube.com" in domain or "youtu.be" in domain:
return "youtube" return "youtube"
elif "reddit.com" in url_lower: elif "reddit.com" in domain:
return "reddit" return "reddit"
elif "vk.com" in url_lower: elif "vk.com" in domain:
return "vk" return "vk"
elif "t.me" in url_lower: elif "t.me" in domain:
return "telegram" return "telegram"
else: return "unknown"
return "unknown"
def flag_as_deleted(metadata, deletion_info: Dict[str, any]) -> None: def flag_as_deleted(metadata, deletion_info: Dict[str, any]) -> None:
""" """
Flags metadata object as deleted/unavailable. Flags metadata object as deleted/unavailable.
Adds tentative deletion information to the metadata object.
Adds detailed deletion information to the metadata object so investigators
know exactly why and how the deletion was detected.
Args: Args:
metadata: Metadata object to update metadata: Metadata object to update
@@ -291,7 +265,7 @@ def flag_as_deleted(metadata, deletion_info: Dict[str, any]) -> None:
metadata.set("deletion_platform", deletion_info.get("platform")) metadata.set("deletion_platform", deletion_info.get("platform"))
metadata.status = "deleted_or_unavailable" metadata.status = "deleted_or_unavailable"
logger.warning( logger.debug(
f"Content marked as deleted/unavailable: " f"Content marked as deleted/unavailable: "
f"platform={deletion_info.get('platform')}, " f"platform={deletion_info.get('platform')}, "
f"indicator='{deletion_info.get('indicator')}', " f"indicator='{deletion_info.get('indicator')}', "

View File

@@ -1,17 +1,12 @@
""" """
Tests for deletion detection utilities. Tests for deletion detection utilities.
These tests verify that the auto-archiver can detect when content These tests verify the auto-archiver's current best-effort attempt
has been deleted or is unavailable across various platforms. to detect when content has been deleted or is unavailable across
Critical for evidence preservation in conflict documentation. various platforms.
""" """
import pytest from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted, DeletionIndicators
from auto_archiver.utils.deletion_detection import (
detect_deletion,
flag_as_deleted,
DeletionIndicators
)
from auto_archiver.core.metadata import Metadata from auto_archiver.core.metadata import Metadata
@@ -74,10 +69,7 @@ class TestDetectDeletion:
def test_detect_deletion_in_video_metadata(self): def test_detect_deletion_in_video_metadata(self):
"""Test detection via yt-dlp video metadata.""" """Test detection via yt-dlp video metadata."""
video_data = { video_data = {"availability": "unavailable", "title": "Private video"}
"availability": "unavailable",
"title": "Private video"
}
url = "https://youtube.com/watch?v=test123" url = "https://youtube.com/watch?v=test123"
result = detect_deletion(video_data=video_data, url=url) result = detect_deletion(video_data=video_data, url=url)
@@ -93,11 +85,7 @@ class TestDetectDeletion:
title = "My Normal Page" title = "My Normal Page"
url = "https://example.com/page" url = "https://example.com/page"
result = detect_deletion( result = detect_deletion(html_content=html, page_title=title, url=url)
html_content=html,
page_title=title,
url=url
)
assert result is None assert result is None
@@ -133,7 +121,7 @@ class TestFlagAsDeleted:
"is_deleted": True, "is_deleted": True,
"indicator": "This Tweet is unavailable", "indicator": "This Tweet is unavailable",
"source": "html_content", "source": "html_content",
"platform": "twitter" "platform": "twitter",
} }
flag_as_deleted(metadata, deletion_info) flag_as_deleted(metadata, deletion_info)
@@ -151,19 +139,9 @@ class TestFlagAsDeleted:
"is_deleted": True, "is_deleted": True,
"indicator": "Video has been removed by the uploader", "indicator": "Video has been removed by the uploader",
"source": "error_message", "source": "error_message",
"platform": "youtube" "platform": "youtube",
} }
flag_as_deleted(metadata, deletion_info) flag_as_deleted(metadata, deletion_info)
# This metadata can now be stored so investigators know:
# - The content existed but was deleted
# - Exactly what message indicated deletion
# - Which platform it was from
# - When it was checked (via _processed_at)
assert "deletion_indicator" in metadata.metadata assert "deletion_indicator" in metadata.metadata
assert "uploader" in metadata.get("deletion_indicator") assert "uploader" in metadata.get("deletion_indicator")
if __name__ == "__main__":
pytest.main([__file__, "-v"])