mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-08 03:18:28 +03:00
updates new utils file and test
This commit is contained in:
@@ -1,28 +1,23 @@
|
|||||||
"""
|
"""
|
||||||
Deletion Detection Utilities
|
Deletion Detection Utilities
|
||||||
|
|
||||||
Provides comprehensive detection of deleted, missing, or unavailable content
|
Provides best-effort detection of deleted, missing, or unavailable content
|
||||||
across various social media platforms. Critical for evidence preservation in
|
across various social media platforms based on the presence of expected keywords.
|
||||||
conflict documentation and human rights investigations.
|
|
||||||
|
|
||||||
This module helps investigators identify when content has been removed,
|
This module helps identify removed content, making it possible to:
|
||||||
allowing them to:
|
- Document content that existed but was deleted
|
||||||
- Document that evidence existed but was deleted
|
|
||||||
- Track patterns of content removal
|
- Track patterns of content removal
|
||||||
- Preserve metadata about missing content
|
- Preserve metadata about missing content
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from typing import Optional, Dict, List
|
from typing import Optional, Dict, List
|
||||||
from auto_archiver.utils.custom_logger import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
|
||||||
class DeletionIndicators:
|
class DeletionIndicators:
|
||||||
"""
|
"""
|
||||||
Platform-specific indicators that content has been deleted or is unavailable.
|
Platform-specific indicators that content has been deleted or is unavailable, alongside generic indicators.
|
||||||
|
|
||||||
Covers multiple platforms including Twitter/X, Facebook, Instagram, TikTok,
|
|
||||||
YouTube, Reddit, and VK. Used by conflict investigators to detect when
|
|
||||||
evidence has been removed.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Twitter/X deletion indicators
|
# Twitter/X deletion indicators
|
||||||
@@ -104,10 +99,6 @@ class DeletionIndicators:
|
|||||||
|
|
||||||
# Generic indicators (work across platforms)
|
# Generic indicators (work across platforms)
|
||||||
GENERIC = [
|
GENERIC = [
|
||||||
"404",
|
|
||||||
"not found",
|
|
||||||
"unavailable",
|
|
||||||
"doesn't exist",
|
|
||||||
"has been removed",
|
"has been removed",
|
||||||
"no longer available",
|
"no longer available",
|
||||||
"content removed",
|
"content removed",
|
||||||
@@ -119,33 +110,33 @@ class DeletionIndicators:
|
|||||||
def all_indicators(cls) -> List[str]:
|
def all_indicators(cls) -> List[str]:
|
||||||
"""Returns all deletion indicators from all platforms."""
|
"""Returns all deletion indicators from all platforms."""
|
||||||
return (
|
return (
|
||||||
cls.TWITTER + cls.FACEBOOK + cls.INSTAGRAM + cls.TIKTOK +
|
cls.TWITTER
|
||||||
cls.YOUTUBE + cls.REDDIT + cls.VK + cls.TELEGRAM + cls.GENERIC
|
+ cls.FACEBOOK
|
||||||
|
+ cls.INSTAGRAM
|
||||||
|
+ cls.TIKTOK
|
||||||
|
+ cls.YOUTUBE
|
||||||
|
+ cls.REDDIT
|
||||||
|
+ cls.VK
|
||||||
|
+ cls.TELEGRAM
|
||||||
|
+ cls.GENERIC
|
||||||
)
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def for_url(cls, url: str) -> List[str]:
|
def for_url(cls, url: str) -> List[str]:
|
||||||
"""Returns platform-specific indicators based on URL domain."""
|
"""Returns platform-specific indicators based on URL domain."""
|
||||||
url_lower = url.lower()
|
platform = _extract_platform(url)
|
||||||
|
|
||||||
if "twitter.com" in url_lower or "x.com" in url_lower:
|
indicators_map = {
|
||||||
return cls.TWITTER + cls.GENERIC
|
"twitter": cls.TWITTER + cls.GENERIC,
|
||||||
elif "facebook.com" in url_lower or "fb.com" in url_lower:
|
"facebook": cls.FACEBOOK + cls.GENERIC,
|
||||||
return cls.FACEBOOK + cls.GENERIC
|
"instagram": cls.INSTAGRAM + cls.GENERIC,
|
||||||
elif "instagram.com" in url_lower:
|
"tiktok": cls.TIKTOK + cls.GENERIC,
|
||||||
return cls.INSTAGRAM + cls.GENERIC
|
"youtube": cls.YOUTUBE + cls.GENERIC,
|
||||||
elif "tiktok.com" in url_lower:
|
"reddit": cls.REDDIT + cls.GENERIC,
|
||||||
return cls.TIKTOK + cls.GENERIC
|
"vk": cls.VK + cls.GENERIC,
|
||||||
elif "youtube.com" in url_lower or "youtu.be" in url_lower:
|
"telegram": cls.TELEGRAM + cls.GENERIC,
|
||||||
return cls.YOUTUBE + cls.GENERIC
|
}
|
||||||
elif "reddit.com" in url_lower:
|
return indicators_map.get(platform, cls.GENERIC)
|
||||||
return cls.REDDIT + cls.GENERIC
|
|
||||||
elif "vk.com" in url_lower:
|
|
||||||
return cls.VK + cls.GENERIC
|
|
||||||
elif "t.me" in url_lower:
|
|
||||||
return cls.TELEGRAM + cls.GENERIC
|
|
||||||
else:
|
|
||||||
return cls.GENERIC
|
|
||||||
|
|
||||||
|
|
||||||
def detect_deletion(
|
def detect_deletion(
|
||||||
@@ -153,10 +144,10 @@ def detect_deletion(
|
|||||||
page_title: str = None,
|
page_title: str = None,
|
||||||
error_message: str = None,
|
error_message: str = None,
|
||||||
url: str = None,
|
url: str = None,
|
||||||
video_data: dict = None
|
video_data: dict = None,
|
||||||
) -> Optional[Dict[str, any]]:
|
) -> Optional[Dict[str, any]]:
|
||||||
"""
|
"""
|
||||||
Comprehensive deletion detection across multiple signals.
|
Best-effort deletion detection across multiple signals.
|
||||||
|
|
||||||
Checks HTML content, page titles, error messages, and video metadata for
|
Checks HTML content, page titles, error messages, and video metadata for
|
||||||
indicators that content has been deleted or is unavailable.
|
indicators that content has been deleted or is unavailable.
|
||||||
@@ -191,36 +182,21 @@ def detect_deletion(
|
|||||||
for indicator in indicators:
|
for indicator in indicators:
|
||||||
if indicator.lower() in html_content.lower():
|
if indicator.lower() in html_content.lower():
|
||||||
logger.info(f"Deletion detected in HTML: '{indicator}' found for {url}")
|
logger.info(f"Deletion detected in HTML: '{indicator}' found for {url}")
|
||||||
return {
|
return {"is_deleted": True, "indicator": indicator, "source": "html_content", "platform": platform}
|
||||||
"is_deleted": True,
|
|
||||||
"indicator": indicator,
|
|
||||||
"source": "html_content",
|
|
||||||
"platform": platform
|
|
||||||
}
|
|
||||||
|
|
||||||
# Check page title
|
# Check page title
|
||||||
if page_title:
|
if page_title:
|
||||||
for indicator in indicators:
|
for indicator in indicators:
|
||||||
if indicator.lower() in page_title.lower():
|
if indicator.lower() in page_title.lower():
|
||||||
logger.info(f"Deletion detected in page title: '{indicator}' found for {url}")
|
logger.info(f"Deletion detected in page title: '{indicator}' found for {url}")
|
||||||
return {
|
return {"is_deleted": True, "indicator": indicator, "source": "page_title", "platform": platform}
|
||||||
"is_deleted": True,
|
|
||||||
"indicator": indicator,
|
|
||||||
"source": "page_title",
|
|
||||||
"platform": platform
|
|
||||||
}
|
|
||||||
|
|
||||||
# Check error messages
|
# Check error messages
|
||||||
if error_message:
|
if error_message:
|
||||||
for indicator in indicators:
|
for indicator in indicators:
|
||||||
if indicator.lower() in str(error_message).lower():
|
if indicator.lower() in str(error_message).lower():
|
||||||
logger.info(f"Deletion detected in error: '{indicator}' found for {url}")
|
logger.info(f"Deletion detected in error: '{indicator}' found for {url}")
|
||||||
return {
|
return {"is_deleted": True, "indicator": indicator, "source": "error_message", "platform": platform}
|
||||||
"is_deleted": True,
|
|
||||||
"indicator": indicator,
|
|
||||||
"source": "error_message",
|
|
||||||
"platform": platform
|
|
||||||
}
|
|
||||||
|
|
||||||
# Check video metadata (from yt-dlp)
|
# Check video metadata (from yt-dlp)
|
||||||
if video_data:
|
if video_data:
|
||||||
@@ -231,7 +207,7 @@ def detect_deletion(
|
|||||||
"is_deleted": True,
|
"is_deleted": True,
|
||||||
"indicator": f"availability: {video_data.get('availability')}",
|
"indicator": f"availability: {video_data.get('availability')}",
|
||||||
"source": "video_metadata",
|
"source": "video_metadata",
|
||||||
"platform": platform
|
"platform": platform,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Check description/title for deletion indicators
|
# Check description/title for deletion indicators
|
||||||
@@ -244,7 +220,7 @@ def detect_deletion(
|
|||||||
"is_deleted": True,
|
"is_deleted": True,
|
||||||
"indicator": indicator,
|
"indicator": indicator,
|
||||||
"source": f"video_metadata_{key}",
|
"source": f"video_metadata_{key}",
|
||||||
"platform": platform
|
"platform": platform,
|
||||||
}
|
}
|
||||||
|
|
||||||
return None
|
return None
|
||||||
@@ -252,34 +228,32 @@ def detect_deletion(
|
|||||||
|
|
||||||
def _extract_platform(url: str) -> str:
|
def _extract_platform(url: str) -> str:
|
||||||
"""Extracts platform name from URL."""
|
"""Extracts platform name from URL."""
|
||||||
url_lower = url.lower()
|
parsed = urlparse(url)
|
||||||
|
domain = parsed.netloc
|
||||||
|
|
||||||
if "twitter.com" in url_lower or "x.com" in url_lower:
|
if "twitter.com" in domain or "x.com" in domain:
|
||||||
return "twitter"
|
return "twitter"
|
||||||
elif "facebook.com" in url_lower or "fb.com" in url_lower:
|
elif "facebook.com" in domain or "fb.com" in domain:
|
||||||
return "facebook"
|
return "facebook"
|
||||||
elif "instagram.com" in url_lower:
|
elif "instagram.com" in domain:
|
||||||
return "instagram"
|
return "instagram"
|
||||||
elif "tiktok.com" in url_lower:
|
elif "tiktok.com" in domain:
|
||||||
return "tiktok"
|
return "tiktok"
|
||||||
elif "youtube.com" in url_lower or "youtu.be" in url_lower:
|
elif "youtube.com" in domain or "youtu.be" in domain:
|
||||||
return "youtube"
|
return "youtube"
|
||||||
elif "reddit.com" in url_lower:
|
elif "reddit.com" in domain:
|
||||||
return "reddit"
|
return "reddit"
|
||||||
elif "vk.com" in url_lower:
|
elif "vk.com" in domain:
|
||||||
return "vk"
|
return "vk"
|
||||||
elif "t.me" in url_lower:
|
elif "t.me" in domain:
|
||||||
return "telegram"
|
return "telegram"
|
||||||
else:
|
return "unknown"
|
||||||
return "unknown"
|
|
||||||
|
|
||||||
|
|
||||||
def flag_as_deleted(metadata, deletion_info: Dict[str, any]) -> None:
|
def flag_as_deleted(metadata, deletion_info: Dict[str, any]) -> None:
|
||||||
"""
|
"""
|
||||||
Flags metadata object as deleted/unavailable.
|
Flags metadata object as deleted/unavailable.
|
||||||
|
Adds tentative deletion information to the metadata object.
|
||||||
Adds detailed deletion information to the metadata object so investigators
|
|
||||||
know exactly why and how the deletion was detected.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
metadata: Metadata object to update
|
metadata: Metadata object to update
|
||||||
@@ -291,7 +265,7 @@ def flag_as_deleted(metadata, deletion_info: Dict[str, any]) -> None:
|
|||||||
metadata.set("deletion_platform", deletion_info.get("platform"))
|
metadata.set("deletion_platform", deletion_info.get("platform"))
|
||||||
metadata.status = "deleted_or_unavailable"
|
metadata.status = "deleted_or_unavailable"
|
||||||
|
|
||||||
logger.warning(
|
logger.debug(
|
||||||
f"Content marked as deleted/unavailable: "
|
f"Content marked as deleted/unavailable: "
|
||||||
f"platform={deletion_info.get('platform')}, "
|
f"platform={deletion_info.get('platform')}, "
|
||||||
f"indicator='{deletion_info.get('indicator')}', "
|
f"indicator='{deletion_info.get('indicator')}', "
|
||||||
|
|||||||
@@ -1,17 +1,12 @@
|
|||||||
"""
|
"""
|
||||||
Tests for deletion detection utilities.
|
Tests for deletion detection utilities.
|
||||||
|
|
||||||
These tests verify that the auto-archiver can detect when content
|
These tests verify the auto-archiver's current best-effort attempt
|
||||||
has been deleted or is unavailable across various platforms.
|
to detect when content has been deleted or is unavailable across
|
||||||
Critical for evidence preservation in conflict documentation.
|
various platforms.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import pytest
|
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted, DeletionIndicators
|
||||||
from auto_archiver.utils.deletion_detection import (
|
|
||||||
detect_deletion,
|
|
||||||
flag_as_deleted,
|
|
||||||
DeletionIndicators
|
|
||||||
)
|
|
||||||
from auto_archiver.core.metadata import Metadata
|
from auto_archiver.core.metadata import Metadata
|
||||||
|
|
||||||
|
|
||||||
@@ -74,10 +69,7 @@ class TestDetectDeletion:
|
|||||||
|
|
||||||
def test_detect_deletion_in_video_metadata(self):
|
def test_detect_deletion_in_video_metadata(self):
|
||||||
"""Test detection via yt-dlp video metadata."""
|
"""Test detection via yt-dlp video metadata."""
|
||||||
video_data = {
|
video_data = {"availability": "unavailable", "title": "Private video"}
|
||||||
"availability": "unavailable",
|
|
||||||
"title": "Private video"
|
|
||||||
}
|
|
||||||
url = "https://youtube.com/watch?v=test123"
|
url = "https://youtube.com/watch?v=test123"
|
||||||
|
|
||||||
result = detect_deletion(video_data=video_data, url=url)
|
result = detect_deletion(video_data=video_data, url=url)
|
||||||
@@ -93,11 +85,7 @@ class TestDetectDeletion:
|
|||||||
title = "My Normal Page"
|
title = "My Normal Page"
|
||||||
url = "https://example.com/page"
|
url = "https://example.com/page"
|
||||||
|
|
||||||
result = detect_deletion(
|
result = detect_deletion(html_content=html, page_title=title, url=url)
|
||||||
html_content=html,
|
|
||||||
page_title=title,
|
|
||||||
url=url
|
|
||||||
)
|
|
||||||
|
|
||||||
assert result is None
|
assert result is None
|
||||||
|
|
||||||
@@ -133,7 +121,7 @@ class TestFlagAsDeleted:
|
|||||||
"is_deleted": True,
|
"is_deleted": True,
|
||||||
"indicator": "This Tweet is unavailable",
|
"indicator": "This Tweet is unavailable",
|
||||||
"source": "html_content",
|
"source": "html_content",
|
||||||
"platform": "twitter"
|
"platform": "twitter",
|
||||||
}
|
}
|
||||||
|
|
||||||
flag_as_deleted(metadata, deletion_info)
|
flag_as_deleted(metadata, deletion_info)
|
||||||
@@ -151,19 +139,9 @@ class TestFlagAsDeleted:
|
|||||||
"is_deleted": True,
|
"is_deleted": True,
|
||||||
"indicator": "Video has been removed by the uploader",
|
"indicator": "Video has been removed by the uploader",
|
||||||
"source": "error_message",
|
"source": "error_message",
|
||||||
"platform": "youtube"
|
"platform": "youtube",
|
||||||
}
|
}
|
||||||
|
|
||||||
flag_as_deleted(metadata, deletion_info)
|
flag_as_deleted(metadata, deletion_info)
|
||||||
|
|
||||||
# This metadata can now be stored so investigators know:
|
|
||||||
# - The content existed but was deleted
|
|
||||||
# - Exactly what message indicated deletion
|
|
||||||
# - Which platform it was from
|
|
||||||
# - When it was checked (via _processed_at)
|
|
||||||
assert "deletion_indicator" in metadata.metadata
|
assert "deletion_indicator" in metadata.metadata
|
||||||
assert "uploader" in metadata.get("deletion_indicator")
|
assert "uploader" in metadata.get("deletion_indicator")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
pytest.main([__file__, "-v"])
|
|
||||||
|
|||||||
Reference in New Issue
Block a user