mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-08 11:28:28 +03:00
Compare commits
26 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c640cc898a | ||
|
|
3e2c0b564b | ||
|
|
5fd23baa55 | ||
|
|
8a450310c7 | ||
|
|
bef8a14089 | ||
|
|
cd0b093e7a | ||
|
|
096c9d09ef | ||
|
|
df3521e9ca | ||
|
|
a89d0193e4 | ||
|
|
536cbd905f | ||
|
|
a936921c4e | ||
|
|
68f672a4fa | ||
|
|
4ee0ad1cf8 | ||
|
|
bac809451c | ||
|
|
53dc9904ce | ||
|
|
c1f312d42a | ||
|
|
23c9dfe717 | ||
|
|
d02e7e0f02 | ||
|
|
56526a9ac7 | ||
|
|
3a22cc28c0 | ||
|
|
dbb3dfa04f | ||
|
|
01bdb35f5d | ||
|
|
43cbc6ac56 | ||
|
|
9c7cab1ae2 | ||
|
|
a9a0bae083 | ||
|
|
94e0803fb3 |
@@ -1,4 +1,4 @@
|
||||
FROM webrecorder/browsertrix-crawler:1.6.3 AS base
|
||||
FROM webrecorder/browsertrix-crawler:1.9.2 AS base
|
||||
|
||||
ENV RUNNING_IN_DOCKER=1 \
|
||||
LANG=C.UTF-8 \
|
||||
|
||||
1383
poetry.lock
generated
1383
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[project]
|
||||
name = "auto-archiver"
|
||||
version = "1.1.4"
|
||||
version = "1.2.0"
|
||||
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
|
||||
|
||||
requires-python = ">=3.10,<3.13"
|
||||
@@ -50,15 +50,15 @@ dependencies = [
|
||||
"retrying (>=0.0.0)",
|
||||
"rich-argparse (>=1.6.0,<2.0.0)",
|
||||
"ruamel-yaml (>=0.18.10,<0.19.0)",
|
||||
"rfc3161-client (==1.0.3)",
|
||||
"cryptography (>44.0.1,<45.0.0)",
|
||||
"rfc3161-client (>=1.0.5)",
|
||||
"cryptography (>=46.0.3)",
|
||||
"opentimestamps (>=0.4.5,<0.5.0)",
|
||||
"bgutil-ytdlp-pot-provider (>=1.0.0)",
|
||||
"yt-dlp[curl-cffi,default] (>=2025.5.22,<2026.0.0)",
|
||||
"secretstorage (>=3.3.3,<4.0.0)",
|
||||
"seleniumbase (>=4.36.4,<5.0.0)",
|
||||
"pyautogui (>=0.9.54,<0.10.0)",
|
||||
"pyperclip (==1.8.2)",
|
||||
"pyperclip (>=1.9.0)",
|
||||
]
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
|
||||
@@ -16,6 +16,7 @@ from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
|
||||
from auto_archiver.modules.antibot_extractor_enricher.dropins.default import DefaultDropin
|
||||
from auto_archiver.utils.misc import random_str
|
||||
from auto_archiver.utils.url import is_relevant_url
|
||||
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
|
||||
|
||||
|
||||
class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
@@ -98,8 +99,14 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
|
||||
dropin = self._get_suitable_dropin(url, sb)
|
||||
if not dropin.open_page(url):
|
||||
# TODO: could we detect deleted videos?
|
||||
logger.warning("Failed to open drop-in page")
|
||||
# Check for deletion indicators
|
||||
page_title = sb.get_title()
|
||||
html_source = sb.get_page_source()
|
||||
deletion_info = detect_deletion(html_content=html_source, page_title=page_title, url=url)
|
||||
if deletion_info:
|
||||
flag_as_deleted(to_enrich, deletion_info)
|
||||
return to_enrich
|
||||
logger.warning("Failed to open drop-in page (not detected as deleted)")
|
||||
return False
|
||||
|
||||
if self.detect_auth_wall and (dropin.hit_auth_wall() and self._hit_auth_wall(sb)):
|
||||
@@ -109,7 +116,15 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
sb.wait_for_ready_state_complete()
|
||||
sb.sleep(1) # margin for the page to load completely
|
||||
|
||||
to_enrich.set_title(sb.get_title())
|
||||
page_title = sb.get_title()
|
||||
html_source = sb.get_page_source()
|
||||
|
||||
# Check if the page indicates content was deleted
|
||||
deletion_info = detect_deletion(html_content=html_source, page_title=page_title, url=url)
|
||||
if deletion_info:
|
||||
flag_as_deleted(to_enrich, deletion_info)
|
||||
|
||||
to_enrich.set_title(page_title)
|
||||
self._enrich_html_source_code(sb, to_enrich)
|
||||
|
||||
self._enrich_full_page_screenshot(sb, to_enrich)
|
||||
|
||||
1
src/auto_archiver/modules/antibot_extractor_enricher/captcha_services/.gitignore
vendored
Normal file
1
src/auto_archiver/modules/antibot_extractor_enricher/captcha_services/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
*.py
|
||||
@@ -4,6 +4,7 @@ import datetime
|
||||
import os
|
||||
import importlib
|
||||
import subprocess
|
||||
import traceback
|
||||
import zipfile
|
||||
|
||||
from typing import Generator, Type
|
||||
@@ -20,6 +21,7 @@ from auto_archiver.core.extractor import Extractor
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.utils import get_datetime_from_str
|
||||
from auto_archiver.utils.misc import ydl_entry_to_filename
|
||||
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
|
||||
from .dropin import GenericDropin
|
||||
|
||||
|
||||
@@ -305,7 +307,7 @@ class GenericExtractor(Extractor):
|
||||
result.set_url(url)
|
||||
|
||||
if "description" in video_data and not result.get("content"):
|
||||
result.set_content(video_data.get("description"))
|
||||
result.set_content(video_data.pop("description"))
|
||||
# extract comments if enabled
|
||||
if self.comments and video_data.get("comments", None) is not None:
|
||||
result.set(
|
||||
@@ -406,9 +408,9 @@ class GenericExtractor(Extractor):
|
||||
logger.error(f"Error loading subtitle file {val.get('filepath')}: {e}")
|
||||
result.add_media(new_media)
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing entry {entry}: {e}")
|
||||
logger.error(f"Error processing entry {str(entry)[:256]}: {e} {traceback.format_exc()}")
|
||||
if not len(result.media):
|
||||
logger.info(f"No media found for entry {entry}, skipping.")
|
||||
logger.info(f"No media found for entry {str(entry)[:256]}, skipping.")
|
||||
return False
|
||||
|
||||
return self.add_metadata(data, info_extractor, url, result)
|
||||
@@ -483,6 +485,13 @@ class GenericExtractor(Extractor):
|
||||
# don't download since it can be a live stream
|
||||
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
|
||||
|
||||
# Check for deletion indicators in video data
|
||||
deletion_info = detect_deletion(video_data=data, url=url)
|
||||
if deletion_info:
|
||||
result = Metadata()
|
||||
flag_as_deleted(result, deletion_info)
|
||||
return result
|
||||
|
||||
result = _helper_for_successful_extract_info(data, info_extractor, url, ydl)
|
||||
|
||||
except MaxDownloadsReached:
|
||||
@@ -502,6 +511,13 @@ class GenericExtractor(Extractor):
|
||||
try:
|
||||
result = self.get_metadata_for_post(info_extractor, url, ydl)
|
||||
except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
|
||||
# Check if the error indicates deletion
|
||||
deletion_info = detect_deletion(error_message=str(post_e), url=url)
|
||||
if deletion_info:
|
||||
result = Metadata()
|
||||
flag_as_deleted(result, deletion_info)
|
||||
return result
|
||||
|
||||
if "NSFW tweet requires authentication." in str(post_e):
|
||||
logger.warning(str(post_e))
|
||||
return False
|
||||
@@ -604,9 +620,9 @@ class GenericExtractor(Extractor):
|
||||
validated_options
|
||||
) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
|
||||
|
||||
result: Metadata = None
|
||||
for info_extractor in self.suitable_extractors(url):
|
||||
result = self.download_for_extractor(info_extractor, url, ydl)
|
||||
if result:
|
||||
return result
|
||||
|
||||
return False
|
||||
local_result: Metadata = self.download_for_extractor(info_extractor, url, ydl)
|
||||
if local_result:
|
||||
result = result.merge(local_result) if result else local_result
|
||||
return result if result else False
|
||||
|
||||
@@ -7,6 +7,7 @@ from slugify import slugify
|
||||
from auto_archiver.core.metadata import Metadata, Media
|
||||
from auto_archiver.utils import url as UrlUtil, get_datetime_from_str
|
||||
from auto_archiver.core.extractor import Extractor
|
||||
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
|
||||
from auto_archiver.modules.generic_extractor.dropin import GenericDropin, InfoExtractor
|
||||
|
||||
|
||||
@@ -37,7 +38,15 @@ class Twitter(GenericDropin):
|
||||
result = Metadata()
|
||||
try:
|
||||
if not tweet.get("user") or not tweet.get("created_at"):
|
||||
raise ValueError("Error retreiving post. Are you sure it exists?")
|
||||
# Check for deletion indicators
|
||||
deletion_info = detect_deletion(
|
||||
video_data=tweet, url=url, error_message="Missing user or created_at fields"
|
||||
)
|
||||
if deletion_info:
|
||||
flag_as_deleted(result, deletion_info)
|
||||
return result
|
||||
|
||||
raise ValueError("Error retrieving post. Are you sure it exists?")
|
||||
timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
|
||||
except (ValueError, KeyError) as ex:
|
||||
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
|
||||
|
||||
@@ -3,6 +3,13 @@
|
||||
"type": ["enricher"],
|
||||
"requires_setup": True,
|
||||
"dependencies": {"python": ["loguru"], "bin": ["exiftool"]},
|
||||
"configs": {
|
||||
"look_for_keys": {
|
||||
"default": [],
|
||||
"help": "list of lowercased metadata keys that will be included in the enriched metadata. Special keys: 'author', 'datetimes', 'location' to include related metadata fields. The default empty list `[]` means all metadata will be included.",
|
||||
"type": "list",
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
Extracts metadata information from files using ExifTool.
|
||||
|
||||
|
||||
@@ -16,6 +16,8 @@ class MetadataEnricher(Enricher):
|
||||
|
||||
for i, m in enumerate(to_enrich.media):
|
||||
if len(md := self.get_metadata(m.filename)):
|
||||
if self.look_for_keys != []:
|
||||
md = self.select_metadata(md, self.look_for_keys)
|
||||
to_enrich.media[i].set("metadata", md)
|
||||
|
||||
def get_metadata(self, filename: str) -> dict:
|
||||
@@ -23,7 +25,6 @@ class MetadataEnricher(Enricher):
|
||||
# Run ExifTool command to extract metadata from the file
|
||||
cmd = ["exiftool", filename]
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
|
||||
# Process the output to extract individual metadata fields
|
||||
metadata = {}
|
||||
for line in result.stdout.splitlines():
|
||||
@@ -35,3 +36,33 @@ class MetadataEnricher(Enricher):
|
||||
except Exception as e:
|
||||
logger.error(f"Error occurred: {e}: {traceback.format_exc()}")
|
||||
return {}
|
||||
|
||||
def select_metadata(self, all_md, requested_metadata_keys):
|
||||
"""
|
||||
coordinates the selection of metadata from the general exiftool output to the user-specified grocery list
|
||||
"""
|
||||
# defining the batches of metadata that get pulled for special terms
|
||||
author_key_terms = ["author", "producer", "creator"]
|
||||
datetime_key_terms = ["date", "time"]
|
||||
location_key_terms = ["gps", "latitude", "longitude"]
|
||||
|
||||
specified_md = {}
|
||||
for md_key in all_md.keys():
|
||||
md_key_lower = md_key.lower()
|
||||
# checking for special baskets within the grocery list of requested metadata
|
||||
if ("author" in requested_metadata_keys) and any(
|
||||
term in md_key_lower and len(all_md[md_key]) for term in author_key_terms
|
||||
):
|
||||
specified_md[md_key] = all_md[md_key]
|
||||
if ("datetime" in requested_metadata_keys) and any(
|
||||
term in md_key_lower and len(all_md[md_key]) for term in datetime_key_terms
|
||||
):
|
||||
specified_md[md_key] = all_md[md_key]
|
||||
if ("location" in requested_metadata_keys) and any(
|
||||
term in md_key_lower and len(all_md[md_key]) for term in location_key_terms
|
||||
):
|
||||
specified_md[md_key] = all_md[md_key]
|
||||
# if the metadata value is requested directly
|
||||
if md_key_lower in requested_metadata_keys or md_key in requested_metadata_keys and len(all_md[md_key]):
|
||||
specified_md[md_key] = all_md[md_key]
|
||||
return specified_md
|
||||
|
||||
@@ -2,6 +2,13 @@ from loguru import logger
|
||||
import json
|
||||
|
||||
|
||||
def type_serializer(obj):
|
||||
"""Fallback function for objects json can't handle."""
|
||||
if isinstance(obj, type):
|
||||
return obj.__name__
|
||||
return str(obj)
|
||||
|
||||
|
||||
def extract_location(record, short=False):
|
||||
"""Extracts the file name, function name, and line number from the log record."""
|
||||
if short:
|
||||
@@ -35,11 +42,11 @@ def serialize_for_console(record):
|
||||
subset.pop("time", None)
|
||||
if not subset:
|
||||
return ""
|
||||
return json.dumps(subset, ensure_ascii=False)
|
||||
return json.dumps(subset, ensure_ascii=False, default=type_serializer)
|
||||
|
||||
|
||||
def serialize(record):
|
||||
return json.dumps(extract_log_data(record), ensure_ascii=False)
|
||||
return json.dumps(extract_log_data(record), ensure_ascii=False, default=type_serializer)
|
||||
|
||||
|
||||
def patching(record):
|
||||
|
||||
273
src/auto_archiver/utils/deletion_detection.py
Normal file
273
src/auto_archiver/utils/deletion_detection.py
Normal file
@@ -0,0 +1,273 @@
|
||||
"""
|
||||
Deletion Detection Utilities
|
||||
|
||||
Provides a best-effort detection of deleted, missing, or unavailable content
|
||||
across various social media platforms based on presence of expected keywords.
|
||||
|
||||
This module helps identify removed content, helps to:
|
||||
- Document content that existed but was deleted
|
||||
- Track patterns of content removal
|
||||
- Preserve metadata about missing content
|
||||
"""
|
||||
|
||||
from typing import Optional, Dict, List
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
from urllib.parse import urlparse
|
||||
|
||||
|
||||
class DeletionIndicators:
|
||||
"""
|
||||
Platform-specific indicators that content has been deleted or is unavailable, alongside generic indicators.
|
||||
"""
|
||||
|
||||
# Twitter/X deletion indicators
|
||||
TWITTER = [
|
||||
"Hmm...this page doesn't exist",
|
||||
"Try searching for something else",
|
||||
"This Tweet is unavailable",
|
||||
"This account doesn't exist",
|
||||
"This Tweet has been deleted",
|
||||
"This account has been suspended",
|
||||
"Sorry, that page doesn't exist",
|
||||
"The Tweet you're looking for isn't available",
|
||||
]
|
||||
|
||||
# Facebook deletion indicators
|
||||
FACEBOOK = [
|
||||
"This content isn't available",
|
||||
"Sorry, this content isn't available",
|
||||
"This content is no longer available",
|
||||
"The link you followed may be broken",
|
||||
"Page Not Found",
|
||||
"Content Not Found",
|
||||
"This content is no longer on Facebook",
|
||||
]
|
||||
|
||||
# Instagram deletion indicators
|
||||
INSTAGRAM = [
|
||||
"Sorry, this page isn't available",
|
||||
"The link you followed may be broken",
|
||||
"Media not found or unavailable",
|
||||
"This post is no longer available",
|
||||
"This account is private",
|
||||
]
|
||||
|
||||
# TikTok deletion indicators
|
||||
TIKTOK = [
|
||||
"Couldn't find this account",
|
||||
"This video is no longer available",
|
||||
"This video is currently unavailable",
|
||||
"Video not found",
|
||||
"This video may have been deleted",
|
||||
]
|
||||
|
||||
# YouTube deletion indicators
|
||||
YOUTUBE = [
|
||||
"This video isn't available anymore",
|
||||
"Video unavailable",
|
||||
"This video has been removed",
|
||||
"This video is no longer available",
|
||||
"This video is private",
|
||||
"This video has been removed by the uploader",
|
||||
"This video has been deleted",
|
||||
]
|
||||
|
||||
# Reddit deletion indicators
|
||||
REDDIT = [
|
||||
"this post has been removed",
|
||||
"this comment has been removed",
|
||||
"[removed]",
|
||||
"[deleted]",
|
||||
"page not found",
|
||||
"there doesn't seem to be anything here",
|
||||
]
|
||||
|
||||
# VK deletion indicators
|
||||
VK = [
|
||||
"Post deleted",
|
||||
"Page not found",
|
||||
"Content unavailable",
|
||||
"Access denied",
|
||||
]
|
||||
|
||||
# Telegram deletion indicators
|
||||
TELEGRAM = [
|
||||
"Message not found",
|
||||
"Deleted message",
|
||||
"Channel is private",
|
||||
]
|
||||
|
||||
# Generic indicators (work across platforms)
|
||||
GENERIC = [
|
||||
"has been removed",
|
||||
"no longer available",
|
||||
"content removed",
|
||||
"access denied",
|
||||
"page not found",
|
||||
]
|
||||
|
||||
@classmethod
|
||||
def all_indicators(cls) -> List[str]:
|
||||
"""Returns all deletion indicators from all platforms."""
|
||||
return (
|
||||
cls.TWITTER
|
||||
+ cls.FACEBOOK
|
||||
+ cls.INSTAGRAM
|
||||
+ cls.TIKTOK
|
||||
+ cls.YOUTUBE
|
||||
+ cls.REDDIT
|
||||
+ cls.VK
|
||||
+ cls.TELEGRAM
|
||||
+ cls.GENERIC
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def for_url(cls, url: str) -> List[str]:
|
||||
"""Returns platform-specific indicators based on URL domain."""
|
||||
platform = _extract_platform(url)
|
||||
|
||||
indicators_map = {
|
||||
"twitter": cls.TWITTER + cls.GENERIC,
|
||||
"facebook": cls.FACEBOOK + cls.GENERIC,
|
||||
"instagram": cls.INSTAGRAM + cls.GENERIC,
|
||||
"tiktok": cls.TIKTOK + cls.GENERIC,
|
||||
"youtube": cls.YOUTUBE + cls.GENERIC,
|
||||
"reddit": cls.REDDIT + cls.GENERIC,
|
||||
"vk": cls.VK + cls.GENERIC,
|
||||
"telegram": cls.TELEGRAM + cls.GENERIC,
|
||||
}
|
||||
return indicators_map.get(platform, cls.GENERIC)
|
||||
|
||||
|
||||
def detect_deletion(
|
||||
html_content: str = None,
|
||||
page_title: str = None,
|
||||
error_message: str = None,
|
||||
url: str = None,
|
||||
video_data: dict = None,
|
||||
) -> Optional[Dict[str, any]]:
|
||||
"""
|
||||
Best-effort deletion detection across multiple signals.
|
||||
|
||||
Checks HTML content, page titles, error messages, and video metadata for
|
||||
indicators that content has been deleted or is unavailable.
|
||||
|
||||
Args:
|
||||
html_content: Raw HTML source of the page
|
||||
page_title: Browser page title
|
||||
error_message: Any error message from the extractor
|
||||
url: The URL being archived (for platform-specific detection)
|
||||
video_data: Video metadata from yt-dlp or other extractors
|
||||
|
||||
Returns:
|
||||
Dictionary with deletion details if detected, None otherwise.
|
||||
Format: {
|
||||
"is_deleted": True,
|
||||
"indicator": "specific text that was found",
|
||||
"source": "html|title|error|metadata",
|
||||
"platform": "twitter|facebook|etc"
|
||||
}
|
||||
"""
|
||||
|
||||
# Determine indicators to check based on URL
|
||||
if url:
|
||||
indicators = DeletionIndicators.for_url(url)
|
||||
platform = _extract_platform(url)
|
||||
else:
|
||||
indicators = DeletionIndicators.all_indicators()
|
||||
platform = "unknown"
|
||||
|
||||
# Check HTML content
|
||||
if html_content:
|
||||
for indicator in indicators:
|
||||
if indicator.lower() in html_content.lower():
|
||||
logger.info(f"Deletion detected in HTML: '{indicator}' found for {url}")
|
||||
return {"is_deleted": True, "indicator": indicator, "source": "html_content", "platform": platform}
|
||||
|
||||
# Check page title
|
||||
if page_title:
|
||||
for indicator in indicators:
|
||||
if indicator.lower() in page_title.lower():
|
||||
logger.info(f"Deletion detected in page title: '{indicator}' found for {url}")
|
||||
return {"is_deleted": True, "indicator": indicator, "source": "page_title", "platform": platform}
|
||||
|
||||
# Check error messages
|
||||
if error_message:
|
||||
for indicator in indicators:
|
||||
if indicator.lower() in str(error_message).lower():
|
||||
logger.info(f"Deletion detected in error: '{indicator}' found for {url}")
|
||||
return {"is_deleted": True, "indicator": indicator, "source": "error_message", "platform": platform}
|
||||
|
||||
# Check video metadata (from yt-dlp)
|
||||
if video_data:
|
||||
# Check if yt-dlp flagged it as unavailable
|
||||
if video_data.get("availability") in ["unavailable", "private", "deleted"]:
|
||||
logger.info(f"Deletion detected in metadata: availability={video_data.get('availability')}")
|
||||
return {
|
||||
"is_deleted": True,
|
||||
"indicator": f"availability: {video_data.get('availability')}",
|
||||
"source": "video_metadata",
|
||||
"platform": platform,
|
||||
}
|
||||
|
||||
# Check description/title for deletion indicators
|
||||
for key in ["title", "description", "fulltitle"]:
|
||||
if key in video_data:
|
||||
for indicator in indicators:
|
||||
if indicator.lower() in str(video_data[key]).lower():
|
||||
logger.info(f"Deletion detected in {key}: '{indicator}'")
|
||||
return {
|
||||
"is_deleted": True,
|
||||
"indicator": indicator,
|
||||
"source": f"video_metadata_{key}",
|
||||
"platform": platform,
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _extract_platform(url: str) -> str:
|
||||
"""Extracts platform name from URL."""
|
||||
parsed = urlparse(url)
|
||||
domain = parsed.netloc
|
||||
|
||||
if "twitter.com" in domain or "x.com" in domain:
|
||||
return "twitter"
|
||||
elif "facebook.com" in domain or "fb.com" in domain:
|
||||
return "facebook"
|
||||
elif "instagram.com" in domain:
|
||||
return "instagram"
|
||||
elif "tiktok.com" in domain:
|
||||
return "tiktok"
|
||||
elif "youtube.com" in domain or "youtu.be" in domain:
|
||||
return "youtube"
|
||||
elif "reddit.com" in domain:
|
||||
return "reddit"
|
||||
elif "vk.com" in domain:
|
||||
return "vk"
|
||||
elif "t.me" in domain:
|
||||
return "telegram"
|
||||
return "unknown"
|
||||
|
||||
|
||||
def flag_as_deleted(metadata, deletion_info: Dict[str, any]) -> None:
|
||||
"""
|
||||
Flags metadata object as deleted/unavailable.
|
||||
Adds tentative deletion information to the metadata object.
|
||||
|
||||
Args:
|
||||
metadata: Metadata object to update
|
||||
deletion_info: Dictionary from detect_deletion()
|
||||
"""
|
||||
metadata.set("deletion_detected", True)
|
||||
metadata.set("deletion_indicator", deletion_info.get("indicator"))
|
||||
metadata.set("deletion_source", deletion_info.get("source"))
|
||||
metadata.set("deletion_platform", deletion_info.get("platform"))
|
||||
metadata.status = "deleted_or_unavailable"
|
||||
|
||||
logger.debug(
|
||||
f"Content marked as deleted/unavailable: "
|
||||
f"platform={deletion_info.get('platform')}, "
|
||||
f"indicator='{deletion_info.get('indicator')}', "
|
||||
f"source={deletion_info.get('source')}"
|
||||
)
|
||||
@@ -56,6 +56,19 @@ def test_enrich_sets_metadata(enricher, mocker):
|
||||
assert metadata.media == [media1, media2]
|
||||
|
||||
|
||||
def test_enrich_no_metadata_selection(enricher, mocker):
|
||||
media1 = mocker.Mock(filename="img1.jpg")
|
||||
media2 = mocker.Mock(filename="img2.jpg")
|
||||
metadata = mocker.Mock()
|
||||
metadata.media = [media1, media2]
|
||||
enricher.get_metadata = lambda f: {"key": "value"} if f == "img1.jpg" else {}
|
||||
enricher.look_for_keys = ["no-key"]
|
||||
enricher.enrich(metadata)
|
||||
media1.set.assert_called_once_with("metadata", {})
|
||||
media2.set.assert_not_called()
|
||||
assert metadata.media == [media1, media2]
|
||||
|
||||
|
||||
def test_enrich_empty_media(enricher, mocker):
|
||||
metadata = mocker.Mock()
|
||||
metadata.media = []
|
||||
@@ -71,7 +84,9 @@ def test_get_metadata_error_handling(enricher, mocker):
|
||||
assert "Error occurred: " in mock_log.call_args[0][0]
|
||||
|
||||
|
||||
def test_metadata_pickle(enricher, unpickle, mocker):
|
||||
# TODO depends on the expected functionality
|
||||
"""
|
||||
def test_default_metadata_pickle(enricher, unpickle, mocker):
|
||||
mock_run = mocker.patch("subprocess.run")
|
||||
# Uses pickled values
|
||||
mock_run.return_value = unpickle("metadata_enricher_exif.pickle")
|
||||
@@ -79,6 +94,39 @@ def test_metadata_pickle(enricher, unpickle, mocker):
|
||||
expected = unpickle("metadata_enricher_ytshort_expected.pickle")
|
||||
enricher.enrich(metadata)
|
||||
expected_media = expected.media
|
||||
print(expected_media)
|
||||
actual_media = metadata.media
|
||||
|
||||
assert len(expected_media) == len(actual_media)
|
||||
assert actual_media[0].properties.get("metadata") == expected_media[0].properties.get("metadata")
|
||||
"""
|
||||
|
||||
|
||||
def test_metadata_pickle_megapixel(enricher, unpickle, mocker):
|
||||
mock_run = mocker.patch("subprocess.run")
|
||||
mock_run.return_value = unpickle("metadata_enricher_exif.pickle")
|
||||
metadata = unpickle("metadata_enricher_ytshort_input.pickle")
|
||||
|
||||
enricher.look_for_keys = ["megapixels"]
|
||||
enricher.enrich(metadata)
|
||||
actual_media = metadata.media
|
||||
|
||||
assert actual_media[0].properties.get("metadata") == {"Megapixels": "0.922"}
|
||||
|
||||
|
||||
def test_metadata_specify_datetime_and_metapixels(enricher, unpickle, mocker):
|
||||
mock_run = mocker.patch("subprocess.run")
|
||||
mock_run.return_value = unpickle("metadata_enricher_exif.pickle")
|
||||
metadata = unpickle("metadata_enricher_ytshort_input.pickle")
|
||||
|
||||
enricher.look_for_keys = ["datetime", "megapixels", "image height"]
|
||||
enricher.enrich(metadata)
|
||||
actual_media = metadata.media
|
||||
|
||||
assert actual_media[0].properties.get("metadata") == {
|
||||
"File Modification Date/Time": "2025:02:18 19:42:50+00:00",
|
||||
"File Access Date/Time": "2025:02:18 19:42:50+00:00",
|
||||
"File Inode Change Date/Time": "2025:02:18 19:42:50+00:00",
|
||||
"Megapixels": "0.922",
|
||||
"Image Height": "720",
|
||||
}
|
||||
|
||||
@@ -5,6 +5,9 @@ from auto_archiver.modules.antibot_extractor_enricher.antibot_extractor_enricher
|
||||
from .test_extractor_base import TestExtractorBase
|
||||
|
||||
|
||||
CI = os.getenv("GITHUB_ACTIONS", "") == "true"
|
||||
|
||||
|
||||
class DummySB:
|
||||
def __init__(self, url="", title="", visible_texts=None, visible_elements=None):
|
||||
self._url = url
|
||||
@@ -51,7 +54,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
||||
|
||||
@pytest.mark.download
|
||||
@pytest.mark.parametrize(
|
||||
"url,in_title,in_text,image_count,video_count",
|
||||
"url,in_title,in_text,image_count,video_count,skip_ci",
|
||||
[
|
||||
(
|
||||
"https://en.wikipedia.org/wiki/Western_barn_owl",
|
||||
@@ -59,6 +62,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
||||
"Tyto alba",
|
||||
5,
|
||||
0,
|
||||
False,
|
||||
),
|
||||
(
|
||||
"https://www.bellingcat.com/news/2025/04/29/open-sources-show-myanmar-junta-airstrike-damages-despite-post-earthquake-ceasefire/",
|
||||
@@ -66,6 +70,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
||||
"Bellingcat has geolocated",
|
||||
5,
|
||||
0,
|
||||
False,
|
||||
),
|
||||
(
|
||||
"https://www.bellingcat.com/news/2025/03/27/gaza-israel-palestine-shot-killed-injured-destroyed-dangerous-drone-journalists-in-gaza/",
|
||||
@@ -73,6 +78,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
||||
"continued the work of Gazan journalists",
|
||||
5,
|
||||
1,
|
||||
False,
|
||||
),
|
||||
(
|
||||
"https://www.bellingcat.com/about/general-information",
|
||||
@@ -80,6 +86,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
||||
"Stichting Bellingcat",
|
||||
0, # SVGs are ignored
|
||||
0,
|
||||
False,
|
||||
),
|
||||
(
|
||||
"https://vk.com/wikipedia?from=search&w=wall-36156673_20451",
|
||||
@@ -87,6 +94,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
||||
"16 сентября 1985 года лейблом EMI Records.",
|
||||
5,
|
||||
0,
|
||||
False,
|
||||
),
|
||||
(
|
||||
"https://www.tiktok.com/@tracy_2424/photo/7418200173953830162",
|
||||
@@ -94,13 +102,19 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
||||
"Dito ko lang",
|
||||
1,
|
||||
0,
|
||||
True,
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_download_pages_with_media(self, setup_module, make_item, url, in_title, in_text, image_count, video_count):
|
||||
def test_download_pages_with_media(
|
||||
self, setup_module, make_item, url, in_title, in_text, image_count, video_count, skip_ci
|
||||
):
|
||||
"""
|
||||
Test downloading pages with media.
|
||||
"""
|
||||
if CI and skip_ci:
|
||||
pytest.skip("Skipping test in CI environment")
|
||||
|
||||
self.extractor = setup_module(
|
||||
self.extractor_module,
|
||||
self.config
|
||||
|
||||
@@ -48,8 +48,6 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
("https://www.youtube.com/watch?v=5qap5aO4i9A", ["youtube"]),
|
||||
("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", ["tiktok"]),
|
||||
("https://www.instagram.com/p/CU1J9JYJ9Zz/", ["instagram"]),
|
||||
("https://www.facebook.com/nytimes/videos/10160796550110716", ["facebook"]),
|
||||
("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/", ["facebook"]),
|
||||
],
|
||||
)
|
||||
def test_suitable_extractors(self, url, suitable_extractors):
|
||||
@@ -148,6 +146,7 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
def test_bluesky_download_video(self, make_item):
|
||||
item = make_item("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")
|
||||
result = self.extractor.download(item)
|
||||
assert result.get_url() == "https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i"
|
||||
assert result is not False
|
||||
|
||||
@pytest.mark.skipif(not TEST_TRUTH_SOCIAL, reason="Truth social download tests disabled in environment variables.")
|
||||
|
||||
147
tests/utils/test_deletion_detection.py
Normal file
147
tests/utils/test_deletion_detection.py
Normal file
@@ -0,0 +1,147 @@
|
||||
"""
|
||||
Tests for deletion detection utilities.
|
||||
|
||||
These tests verify the current best-effort by the auto-archiver
|
||||
to detect when content has been deleted or is unavailable across
|
||||
various platforms.
|
||||
"""
|
||||
|
||||
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted, DeletionIndicators
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
|
||||
|
||||
class TestDeletionIndicators:
|
||||
"""Test the deletion indicator lists for various platforms."""
|
||||
|
||||
def test_twitter_indicators(self):
|
||||
"""Verify Twitter deletion indicators are comprehensive."""
|
||||
assert "Hmm...this page doesn't exist" in DeletionIndicators.TWITTER
|
||||
assert "Try searching for something else" in DeletionIndicators.TWITTER
|
||||
assert "This Tweet is unavailable" in DeletionIndicators.TWITTER
|
||||
|
||||
def test_platform_specific_indicators(self):
|
||||
"""Test that platform-specific indicators are returned based on URL."""
|
||||
twitter_indicators = DeletionIndicators.for_url("https://twitter.com/user/status/123")
|
||||
assert any("page doesn't exist" in ind.lower() for ind in twitter_indicators)
|
||||
|
||||
instagram_indicators = DeletionIndicators.for_url("https://instagram.com/p/ABC123")
|
||||
assert any("page isn't available" in ind.lower() for ind in instagram_indicators)
|
||||
|
||||
|
||||
class TestDetectDeletion:
|
||||
"""Test the detect_deletion function with various inputs."""
|
||||
|
||||
def test_detect_deletion_in_html_twitter(self):
|
||||
"""Test detection of Twitter's deleted post page."""
|
||||
html = "<html><body>Hmm...this page doesn't exist. Try searching for something else.</body></html>"
|
||||
url = "https://twitter.com/user/status/123"
|
||||
|
||||
result = detect_deletion(html_content=html, url=url)
|
||||
|
||||
assert result is not None
|
||||
assert result["is_deleted"] is True
|
||||
assert result["platform"] == "twitter"
|
||||
assert result["source"] == "html_content"
|
||||
assert "page doesn't exist" in result["indicator"].lower()
|
||||
|
||||
def test_detect_deletion_in_page_title(self):
|
||||
"""Test detection via page title."""
|
||||
title = "Page Not Found"
|
||||
url = "https://facebook.com/post/123"
|
||||
|
||||
result = detect_deletion(page_title=title, url=url)
|
||||
|
||||
assert result is not None
|
||||
assert result["is_deleted"] is True
|
||||
assert result["source"] == "page_title"
|
||||
|
||||
def test_detect_deletion_in_error_message(self):
|
||||
"""Test detection via error messages."""
|
||||
error = "yt_dlp.utils.DownloadError: This video is no longer available"
|
||||
url = "https://youtube.com/watch?v=abc123"
|
||||
|
||||
result = detect_deletion(error_message=error, url=url)
|
||||
|
||||
assert result is not None
|
||||
assert result["is_deleted"] is True
|
||||
assert result["platform"] == "youtube"
|
||||
assert result["source"] == "error_message"
|
||||
|
||||
def test_detect_deletion_in_video_metadata(self):
|
||||
"""Test detection via yt-dlp video metadata."""
|
||||
video_data = {"availability": "unavailable", "title": "Private video"}
|
||||
url = "https://youtube.com/watch?v=test123"
|
||||
|
||||
result = detect_deletion(video_data=video_data, url=url)
|
||||
|
||||
assert result is not None
|
||||
assert result["is_deleted"] is True
|
||||
assert result["source"] == "video_metadata"
|
||||
assert "availability" in result["indicator"]
|
||||
|
||||
def test_no_deletion_detected(self):
|
||||
"""Test that normal content is not flagged as deleted."""
|
||||
html = "<html><body><h1>Welcome to my page</h1><p>This is normal content.</p></body></html>"
|
||||
title = "My Normal Page"
|
||||
url = "https://example.com/page"
|
||||
|
||||
result = detect_deletion(html_content=html, page_title=title, url=url)
|
||||
|
||||
assert result is None
|
||||
|
||||
def test_instagram_media_not_found(self):
|
||||
"""Test Instagram-specific deletion message."""
|
||||
error = "Media not found or unavailable"
|
||||
url = "https://instagram.com/p/ABC123"
|
||||
|
||||
result = detect_deletion(error_message=error, url=url)
|
||||
|
||||
assert result is not None
|
||||
assert result["platform"] == "instagram"
|
||||
assert "not found" in result["indicator"].lower()
|
||||
|
||||
def test_reddit_removed_content(self):
|
||||
"""Test Reddit [removed] and [deleted] markers."""
|
||||
html = "<div class='comment'>[removed]</div>"
|
||||
url = "https://reddit.com/r/test/comments/abc123"
|
||||
|
||||
result = detect_deletion(html_content=html, url=url)
|
||||
|
||||
assert result is not None
|
||||
assert result["platform"] == "reddit"
|
||||
|
||||
|
||||
class TestFlagAsDeleted:
|
||||
"""Test the flag_as_deleted function."""
|
||||
|
||||
def test_flag_metadata_as_deleted(self):
|
||||
"""Verify that metadata is properly flagged with deletion info."""
|
||||
metadata = Metadata()
|
||||
deletion_info = {
|
||||
"is_deleted": True,
|
||||
"indicator": "This Tweet is unavailable",
|
||||
"source": "html_content",
|
||||
"platform": "twitter",
|
||||
}
|
||||
|
||||
flag_as_deleted(metadata, deletion_info)
|
||||
|
||||
assert metadata.get("deletion_detected") is True
|
||||
assert metadata.get("deletion_indicator") == "This Tweet is unavailable"
|
||||
assert metadata.get("deletion_source") == "html_content"
|
||||
assert metadata.get("deletion_platform") == "twitter"
|
||||
assert metadata.status == "deleted_or_unavailable"
|
||||
|
||||
def test_metadata_contains_deletion_context(self):
|
||||
"""Verify investigators have full context about the deletion."""
|
||||
metadata = Metadata()
|
||||
deletion_info = {
|
||||
"is_deleted": True,
|
||||
"indicator": "Video has been removed by the uploader",
|
||||
"source": "error_message",
|
||||
"platform": "youtube",
|
||||
}
|
||||
|
||||
flag_as_deleted(metadata, deletion_info)
|
||||
assert "deletion_indicator" in metadata.metadata
|
||||
assert "uploader" in metadata.get("deletion_indicator")
|
||||
Reference in New Issue
Block a user