mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 05:08:28 +03:00
Compare commits
26 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c640cc898a | ||
|
|
3e2c0b564b | ||
|
|
5fd23baa55 | ||
|
|
8a450310c7 | ||
|
|
bef8a14089 | ||
|
|
cd0b093e7a | ||
|
|
096c9d09ef | ||
|
|
df3521e9ca | ||
|
|
a89d0193e4 | ||
|
|
536cbd905f | ||
|
|
a936921c4e | ||
|
|
68f672a4fa | ||
|
|
4ee0ad1cf8 | ||
|
|
bac809451c | ||
|
|
53dc9904ce | ||
|
|
c1f312d42a | ||
|
|
23c9dfe717 | ||
|
|
d02e7e0f02 | ||
|
|
56526a9ac7 | ||
|
|
3a22cc28c0 | ||
|
|
dbb3dfa04f | ||
|
|
01bdb35f5d | ||
|
|
43cbc6ac56 | ||
|
|
9c7cab1ae2 | ||
|
|
a9a0bae083 | ||
|
|
94e0803fb3 |
@@ -1,4 +1,4 @@
|
|||||||
FROM webrecorder/browsertrix-crawler:1.6.3 AS base
|
FROM webrecorder/browsertrix-crawler:1.9.2 AS base
|
||||||
|
|
||||||
ENV RUNNING_IN_DOCKER=1 \
|
ENV RUNNING_IN_DOCKER=1 \
|
||||||
LANG=C.UTF-8 \
|
LANG=C.UTF-8 \
|
||||||
|
|||||||
1383
poetry.lock
generated
1383
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
|||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "auto-archiver"
|
name = "auto-archiver"
|
||||||
version = "1.1.4"
|
version = "1.2.0"
|
||||||
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
|
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
|
||||||
|
|
||||||
requires-python = ">=3.10,<3.13"
|
requires-python = ">=3.10,<3.13"
|
||||||
@@ -50,15 +50,15 @@ dependencies = [
|
|||||||
"retrying (>=0.0.0)",
|
"retrying (>=0.0.0)",
|
||||||
"rich-argparse (>=1.6.0,<2.0.0)",
|
"rich-argparse (>=1.6.0,<2.0.0)",
|
||||||
"ruamel-yaml (>=0.18.10,<0.19.0)",
|
"ruamel-yaml (>=0.18.10,<0.19.0)",
|
||||||
"rfc3161-client (==1.0.3)",
|
"rfc3161-client (>=1.0.5)",
|
||||||
"cryptography (>44.0.1,<45.0.0)",
|
"cryptography (>=46.0.3)",
|
||||||
"opentimestamps (>=0.4.5,<0.5.0)",
|
"opentimestamps (>=0.4.5,<0.5.0)",
|
||||||
"bgutil-ytdlp-pot-provider (>=1.0.0)",
|
"bgutil-ytdlp-pot-provider (>=1.0.0)",
|
||||||
"yt-dlp[curl-cffi,default] (>=2025.5.22,<2026.0.0)",
|
"yt-dlp[curl-cffi,default] (>=2025.5.22,<2026.0.0)",
|
||||||
"secretstorage (>=3.3.3,<4.0.0)",
|
"secretstorage (>=3.3.3,<4.0.0)",
|
||||||
"seleniumbase (>=4.36.4,<5.0.0)",
|
"seleniumbase (>=4.36.4,<5.0.0)",
|
||||||
"pyautogui (>=0.9.54,<0.10.0)",
|
"pyautogui (>=0.9.54,<0.10.0)",
|
||||||
"pyperclip (==1.8.2)",
|
"pyperclip (>=1.9.0)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
|
|||||||
from auto_archiver.modules.antibot_extractor_enricher.dropins.default import DefaultDropin
|
from auto_archiver.modules.antibot_extractor_enricher.dropins.default import DefaultDropin
|
||||||
from auto_archiver.utils.misc import random_str
|
from auto_archiver.utils.misc import random_str
|
||||||
from auto_archiver.utils.url import is_relevant_url
|
from auto_archiver.utils.url import is_relevant_url
|
||||||
|
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
|
||||||
|
|
||||||
|
|
||||||
class AntibotExtractorEnricher(Extractor, Enricher):
|
class AntibotExtractorEnricher(Extractor, Enricher):
|
||||||
@@ -98,8 +99,14 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||||||
|
|
||||||
dropin = self._get_suitable_dropin(url, sb)
|
dropin = self._get_suitable_dropin(url, sb)
|
||||||
if not dropin.open_page(url):
|
if not dropin.open_page(url):
|
||||||
# TODO: could we detect deleted videos?
|
# Check for deletion indicators
|
||||||
logger.warning("Failed to open drop-in page")
|
page_title = sb.get_title()
|
||||||
|
html_source = sb.get_page_source()
|
||||||
|
deletion_info = detect_deletion(html_content=html_source, page_title=page_title, url=url)
|
||||||
|
if deletion_info:
|
||||||
|
flag_as_deleted(to_enrich, deletion_info)
|
||||||
|
return to_enrich
|
||||||
|
logger.warning("Failed to open drop-in page (not detected as deleted)")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if self.detect_auth_wall and (dropin.hit_auth_wall() and self._hit_auth_wall(sb)):
|
if self.detect_auth_wall and (dropin.hit_auth_wall() and self._hit_auth_wall(sb)):
|
||||||
@@ -109,7 +116,15 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||||||
sb.wait_for_ready_state_complete()
|
sb.wait_for_ready_state_complete()
|
||||||
sb.sleep(1) # margin for the page to load completely
|
sb.sleep(1) # margin for the page to load completely
|
||||||
|
|
||||||
to_enrich.set_title(sb.get_title())
|
page_title = sb.get_title()
|
||||||
|
html_source = sb.get_page_source()
|
||||||
|
|
||||||
|
# Check if the page indicates content was deleted
|
||||||
|
deletion_info = detect_deletion(html_content=html_source, page_title=page_title, url=url)
|
||||||
|
if deletion_info:
|
||||||
|
flag_as_deleted(to_enrich, deletion_info)
|
||||||
|
|
||||||
|
to_enrich.set_title(page_title)
|
||||||
self._enrich_html_source_code(sb, to_enrich)
|
self._enrich_html_source_code(sb, to_enrich)
|
||||||
|
|
||||||
self._enrich_full_page_screenshot(sb, to_enrich)
|
self._enrich_full_page_screenshot(sb, to_enrich)
|
||||||
|
|||||||
1
src/auto_archiver/modules/antibot_extractor_enricher/captcha_services/.gitignore
vendored
Normal file
1
src/auto_archiver/modules/antibot_extractor_enricher/captcha_services/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
*.py
|
||||||
@@ -4,6 +4,7 @@ import datetime
|
|||||||
import os
|
import os
|
||||||
import importlib
|
import importlib
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import traceback
|
||||||
import zipfile
|
import zipfile
|
||||||
|
|
||||||
from typing import Generator, Type
|
from typing import Generator, Type
|
||||||
@@ -20,6 +21,7 @@ from auto_archiver.core.extractor import Extractor
|
|||||||
from auto_archiver.core import Metadata, Media
|
from auto_archiver.core import Metadata, Media
|
||||||
from auto_archiver.utils import get_datetime_from_str
|
from auto_archiver.utils import get_datetime_from_str
|
||||||
from auto_archiver.utils.misc import ydl_entry_to_filename
|
from auto_archiver.utils.misc import ydl_entry_to_filename
|
||||||
|
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
|
||||||
from .dropin import GenericDropin
|
from .dropin import GenericDropin
|
||||||
|
|
||||||
|
|
||||||
@@ -305,7 +307,7 @@ class GenericExtractor(Extractor):
|
|||||||
result.set_url(url)
|
result.set_url(url)
|
||||||
|
|
||||||
if "description" in video_data and not result.get("content"):
|
if "description" in video_data and not result.get("content"):
|
||||||
result.set_content(video_data.get("description"))
|
result.set_content(video_data.pop("description"))
|
||||||
# extract comments if enabled
|
# extract comments if enabled
|
||||||
if self.comments and video_data.get("comments", None) is not None:
|
if self.comments and video_data.get("comments", None) is not None:
|
||||||
result.set(
|
result.set(
|
||||||
@@ -406,9 +408,9 @@ class GenericExtractor(Extractor):
|
|||||||
logger.error(f"Error loading subtitle file {val.get('filepath')}: {e}")
|
logger.error(f"Error loading subtitle file {val.get('filepath')}: {e}")
|
||||||
result.add_media(new_media)
|
result.add_media(new_media)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error processing entry {entry}: {e}")
|
logger.error(f"Error processing entry {str(entry)[:256]}: {e} {traceback.format_exc()}")
|
||||||
if not len(result.media):
|
if not len(result.media):
|
||||||
logger.info(f"No media found for entry {entry}, skipping.")
|
logger.info(f"No media found for entry {str(entry)[:256]}, skipping.")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
return self.add_metadata(data, info_extractor, url, result)
|
return self.add_metadata(data, info_extractor, url, result)
|
||||||
@@ -483,6 +485,13 @@ class GenericExtractor(Extractor):
|
|||||||
# don't download since it can be a live stream
|
# don't download since it can be a live stream
|
||||||
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
|
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
|
||||||
|
|
||||||
|
# Check for deletion indicators in video data
|
||||||
|
deletion_info = detect_deletion(video_data=data, url=url)
|
||||||
|
if deletion_info:
|
||||||
|
result = Metadata()
|
||||||
|
flag_as_deleted(result, deletion_info)
|
||||||
|
return result
|
||||||
|
|
||||||
result = _helper_for_successful_extract_info(data, info_extractor, url, ydl)
|
result = _helper_for_successful_extract_info(data, info_extractor, url, ydl)
|
||||||
|
|
||||||
except MaxDownloadsReached:
|
except MaxDownloadsReached:
|
||||||
@@ -502,6 +511,13 @@ class GenericExtractor(Extractor):
|
|||||||
try:
|
try:
|
||||||
result = self.get_metadata_for_post(info_extractor, url, ydl)
|
result = self.get_metadata_for_post(info_extractor, url, ydl)
|
||||||
except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
|
except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
|
||||||
|
# Check if the error indicates deletion
|
||||||
|
deletion_info = detect_deletion(error_message=str(post_e), url=url)
|
||||||
|
if deletion_info:
|
||||||
|
result = Metadata()
|
||||||
|
flag_as_deleted(result, deletion_info)
|
||||||
|
return result
|
||||||
|
|
||||||
if "NSFW tweet requires authentication." in str(post_e):
|
if "NSFW tweet requires authentication." in str(post_e):
|
||||||
logger.warning(str(post_e))
|
logger.warning(str(post_e))
|
||||||
return False
|
return False
|
||||||
@@ -604,9 +620,9 @@ class GenericExtractor(Extractor):
|
|||||||
validated_options
|
validated_options
|
||||||
) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
|
) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
|
||||||
|
|
||||||
|
result: Metadata = None
|
||||||
for info_extractor in self.suitable_extractors(url):
|
for info_extractor in self.suitable_extractors(url):
|
||||||
result = self.download_for_extractor(info_extractor, url, ydl)
|
local_result: Metadata = self.download_for_extractor(info_extractor, url, ydl)
|
||||||
if result:
|
if local_result:
|
||||||
return result
|
result = result.merge(local_result) if result else local_result
|
||||||
|
return result if result else False
|
||||||
return False
|
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ from slugify import slugify
|
|||||||
from auto_archiver.core.metadata import Metadata, Media
|
from auto_archiver.core.metadata import Metadata, Media
|
||||||
from auto_archiver.utils import url as UrlUtil, get_datetime_from_str
|
from auto_archiver.utils import url as UrlUtil, get_datetime_from_str
|
||||||
from auto_archiver.core.extractor import Extractor
|
from auto_archiver.core.extractor import Extractor
|
||||||
|
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
|
||||||
from auto_archiver.modules.generic_extractor.dropin import GenericDropin, InfoExtractor
|
from auto_archiver.modules.generic_extractor.dropin import GenericDropin, InfoExtractor
|
||||||
|
|
||||||
|
|
||||||
@@ -37,7 +38,15 @@ class Twitter(GenericDropin):
|
|||||||
result = Metadata()
|
result = Metadata()
|
||||||
try:
|
try:
|
||||||
if not tweet.get("user") or not tweet.get("created_at"):
|
if not tweet.get("user") or not tweet.get("created_at"):
|
||||||
raise ValueError("Error retreiving post. Are you sure it exists?")
|
# Check for deletion indicators
|
||||||
|
deletion_info = detect_deletion(
|
||||||
|
video_data=tweet, url=url, error_message="Missing user or created_at fields"
|
||||||
|
)
|
||||||
|
if deletion_info:
|
||||||
|
flag_as_deleted(result, deletion_info)
|
||||||
|
return result
|
||||||
|
|
||||||
|
raise ValueError("Error retrieving post. Are you sure it exists?")
|
||||||
timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
|
timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
|
||||||
except (ValueError, KeyError) as ex:
|
except (ValueError, KeyError) as ex:
|
||||||
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
|
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
|
||||||
|
|||||||
@@ -3,6 +3,13 @@
|
|||||||
"type": ["enricher"],
|
"type": ["enricher"],
|
||||||
"requires_setup": True,
|
"requires_setup": True,
|
||||||
"dependencies": {"python": ["loguru"], "bin": ["exiftool"]},
|
"dependencies": {"python": ["loguru"], "bin": ["exiftool"]},
|
||||||
|
"configs": {
|
||||||
|
"look_for_keys": {
|
||||||
|
"default": [],
|
||||||
|
"help": "list of lowercased metadata keys that will be included in the enriched metadata. Special keys: 'author', 'datetimes', 'location' to include related metadata fields. The default empty list `[]` means all metadata will be included.",
|
||||||
|
"type": "list",
|
||||||
|
},
|
||||||
|
},
|
||||||
"description": """
|
"description": """
|
||||||
Extracts metadata information from files using ExifTool.
|
Extracts metadata information from files using ExifTool.
|
||||||
|
|
||||||
|
|||||||
@@ -16,6 +16,8 @@ class MetadataEnricher(Enricher):
|
|||||||
|
|
||||||
for i, m in enumerate(to_enrich.media):
|
for i, m in enumerate(to_enrich.media):
|
||||||
if len(md := self.get_metadata(m.filename)):
|
if len(md := self.get_metadata(m.filename)):
|
||||||
|
if self.look_for_keys != []:
|
||||||
|
md = self.select_metadata(md, self.look_for_keys)
|
||||||
to_enrich.media[i].set("metadata", md)
|
to_enrich.media[i].set("metadata", md)
|
||||||
|
|
||||||
def get_metadata(self, filename: str) -> dict:
|
def get_metadata(self, filename: str) -> dict:
|
||||||
@@ -23,7 +25,6 @@ class MetadataEnricher(Enricher):
|
|||||||
# Run ExifTool command to extract metadata from the file
|
# Run ExifTool command to extract metadata from the file
|
||||||
cmd = ["exiftool", filename]
|
cmd = ["exiftool", filename]
|
||||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||||
|
|
||||||
# Process the output to extract individual metadata fields
|
# Process the output to extract individual metadata fields
|
||||||
metadata = {}
|
metadata = {}
|
||||||
for line in result.stdout.splitlines():
|
for line in result.stdout.splitlines():
|
||||||
@@ -35,3 +36,33 @@ class MetadataEnricher(Enricher):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error occurred: {e}: {traceback.format_exc()}")
|
logger.error(f"Error occurred: {e}: {traceback.format_exc()}")
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
def select_metadata(self, all_md, requested_metadata_keys):
    """
    Reduce the full ExifTool output to the entries the user asked for.

    A metadata entry is kept when its value is non-empty AND either:
      * its key (as-is or lowercased) is listed in ``requested_metadata_keys``, or
      * a special key was requested and the lowercased entry key contains one
        of that special key's terms.

    Special keys: ``author`` (author/producer/creator), ``datetime`` or
    ``datetimes`` (date/time) and ``location`` (gps/latitude/longitude).

    Args:
        all_md: mapping of metadata field name to value (values are assumed
            to be strings, as produced by parsing ExifTool output).
        requested_metadata_keys: lowercased keys and/or special keys to keep.

    Returns:
        A new dict holding only the selected entries, with original key casing.
    """
    # terms pulled in by each special key; both spellings of the datetime
    # key are accepted because the module help text advertises 'datetimes'
    special_key_terms = {
        "author": ("author", "producer", "creator"),
        "datetime": ("date", "time"),
        "datetimes": ("date", "time"),
        "location": ("gps", "latitude", "longitude"),
    }
    active_terms = [
        term
        for special_key, terms in special_key_terms.items()
        if special_key in requested_metadata_keys
        for term in terms
    ]

    specified_md = {}
    for md_key, md_value in all_md.items():
        # empty values are never selected, whichever way the key was requested
        if not md_value:
            continue
        md_key_lower = md_key.lower()
        requested_directly = md_key_lower in requested_metadata_keys or md_key in requested_metadata_keys
        if requested_directly or any(term in md_key_lower for term in active_terms):
            specified_md[md_key] = md_value
    return specified_md
|
||||||
|
|||||||
@@ -2,6 +2,13 @@ from loguru import logger
|
|||||||
import json
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
def type_serializer(obj):
    """JSON ``default`` hook: classes become their name, anything else its ``str()``."""
    return obj.__name__ if isinstance(obj, type) else str(obj)
|
||||||
|
|
||||||
|
|
||||||
def extract_location(record, short=False):
|
def extract_location(record, short=False):
|
||||||
"""Extracts the file name, function name, and line number from the log record."""
|
"""Extracts the file name, function name, and line number from the log record."""
|
||||||
if short:
|
if short:
|
||||||
@@ -35,11 +42,11 @@ def serialize_for_console(record):
|
|||||||
subset.pop("time", None)
|
subset.pop("time", None)
|
||||||
if not subset:
|
if not subset:
|
||||||
return ""
|
return ""
|
||||||
return json.dumps(subset, ensure_ascii=False)
|
return json.dumps(subset, ensure_ascii=False, default=type_serializer)
|
||||||
|
|
||||||
|
|
||||||
def serialize(record):
|
def serialize(record):
|
||||||
return json.dumps(extract_log_data(record), ensure_ascii=False)
|
return json.dumps(extract_log_data(record), ensure_ascii=False, default=type_serializer)
|
||||||
|
|
||||||
|
|
||||||
def patching(record):
|
def patching(record):
|
||||||
|
|||||||
273
src/auto_archiver/utils/deletion_detection.py
Normal file
273
src/auto_archiver/utils/deletion_detection.py
Normal file
@@ -0,0 +1,273 @@
|
|||||||
|
"""
|
||||||
|
Deletion Detection Utilities
|
||||||
|
|
||||||
|
Provides a best-effort detection of deleted, missing, or unavailable content
|
||||||
|
across various social media platforms based on presence of expected keywords.
|
||||||
|
|
||||||
|
Identifying removed content helps to:
|
||||||
|
- Document content that existed but was deleted
|
||||||
|
- Track patterns of content removal
|
||||||
|
- Preserve metadata about missing content
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Optional, Dict, List
|
||||||
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
|
||||||
|
class DeletionIndicators:
    """
    Platform-specific indicators that content has been deleted or is unavailable, alongside generic indicators.

    Each attribute is a list of phrases; callers match them case-insensitively
    as substrings of page text, titles, error messages or metadata.
    """

    # Twitter/X deletion indicators
    TWITTER: List[str] = [
        "Hmm...this page doesn't exist",
        "Try searching for something else",
        "This Tweet is unavailable",
        "This account doesn't exist",
        "This Tweet has been deleted",
        "This account has been suspended",
        "Sorry, that page doesn't exist",
        "The Tweet you're looking for isn't available",
    ]

    # Facebook deletion indicators
    FACEBOOK: List[str] = [
        "This content isn't available",
        "Sorry, this content isn't available",
        "This content is no longer available",
        "The link you followed may be broken",
        "Page Not Found",
        "Content Not Found",
        "This content is no longer on Facebook",
    ]

    # Instagram deletion indicators
    INSTAGRAM: List[str] = [
        "Sorry, this page isn't available",
        "The link you followed may be broken",
        "Media not found or unavailable",
        "This post is no longer available",
        "This account is private",
    ]

    # TikTok deletion indicators
    TIKTOK: List[str] = [
        "Couldn't find this account",
        "This video is no longer available",
        "This video is currently unavailable",
        "Video not found",
        "This video may have been deleted",
    ]

    # YouTube deletion indicators
    YOUTUBE: List[str] = [
        "This video isn't available anymore",
        "Video unavailable",
        "This video has been removed",
        "This video is no longer available",
        "This video is private",
        "This video has been removed by the uploader",
        "This video has been deleted",
    ]

    # Reddit deletion indicators
    REDDIT: List[str] = [
        "this post has been removed",
        "this comment has been removed",
        "[removed]",
        "[deleted]",
        "page not found",
        "there doesn't seem to be anything here",
    ]

    # VK deletion indicators
    VK: List[str] = [
        "Post deleted",
        "Page not found",
        "Content unavailable",
        "Access denied",
    ]

    # Telegram deletion indicators
    TELEGRAM: List[str] = [
        "Message not found",
        "Deleted message",
        "Channel is private",
    ]

    # Generic indicators (work across platforms)
    GENERIC: List[str] = [
        "has been removed",
        "no longer available",
        "content removed",
        "access denied",
        "page not found",
    ]

    @classmethod
    def all_indicators(cls) -> List[str]:
        """Returns a new list with the deletion indicators of every platform, generic ones last."""
        return (
            cls.TWITTER
            + cls.FACEBOOK
            + cls.INSTAGRAM
            + cls.TIKTOK
            + cls.YOUTUBE
            + cls.REDDIT
            + cls.VK
            + cls.TELEGRAM
            + cls.GENERIC
        )

    @classmethod
    def for_url(cls, url: str) -> List[str]:
        """
        Returns the indicators relevant to the platform the URL belongs to.

        The result is the platform-specific list followed by the generic list;
        for an unrecognised platform only the generic indicators are returned.
        A new list is built on every call, so callers may mutate the result
        without affecting the class-level lists.
        """
        platform = _extract_platform(url)

        platform_specific = {
            "twitter": cls.TWITTER,
            "facebook": cls.FACEBOOK,
            "instagram": cls.INSTAGRAM,
            "tiktok": cls.TIKTOK,
            "youtube": cls.YOUTUBE,
            "reddit": cls.REDDIT,
            "vk": cls.VK,
            "telegram": cls.TELEGRAM,
        }
        # concatenation always yields a fresh list, even for the unknown case
        return platform_specific.get(platform, []) + cls.GENERIC
|
||||||
|
|
||||||
|
|
||||||
|
def detect_deletion(
    html_content: str = None,
    page_title: str = None,
    error_message: str = None,
    url: str = None,
    video_data: dict = None,
) -> Optional[Dict[str, object]]:
    """
    Best-effort deletion detection across multiple signals.

    Signals are checked in a fixed order (HTML content, page title, error
    message, then video metadata) and the first case-insensitive substring
    match against the relevant deletion indicators wins.

    Args:
        html_content: Raw HTML source of the page
        page_title: Browser page title
        error_message: Any error message from the extractor
        url: The URL being archived; selects platform-specific indicators.
            Without it, the indicators of all platforms are used.
        video_data: Video metadata dict from yt-dlp or other extractors

    Returns:
        Dictionary with deletion details if detected, None otherwise.
        Format: {
            "is_deleted": True,
            "indicator": "specific text that was found",
            "source": "html_content|page_title|error_message|video_metadata|video_metadata_<key>",
            "platform": "twitter|facebook|...|unknown"
        }
    """
    # Determine indicators to check based on URL
    if url:
        indicators = DeletionIndicators.for_url(url)
        platform = _extract_platform(url)
    else:
        indicators = DeletionIndicators.all_indicators()
        platform = "unknown"

    # lowercase every indicator once instead of once per comparison
    lowered_indicators = [(indicator, indicator.lower()) for indicator in indicators]

    def _first_match(text):
        """Return the first indicator contained in ``text`` (case-insensitive), or None."""
        haystack = str(text).lower()
        for indicator, needle in lowered_indicators:
            if needle in haystack:
                return indicator
        return None

    # (text to inspect, value reported as "source", label used in the log line)
    text_signals = (
        (html_content, "html_content", "HTML"),
        (page_title, "page_title", "page title"),
        (error_message, "error_message", "error"),
    )
    for text, source, label in text_signals:
        if not text:
            continue
        indicator = _first_match(text)
        if indicator is not None:
            logger.info(f"Deletion detected in {label}: '{indicator}' found for {url}")
            return {"is_deleted": True, "indicator": indicator, "source": source, "platform": platform}

    # Check video metadata (from yt-dlp)
    if video_data:
        # Check if the extractor flagged it as unavailable
        availability = video_data.get("availability")
        if availability in ("unavailable", "private", "deleted"):
            logger.info(f"Deletion detected in metadata: availability={availability}")
            return {
                "is_deleted": True,
                "indicator": f"availability: {availability}",
                "source": "video_metadata",
                "platform": platform,
            }

        # Check description/title for deletion indicators
        for key in ("title", "description", "fulltitle"):
            if key not in video_data:
                continue
            indicator = _first_match(video_data[key])
            if indicator is not None:
                logger.info(f"Deletion detected in {key}: '{indicator}'")
                return {
                    "is_deleted": True,
                    "indicator": indicator,
                    "source": f"video_metadata_{key}",
                    "platform": platform,
                }

    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_platform(url: str) -> str:
|
||||||
|
"""Extracts platform name from URL."""
|
||||||
|
parsed = urlparse(url)
|
||||||
|
domain = parsed.netloc
|
||||||
|
|
||||||
|
if "twitter.com" in domain or "x.com" in domain:
|
||||||
|
return "twitter"
|
||||||
|
elif "facebook.com" in domain or "fb.com" in domain:
|
||||||
|
return "facebook"
|
||||||
|
elif "instagram.com" in domain:
|
||||||
|
return "instagram"
|
||||||
|
elif "tiktok.com" in domain:
|
||||||
|
return "tiktok"
|
||||||
|
elif "youtube.com" in domain or "youtu.be" in domain:
|
||||||
|
return "youtube"
|
||||||
|
elif "reddit.com" in domain:
|
||||||
|
return "reddit"
|
||||||
|
elif "vk.com" in domain:
|
||||||
|
return "vk"
|
||||||
|
elif "t.me" in domain:
|
||||||
|
return "telegram"
|
||||||
|
return "unknown"
|
||||||
|
|
||||||
|
|
||||||
|
def flag_as_deleted(metadata, deletion_info: Dict[str, object]) -> None:
    """
    Flags metadata object as deleted/unavailable.

    Stores the tentative deletion details under the ``deletion_detected``,
    ``deletion_indicator``, ``deletion_source`` and ``deletion_platform``
    keys and sets ``metadata.status`` to ``"deleted_or_unavailable"``.

    Args:
        metadata: Metadata object to update (mutated in place)
        deletion_info: Dictionary from detect_deletion(); missing keys are
            stored as None
    """
    indicator = deletion_info.get("indicator")
    source = deletion_info.get("source")
    platform = deletion_info.get("platform")

    metadata.set("deletion_detected", True)
    metadata.set("deletion_indicator", indicator)
    metadata.set("deletion_source", source)
    metadata.set("deletion_platform", platform)
    metadata.status = "deleted_or_unavailable"

    logger.debug(
        f"Content marked as deleted/unavailable: "
        f"platform={platform}, "
        f"indicator='{indicator}', "
        f"source={source}"
    )
|
||||||
@@ -56,6 +56,19 @@ def test_enrich_sets_metadata(enricher, mocker):
|
|||||||
assert metadata.media == [media1, media2]
|
assert metadata.media == [media1, media2]
|
||||||
|
|
||||||
|
|
||||||
|
def test_enrich_no_metadata_selection(enricher, mocker):
    """Media with extracted metadata gets an empty dict when no key matches look_for_keys."""
    with_md = mocker.Mock(filename="img1.jpg")
    without_md = mocker.Mock(filename="img2.jpg")
    metadata = mocker.Mock()
    metadata.media = [with_md, without_md]

    # only the first file yields any metadata at all
    enricher.get_metadata = lambda f: {"key": "value"} if f == "img1.jpg" else {}
    enricher.look_for_keys = ["no-key"]

    enricher.enrich(metadata)

    with_md.set.assert_called_once_with("metadata", {})
    without_md.set.assert_not_called()
    assert metadata.media == [with_md, without_md]
|
||||||
|
|
||||||
|
|
||||||
def test_enrich_empty_media(enricher, mocker):
|
def test_enrich_empty_media(enricher, mocker):
|
||||||
metadata = mocker.Mock()
|
metadata = mocker.Mock()
|
||||||
metadata.media = []
|
metadata.media = []
|
||||||
@@ -71,7 +84,9 @@ def test_get_metadata_error_handling(enricher, mocker):
|
|||||||
assert "Error occurred: " in mock_log.call_args[0][0]
|
assert "Error occurred: " in mock_log.call_args[0][0]
|
||||||
|
|
||||||
|
|
||||||
def test_metadata_pickle(enricher, unpickle, mocker):
|
# TODO depends on the expected functionality
|
||||||
|
"""
|
||||||
|
def test_default_metadata_pickle(enricher, unpickle, mocker):
|
||||||
mock_run = mocker.patch("subprocess.run")
|
mock_run = mocker.patch("subprocess.run")
|
||||||
# Uses pickled values
|
# Uses pickled values
|
||||||
mock_run.return_value = unpickle("metadata_enricher_exif.pickle")
|
mock_run.return_value = unpickle("metadata_enricher_exif.pickle")
|
||||||
@@ -79,6 +94,39 @@ def test_metadata_pickle(enricher, unpickle, mocker):
|
|||||||
expected = unpickle("metadata_enricher_ytshort_expected.pickle")
|
expected = unpickle("metadata_enricher_ytshort_expected.pickle")
|
||||||
enricher.enrich(metadata)
|
enricher.enrich(metadata)
|
||||||
expected_media = expected.media
|
expected_media = expected.media
|
||||||
|
print(expected_media)
|
||||||
actual_media = metadata.media
|
actual_media = metadata.media
|
||||||
|
|
||||||
assert len(expected_media) == len(actual_media)
|
assert len(expected_media) == len(actual_media)
|
||||||
assert actual_media[0].properties.get("metadata") == expected_media[0].properties.get("metadata")
|
assert actual_media[0].properties.get("metadata") == expected_media[0].properties.get("metadata")
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def test_metadata_pickle_megapixel(enricher, unpickle, mocker):
    """Requesting a single lowercased key keeps only that entry of the pickled ExifTool output."""
    # subprocess.run is replaced so the pickled ExifTool result is used
    exiftool_run = mocker.patch("subprocess.run")
    exiftool_run.return_value = unpickle("metadata_enricher_exif.pickle")
    metadata = unpickle("metadata_enricher_ytshort_input.pickle")
    enricher.look_for_keys = ["megapixels"]

    enricher.enrich(metadata)

    first_media = metadata.media[0]
    assert first_media.properties.get("metadata") == {"Megapixels": "0.922"}
|
||||||
|
|
||||||
|
|
||||||
|
def test_metadata_specify_datetime_and_metapixels(enricher, unpickle, mocker):
    """The special 'datetime' key and directly requested keys can be combined."""
    # subprocess.run is replaced so the pickled ExifTool result is used
    exiftool_run = mocker.patch("subprocess.run")
    exiftool_run.return_value = unpickle("metadata_enricher_exif.pickle")
    metadata = unpickle("metadata_enricher_ytshort_input.pickle")
    enricher.look_for_keys = ["datetime", "megapixels", "image height"]

    enricher.enrich(metadata)

    expected_selection = {
        "File Modification Date/Time": "2025:02:18 19:42:50+00:00",
        "File Access Date/Time": "2025:02:18 19:42:50+00:00",
        "File Inode Change Date/Time": "2025:02:18 19:42:50+00:00",
        "Megapixels": "0.922",
        "Image Height": "720",
    }
    first_media = metadata.media[0]
    assert first_media.properties.get("metadata") == expected_selection
|
||||||
|
|||||||
@@ -5,6 +5,9 @@ from auto_archiver.modules.antibot_extractor_enricher.antibot_extractor_enricher
|
|||||||
from .test_extractor_base import TestExtractorBase
|
from .test_extractor_base import TestExtractorBase
|
||||||
|
|
||||||
|
|
||||||
|
CI = os.getenv("GITHUB_ACTIONS", "") == "true"
|
||||||
|
|
||||||
|
|
||||||
class DummySB:
|
class DummySB:
|
||||||
def __init__(self, url="", title="", visible_texts=None, visible_elements=None):
|
def __init__(self, url="", title="", visible_texts=None, visible_elements=None):
|
||||||
self._url = url
|
self._url = url
|
||||||
@@ -51,7 +54,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
|||||||
|
|
||||||
@pytest.mark.download
|
@pytest.mark.download
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"url,in_title,in_text,image_count,video_count",
|
"url,in_title,in_text,image_count,video_count,skip_ci",
|
||||||
[
|
[
|
||||||
(
|
(
|
||||||
"https://en.wikipedia.org/wiki/Western_barn_owl",
|
"https://en.wikipedia.org/wiki/Western_barn_owl",
|
||||||
@@ -59,6 +62,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
|||||||
"Tyto alba",
|
"Tyto alba",
|
||||||
5,
|
5,
|
||||||
0,
|
0,
|
||||||
|
False,
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
"https://www.bellingcat.com/news/2025/04/29/open-sources-show-myanmar-junta-airstrike-damages-despite-post-earthquake-ceasefire/",
|
"https://www.bellingcat.com/news/2025/04/29/open-sources-show-myanmar-junta-airstrike-damages-despite-post-earthquake-ceasefire/",
|
||||||
@@ -66,6 +70,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
|||||||
"Bellingcat has geolocated",
|
"Bellingcat has geolocated",
|
||||||
5,
|
5,
|
||||||
0,
|
0,
|
||||||
|
False,
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
"https://www.bellingcat.com/news/2025/03/27/gaza-israel-palestine-shot-killed-injured-destroyed-dangerous-drone-journalists-in-gaza/",
|
"https://www.bellingcat.com/news/2025/03/27/gaza-israel-palestine-shot-killed-injured-destroyed-dangerous-drone-journalists-in-gaza/",
|
||||||
@@ -73,6 +78,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
|||||||
"continued the work of Gazan journalists",
|
"continued the work of Gazan journalists",
|
||||||
5,
|
5,
|
||||||
1,
|
1,
|
||||||
|
False,
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
"https://www.bellingcat.com/about/general-information",
|
"https://www.bellingcat.com/about/general-information",
|
||||||
@@ -80,6 +86,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
|||||||
"Stichting Bellingcat",
|
"Stichting Bellingcat",
|
||||||
0, # SVGs are ignored
|
0, # SVGs are ignored
|
||||||
0,
|
0,
|
||||||
|
False,
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
"https://vk.com/wikipedia?from=search&w=wall-36156673_20451",
|
"https://vk.com/wikipedia?from=search&w=wall-36156673_20451",
|
||||||
@@ -87,6 +94,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
|||||||
"16 сентября 1985 года лейблом EMI Records.",
|
"16 сентября 1985 года лейблом EMI Records.",
|
||||||
5,
|
5,
|
||||||
0,
|
0,
|
||||||
|
False,
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
"https://www.tiktok.com/@tracy_2424/photo/7418200173953830162",
|
"https://www.tiktok.com/@tracy_2424/photo/7418200173953830162",
|
||||||
@@ -94,13 +102,19 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
|||||||
"Dito ko lang",
|
"Dito ko lang",
|
||||||
1,
|
1,
|
||||||
0,
|
0,
|
||||||
|
True,
|
||||||
),
|
),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_download_pages_with_media(self, setup_module, make_item, url, in_title, in_text, image_count, video_count):
|
def test_download_pages_with_media(
|
||||||
|
self, setup_module, make_item, url, in_title, in_text, image_count, video_count, skip_ci
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Test downloading pages with media.
|
Test downloading pages with media.
|
||||||
"""
|
"""
|
||||||
|
if CI and skip_ci:
|
||||||
|
pytest.skip("Skipping test in CI environment")
|
||||||
|
|
||||||
self.extractor = setup_module(
|
self.extractor = setup_module(
|
||||||
self.extractor_module,
|
self.extractor_module,
|
||||||
self.config
|
self.config
|
||||||
|
|||||||
@@ -48,8 +48,6 @@ class TestGenericExtractor(TestExtractorBase):
|
|||||||
("https://www.youtube.com/watch?v=5qap5aO4i9A", ["youtube"]),
|
("https://www.youtube.com/watch?v=5qap5aO4i9A", ["youtube"]),
|
||||||
("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", ["tiktok"]),
|
("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", ["tiktok"]),
|
||||||
("https://www.instagram.com/p/CU1J9JYJ9Zz/", ["instagram"]),
|
("https://www.instagram.com/p/CU1J9JYJ9Zz/", ["instagram"]),
|
||||||
("https://www.facebook.com/nytimes/videos/10160796550110716", ["facebook"]),
|
|
||||||
("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/", ["facebook"]),
|
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_suitable_extractors(self, url, suitable_extractors):
|
def test_suitable_extractors(self, url, suitable_extractors):
|
||||||
@@ -148,6 +146,7 @@ class TestGenericExtractor(TestExtractorBase):
|
|||||||
def test_bluesky_download_video(self, make_item):
    """Download a Bluesky video post and check the result keeps the post URL.

    Args:
        make_item: fixture that builds the item to archive from a URL.
    """
    post_url = "https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i"
    item = make_item(post_url)
    result = self.extractor.download(item)
    # Guard first: the `is not False` check implies a failed download can
    # yield False, and calling get_url() on that would raise AttributeError
    # and mask the real failure.
    assert result is not False
    assert result.get_url() == post_url
|
||||||
|
|
||||||
@pytest.mark.skipif(not TEST_TRUTH_SOCIAL, reason="Truth social download tests disabled in environment variables.")
|
@pytest.mark.skipif(not TEST_TRUTH_SOCIAL, reason="Truth social download tests disabled in environment variables.")
|
||||||
|
|||||||
147
tests/utils/test_deletion_detection.py
Normal file
147
tests/utils/test_deletion_detection.py
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
"""
|
||||||
|
Tests for deletion detection utilities.

These tests verify the auto-archiver's current best-effort attempts
to detect when content has been deleted or is unavailable across
various platforms.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted, DeletionIndicators
|
||||||
|
from auto_archiver.core.metadata import Metadata
|
||||||
|
|
||||||
|
|
||||||
|
class TestDeletionIndicators:
    """Checks on the per-platform deletion indicator lists."""

    def test_twitter_indicators(self):
        """Each known Twitter "missing post" phrase is present in the list."""
        required_phrases = (
            "Hmm...this page doesn't exist",
            "Try searching for something else",
            "This Tweet is unavailable",
        )
        for phrase in required_phrases:
            assert phrase in DeletionIndicators.TWITTER

    def test_platform_specific_indicators(self):
        """``for_url`` returns indicators that match the platform in the URL."""
        for_twitter = DeletionIndicators.for_url("https://twitter.com/user/status/123")
        assert any("page doesn't exist" in entry.lower() for entry in for_twitter)

        for_instagram = DeletionIndicators.for_url("https://instagram.com/p/ABC123")
        assert any("page isn't available" in entry.lower() for entry in for_instagram)
|
||||||
|
|
||||||
|
|
||||||
|
class TestDetectDeletion:
    """Exercise ``detect_deletion`` with each kind of evidence it accepts."""

    def test_detect_deletion_in_html_twitter(self):
        """Twitter's missing-page HTML is reported as a deletion."""
        page = "<html><body>Hmm...this page doesn't exist. Try searching for something else.</body></html>"

        outcome = detect_deletion(html_content=page, url="https://twitter.com/user/status/123")

        assert outcome is not None
        assert outcome["is_deleted"] is True
        assert outcome["platform"] == "twitter"
        assert outcome["source"] == "html_content"
        assert "page doesn't exist" in outcome["indicator"].lower()

    def test_detect_deletion_in_page_title(self):
        """A "Page Not Found" title alone is enough to flag a deletion."""
        outcome = detect_deletion(page_title="Page Not Found", url="https://facebook.com/post/123")

        assert outcome is not None
        assert outcome["is_deleted"] is True
        assert outcome["source"] == "page_title"

    def test_detect_deletion_in_error_message(self):
        """A downloader error string is recognised as a deletion signal."""
        failure = "yt_dlp.utils.DownloadError: This video is no longer available"

        outcome = detect_deletion(error_message=failure, url="https://youtube.com/watch?v=abc123")

        assert outcome is not None
        assert outcome["is_deleted"] is True
        assert outcome["platform"] == "youtube"
        assert outcome["source"] == "error_message"

    def test_detect_deletion_in_video_metadata(self):
        """An "unavailable" availability field in video metadata is flagged."""
        info = {"availability": "unavailable", "title": "Private video"}

        outcome = detect_deletion(video_data=info, url="https://youtube.com/watch?v=test123")

        assert outcome is not None
        assert outcome["is_deleted"] is True
        assert outcome["source"] == "video_metadata"
        assert "availability" in outcome["indicator"]

    def test_no_deletion_detected(self):
        """Ordinary page content and title yield no deletion result."""
        page = "<html><body><h1>Welcome to my page</h1><p>This is normal content.</p></body></html>"

        outcome = detect_deletion(html_content=page, page_title="My Normal Page", url="https://example.com/page")

        assert outcome is None

    def test_instagram_media_not_found(self):
        """Instagram's "media not found" error is attributed to Instagram."""
        outcome = detect_deletion(
            error_message="Media not found or unavailable",
            url="https://instagram.com/p/ABC123",
        )

        assert outcome is not None
        assert outcome["platform"] == "instagram"
        assert "not found" in outcome["indicator"].lower()

    def test_reddit_removed_content(self):
        """A Reddit comment body of ``[removed]`` is detected for Reddit URLs."""
        snippet = "<div class='comment'>[removed]</div>"

        outcome = detect_deletion(html_content=snippet, url="https://reddit.com/r/test/comments/abc123")

        assert outcome is not None
        assert outcome["platform"] == "reddit"
|
||||||
|
|
||||||
|
|
||||||
|
class TestFlagAsDeleted:
    """Checks that ``flag_as_deleted`` copies deletion info onto a ``Metadata``."""

    def test_flag_metadata_as_deleted(self):
        """Every deletion field and the status are set on the metadata."""
        record = Metadata()

        flag_as_deleted(
            record,
            {
                "is_deleted": True,
                "indicator": "This Tweet is unavailable",
                "source": "html_content",
                "platform": "twitter",
            },
        )

        assert record.get("deletion_detected") is True
        expected_fields = {
            "deletion_indicator": "This Tweet is unavailable",
            "deletion_source": "html_content",
            "deletion_platform": "twitter",
        }
        for key, value in expected_fields.items():
            assert record.get(key) == value
        assert record.status == "deleted_or_unavailable"

    def test_metadata_contains_deletion_context(self):
        """The stored indicator keeps the original wording of the deletion."""
        record = Metadata()

        flag_as_deleted(
            record,
            {
                "is_deleted": True,
                "indicator": "Video has been removed by the uploader",
                "source": "error_message",
                "platform": "youtube",
            },
        )

        assert "deletion_indicator" in record.metadata
        assert "uploader" in record.get("deletion_indicator")
|
||||||
Reference in New Issue
Block a user