mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 21:28:29 +03:00
Compare commits
35 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
52a7cabaf1 | ||
|
|
a739361e12 | ||
|
|
9a97fede43 | ||
|
|
2d13077fad | ||
|
|
8a4a314cf9 | ||
|
|
75e8b788ae | ||
|
|
defe2315bf | ||
|
|
ba0dffdd5e | ||
|
|
a09927c507 | ||
|
|
6c938c489a | ||
|
|
0e39768da9 | ||
|
|
1e5d6ec4a6 | ||
|
|
3385d004cf | ||
|
|
7f27f7fce0 | ||
|
|
a6e3240af1 | ||
|
|
bf4c196cc2 | ||
|
|
c640cc898a | ||
|
|
3e2c0b564b | ||
|
|
5fd23baa55 | ||
|
|
8a450310c7 | ||
|
|
bef8a14089 | ||
|
|
cd0b093e7a | ||
|
|
096c9d09ef | ||
|
|
df3521e9ca | ||
|
|
a89d0193e4 | ||
|
|
536cbd905f | ||
|
|
a936921c4e | ||
|
|
68f672a4fa | ||
|
|
4ee0ad1cf8 | ||
|
|
bac809451c | ||
|
|
53dc9904ce | ||
|
|
c1f312d42a | ||
|
|
23c9dfe717 | ||
|
|
d02e7e0f02 | ||
|
|
94e0803fb3 |
6
.github/workflows/docker-publish.yaml
vendored
6
.github/workflows/docker-publish.yaml
vendored
@@ -22,7 +22,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Check out the repo
|
- name: Check out the repo
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v6
|
||||||
|
|
||||||
- name: Set up QEMU
|
- name: Set up QEMU
|
||||||
uses: docker/setup-qemu-action@v3
|
uses: docker/setup-qemu-action@v3
|
||||||
@@ -33,14 +33,14 @@ jobs:
|
|||||||
uses: docker/setup-buildx-action@v3
|
uses: docker/setup-buildx-action@v3
|
||||||
|
|
||||||
- name: Log in to Docker Hub
|
- name: Log in to Docker Hub
|
||||||
uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772
|
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9
|
||||||
with:
|
with:
|
||||||
username: ${{ secrets.DOCKER_USERNAME }}
|
username: ${{ secrets.DOCKER_USERNAME }}
|
||||||
password: ${{ secrets.DOCKER_PASSWORD }}
|
password: ${{ secrets.DOCKER_PASSWORD }}
|
||||||
|
|
||||||
- name: Extract metadata (tags, labels) for Docker
|
- name: Extract metadata (tags, labels) for Docker
|
||||||
id: meta
|
id: meta
|
||||||
uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804
|
uses: docker/metadata-action@c299e40c65443455700f0fdfc63efafe5b349051
|
||||||
with:
|
with:
|
||||||
images: bellingcat/auto-archiver
|
images: bellingcat/auto-archiver
|
||||||
|
|
||||||
|
|||||||
4
.github/workflows/python-publish.yaml
vendored
4
.github/workflows/python-publish.yaml
vendored
@@ -22,10 +22,10 @@ jobs:
|
|||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout Repository
|
- name: Checkout Repository
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v6
|
||||||
|
|
||||||
- name: Set up Python
|
- name: Set up Python
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v6
|
||||||
with:
|
with:
|
||||||
python-version-file: pyproject.toml
|
python-version-file: pyproject.toml
|
||||||
|
|
||||||
|
|||||||
6
.github/workflows/ruff.yaml
vendored
6
.github/workflows/ruff.yaml
vendored
@@ -20,11 +20,11 @@ jobs:
|
|||||||
build:
|
build:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v6
|
||||||
- name: Install Python
|
- name: Install Python
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v6
|
||||||
with:
|
with:
|
||||||
python-version: "3.11"
|
python-version: "3.12"
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: |
|
run: |
|
||||||
python -m pip install --upgrade pip
|
python -m pip install --upgrade pip
|
||||||
|
|||||||
6
.github/workflows/tests-core.yaml
vendored
6
.github/workflows/tests-core.yaml
vendored
@@ -26,13 +26,13 @@ jobs:
|
|||||||
working-directory: ./
|
working-directory: ./
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v6
|
||||||
|
|
||||||
- name: Install ffmpeg
|
- name: Install ffmpeg
|
||||||
run: sudo apt-get update && sudo apt-get install -y ffmpeg
|
run: sudo apt-get update && sudo apt-get install -y ffmpeg
|
||||||
|
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
- name: Set up Python ${{ matrix.python-version }}
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v6
|
||||||
with:
|
with:
|
||||||
python-version: ${{ matrix.python-version }}
|
python-version: ${{ matrix.python-version }}
|
||||||
|
|
||||||
@@ -40,7 +40,7 @@ jobs:
|
|||||||
run: pipx install poetry
|
run: pipx install poetry
|
||||||
|
|
||||||
- name: Cache Poetry and pip artifacts
|
- name: Cache Poetry and pip artifacts
|
||||||
uses: actions/cache@v4
|
uses: actions/cache@v5
|
||||||
with:
|
with:
|
||||||
path: |
|
path: |
|
||||||
~/.cache/pypoetry
|
~/.cache/pypoetry
|
||||||
|
|||||||
6
.github/workflows/tests-download.yaml
vendored
6
.github/workflows/tests-download.yaml
vendored
@@ -20,13 +20,13 @@ jobs:
|
|||||||
working-directory: ./
|
working-directory: ./
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v6
|
||||||
|
|
||||||
- name: Install ffmpeg
|
- name: Install ffmpeg
|
||||||
run: sudo apt-get update && sudo apt-get install -y ffmpeg
|
run: sudo apt-get update && sudo apt-get install -y ffmpeg
|
||||||
|
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
- name: Set up Python ${{ matrix.python-version }}
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v6
|
||||||
with:
|
with:
|
||||||
python-version: ${{ matrix.python-version }}
|
python-version: ${{ matrix.python-version }}
|
||||||
|
|
||||||
@@ -34,7 +34,7 @@ jobs:
|
|||||||
run: pipx install poetry
|
run: pipx install poetry
|
||||||
|
|
||||||
- name: Cache Poetry and pip artifacts
|
- name: Cache Poetry and pip artifacts
|
||||||
uses: actions/cache@v4
|
uses: actions/cache@v5
|
||||||
with:
|
with:
|
||||||
path: |
|
path: |
|
||||||
~/.cache/pypoetry
|
~/.cache/pypoetry
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
FROM webrecorder/browsertrix-crawler:1.6.3 AS base
|
FROM webrecorder/browsertrix-crawler:1.11.4 AS base
|
||||||
|
|
||||||
ENV RUNNING_IN_DOCKER=1 \
|
ENV RUNNING_IN_DOCKER=1 \
|
||||||
LANG=C.UTF-8 \
|
LANG=C.UTF-8 \
|
||||||
|
|||||||
1795
poetry.lock
generated
1795
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
|||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "auto-archiver"
|
name = "auto-archiver"
|
||||||
version = "1.1.6"
|
version = "1.2.2"
|
||||||
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
|
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
|
||||||
|
|
||||||
requires-python = ">=3.10,<3.13"
|
requires-python = ">=3.10,<3.13"
|
||||||
@@ -54,11 +54,11 @@ dependencies = [
|
|||||||
"cryptography (>=46.0.3)",
|
"cryptography (>=46.0.3)",
|
||||||
"opentimestamps (>=0.4.5,<0.5.0)",
|
"opentimestamps (>=0.4.5,<0.5.0)",
|
||||||
"bgutil-ytdlp-pot-provider (>=1.0.0)",
|
"bgutil-ytdlp-pot-provider (>=1.0.0)",
|
||||||
"yt-dlp[curl-cffi,default] (>=2025.5.22,<2026.0.0)",
|
"yt-dlp[curl-cffi,default] (>=2025.5.22)",
|
||||||
"secretstorage (>=3.3.3,<4.0.0)",
|
"secretstorage (>=3.3.3,<4.0.0)",
|
||||||
"seleniumbase (>=4.36.4,<5.0.0)",
|
"seleniumbase (>=4.36.4,<5.0.0)",
|
||||||
"pyautogui (>=0.9.54,<0.10.0)",
|
"pyautogui (>=0.9.54,<0.10.0)",
|
||||||
"pyperclip (==1.8.2)",
|
"pyperclip (>=1.9.0)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
@@ -66,7 +66,7 @@ pytest = "^8.3.4"
|
|||||||
autopep8 = "^2.3.1"
|
autopep8 = "^2.3.1"
|
||||||
pytest-loguru = "^0.4.0"
|
pytest-loguru = "^0.4.0"
|
||||||
pytest-mock = "^3.14.0"
|
pytest-mock = "^3.14.0"
|
||||||
ruff = "^0.9.10"
|
ruff = "^0.15.2"
|
||||||
pre-commit = "^4.1.0"
|
pre-commit = "^4.1.0"
|
||||||
|
|
||||||
[tool.poetry.group.docs.dependencies]
|
[tool.poetry.group.docs.dependencies]
|
||||||
|
|||||||
1106
scripts/settings/package-lock.json
generated
1106
scripts/settings/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -16,6 +16,7 @@ from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
|
|||||||
from auto_archiver.modules.antibot_extractor_enricher.dropins.default import DefaultDropin
|
from auto_archiver.modules.antibot_extractor_enricher.dropins.default import DefaultDropin
|
||||||
from auto_archiver.utils.misc import random_str
|
from auto_archiver.utils.misc import random_str
|
||||||
from auto_archiver.utils.url import is_relevant_url
|
from auto_archiver.utils.url import is_relevant_url
|
||||||
|
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
|
||||||
|
|
||||||
|
|
||||||
class AntibotExtractorEnricher(Extractor, Enricher):
|
class AntibotExtractorEnricher(Extractor, Enricher):
|
||||||
@@ -98,8 +99,14 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||||||
|
|
||||||
dropin = self._get_suitable_dropin(url, sb)
|
dropin = self._get_suitable_dropin(url, sb)
|
||||||
if not dropin.open_page(url):
|
if not dropin.open_page(url):
|
||||||
# TODO: could we detect deleted videos?
|
# Check for deletion indicators
|
||||||
logger.warning("Failed to open drop-in page")
|
page_title = sb.get_title()
|
||||||
|
html_source = sb.get_page_source()
|
||||||
|
deletion_info = detect_deletion(html_content=html_source, page_title=page_title, url=url)
|
||||||
|
if deletion_info:
|
||||||
|
flag_as_deleted(to_enrich, deletion_info)
|
||||||
|
return to_enrich
|
||||||
|
logger.warning("Failed to open drop-in page (not detected as deleted)")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if self.detect_auth_wall and (dropin.hit_auth_wall() and self._hit_auth_wall(sb)):
|
if self.detect_auth_wall and (dropin.hit_auth_wall() and self._hit_auth_wall(sb)):
|
||||||
@@ -109,7 +116,15 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||||||
sb.wait_for_ready_state_complete()
|
sb.wait_for_ready_state_complete()
|
||||||
sb.sleep(1) # margin for the page to load completely
|
sb.sleep(1) # margin for the page to load completely
|
||||||
|
|
||||||
to_enrich.set_title(sb.get_title())
|
page_title = sb.get_title()
|
||||||
|
html_source = sb.get_page_source()
|
||||||
|
|
||||||
|
# Check if the page indicates content was deleted
|
||||||
|
deletion_info = detect_deletion(html_content=html_source, page_title=page_title, url=url)
|
||||||
|
if deletion_info:
|
||||||
|
flag_as_deleted(to_enrich, deletion_info)
|
||||||
|
|
||||||
|
to_enrich.set_title(page_title)
|
||||||
self._enrich_html_source_code(sb, to_enrich)
|
self._enrich_html_source_code(sb, to_enrich)
|
||||||
|
|
||||||
self._enrich_full_page_screenshot(sb, to_enrich)
|
self._enrich_full_page_screenshot(sb, to_enrich)
|
||||||
|
|||||||
1
src/auto_archiver/modules/antibot_extractor_enricher/captcha_services/.gitignore
vendored
Normal file
1
src/auto_archiver/modules/antibot_extractor_enricher/captcha_services/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
*.py
|
||||||
@@ -34,7 +34,7 @@ def _extract_metadata(self, webpage, video_id):
|
|||||||
...,
|
...,
|
||||||
"attachments",
|
"attachments",
|
||||||
...,
|
...,
|
||||||
lambda k, v: (k == "media" and str(v["id"]) == video_id and v["__typename"] == "Video"),
|
lambda k, v: k == "media" and str(v["id"]) == video_id and v["__typename"] == "Video",
|
||||||
),
|
),
|
||||||
expected_type=dict,
|
expected_type=dict,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ from auto_archiver.core.extractor import Extractor
|
|||||||
from auto_archiver.core import Metadata, Media
|
from auto_archiver.core import Metadata, Media
|
||||||
from auto_archiver.utils import get_datetime_from_str
|
from auto_archiver.utils import get_datetime_from_str
|
||||||
from auto_archiver.utils.misc import ydl_entry_to_filename
|
from auto_archiver.utils.misc import ydl_entry_to_filename
|
||||||
|
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
|
||||||
from .dropin import GenericDropin
|
from .dropin import GenericDropin
|
||||||
|
|
||||||
|
|
||||||
@@ -354,7 +355,7 @@ class GenericExtractor(Extractor):
|
|||||||
if not dropin:
|
if not dropin:
|
||||||
# TODO: add a proper link to 'how to create your own dropin'
|
# TODO: add a proper link to 'how to create your own dropin'
|
||||||
logger.debug(f"""Could not find valid dropin for {info_extractor.ie_key()}.
|
logger.debug(f"""Could not find valid dropin for {info_extractor.ie_key()}.
|
||||||
Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""")
|
Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/modules/autogen/extractor/generic_extractor.html#dropins""")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
post_data = dropin.extract_post(url, ie_instance)
|
post_data = dropin.extract_post(url, ie_instance)
|
||||||
@@ -484,6 +485,13 @@ class GenericExtractor(Extractor):
|
|||||||
# don't download since it can be a live stream
|
# don't download since it can be a live stream
|
||||||
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
|
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
|
||||||
|
|
||||||
|
# Check for deletion indicators in video data
|
||||||
|
deletion_info = detect_deletion(video_data=data, url=url)
|
||||||
|
if deletion_info:
|
||||||
|
result = Metadata()
|
||||||
|
flag_as_deleted(result, deletion_info)
|
||||||
|
return result
|
||||||
|
|
||||||
result = _helper_for_successful_extract_info(data, info_extractor, url, ydl)
|
result = _helper_for_successful_extract_info(data, info_extractor, url, ydl)
|
||||||
|
|
||||||
except MaxDownloadsReached:
|
except MaxDownloadsReached:
|
||||||
@@ -503,6 +511,13 @@ class GenericExtractor(Extractor):
|
|||||||
try:
|
try:
|
||||||
result = self.get_metadata_for_post(info_extractor, url, ydl)
|
result = self.get_metadata_for_post(info_extractor, url, ydl)
|
||||||
except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
|
except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
|
||||||
|
# Check if the error indicates deletion
|
||||||
|
deletion_info = detect_deletion(error_message=str(post_e), url=url)
|
||||||
|
if deletion_info:
|
||||||
|
result = Metadata()
|
||||||
|
flag_as_deleted(result, deletion_info)
|
||||||
|
return result
|
||||||
|
|
||||||
if "NSFW tweet requires authentication." in str(post_e):
|
if "NSFW tweet requires authentication." in str(post_e):
|
||||||
logger.warning(str(post_e))
|
logger.warning(str(post_e))
|
||||||
return False
|
return False
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ from slugify import slugify
|
|||||||
from auto_archiver.core.metadata import Metadata, Media
|
from auto_archiver.core.metadata import Metadata, Media
|
||||||
from auto_archiver.utils import url as UrlUtil, get_datetime_from_str
|
from auto_archiver.utils import url as UrlUtil, get_datetime_from_str
|
||||||
from auto_archiver.core.extractor import Extractor
|
from auto_archiver.core.extractor import Extractor
|
||||||
|
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
|
||||||
from auto_archiver.modules.generic_extractor.dropin import GenericDropin, InfoExtractor
|
from auto_archiver.modules.generic_extractor.dropin import GenericDropin, InfoExtractor
|
||||||
|
|
||||||
|
|
||||||
@@ -37,7 +38,15 @@ class Twitter(GenericDropin):
|
|||||||
result = Metadata()
|
result = Metadata()
|
||||||
try:
|
try:
|
||||||
if not tweet.get("user") or not tweet.get("created_at"):
|
if not tweet.get("user") or not tweet.get("created_at"):
|
||||||
raise ValueError("Error retreiving post. Are you sure it exists?")
|
# Check for deletion indicators
|
||||||
|
deletion_info = detect_deletion(
|
||||||
|
video_data=tweet, url=url, error_message="Missing user or created_at fields"
|
||||||
|
)
|
||||||
|
if deletion_info:
|
||||||
|
flag_as_deleted(result, deletion_info)
|
||||||
|
return result
|
||||||
|
|
||||||
|
raise ValueError("Error retrieving post. Are you sure it exists?")
|
||||||
timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
|
timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
|
||||||
except (ValueError, KeyError) as ex:
|
except (ValueError, KeyError) as ex:
|
||||||
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
|
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
|
||||||
|
|||||||
@@ -3,6 +3,13 @@
|
|||||||
"type": ["enricher"],
|
"type": ["enricher"],
|
||||||
"requires_setup": True,
|
"requires_setup": True,
|
||||||
"dependencies": {"python": ["loguru"], "bin": ["exiftool"]},
|
"dependencies": {"python": ["loguru"], "bin": ["exiftool"]},
|
||||||
|
"configs": {
|
||||||
|
"look_for_keys": {
|
||||||
|
"default": [],
|
||||||
|
"help": "list of lowercased metadata keys that will be included in the enriched metadata. Special keys: 'author', 'datetimes', 'location' to include related metadata fields. The default empty list `[]` means all metadata will be included.",
|
||||||
|
"type": "list",
|
||||||
|
},
|
||||||
|
},
|
||||||
"description": """
|
"description": """
|
||||||
Extracts metadata information from files using ExifTool.
|
Extracts metadata information from files using ExifTool.
|
||||||
|
|
||||||
|
|||||||
@@ -16,6 +16,8 @@ class MetadataEnricher(Enricher):
|
|||||||
|
|
||||||
for i, m in enumerate(to_enrich.media):
|
for i, m in enumerate(to_enrich.media):
|
||||||
if len(md := self.get_metadata(m.filename)):
|
if len(md := self.get_metadata(m.filename)):
|
||||||
|
if self.look_for_keys != []:
|
||||||
|
md = self.select_metadata(md, self.look_for_keys)
|
||||||
to_enrich.media[i].set("metadata", md)
|
to_enrich.media[i].set("metadata", md)
|
||||||
|
|
||||||
def get_metadata(self, filename: str) -> dict:
|
def get_metadata(self, filename: str) -> dict:
|
||||||
@@ -23,7 +25,6 @@ class MetadataEnricher(Enricher):
|
|||||||
# Run ExifTool command to extract metadata from the file
|
# Run ExifTool command to extract metadata from the file
|
||||||
cmd = ["exiftool", filename]
|
cmd = ["exiftool", filename]
|
||||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||||
|
|
||||||
# Process the output to extract individual metadata fields
|
# Process the output to extract individual metadata fields
|
||||||
metadata = {}
|
metadata = {}
|
||||||
for line in result.stdout.splitlines():
|
for line in result.stdout.splitlines():
|
||||||
@@ -35,3 +36,33 @@ class MetadataEnricher(Enricher):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error occurred: {e}: {traceback.format_exc()}")
|
logger.error(f"Error occurred: {e}: {traceback.format_exc()}")
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
def select_metadata(self, all_md, requested_metadata_keys):
|
||||||
|
"""
|
||||||
|
coordinates the selection of metadata from the general exiftool output to the user-specified grocery list
|
||||||
|
"""
|
||||||
|
# defining the batches of metadata that get pulled for special terms
|
||||||
|
author_key_terms = ["author", "producer", "creator"]
|
||||||
|
datetime_key_terms = ["date", "time"]
|
||||||
|
location_key_terms = ["gps", "latitude", "longitude"]
|
||||||
|
|
||||||
|
specified_md = {}
|
||||||
|
for md_key in all_md.keys():
|
||||||
|
md_key_lower = md_key.lower()
|
||||||
|
# checking for special baskets within the grocery list of requested metadata
|
||||||
|
if ("author" in requested_metadata_keys) and any(
|
||||||
|
term in md_key_lower and len(all_md[md_key]) for term in author_key_terms
|
||||||
|
):
|
||||||
|
specified_md[md_key] = all_md[md_key]
|
||||||
|
if ("datetime" in requested_metadata_keys) and any(
|
||||||
|
term in md_key_lower and len(all_md[md_key]) for term in datetime_key_terms
|
||||||
|
):
|
||||||
|
specified_md[md_key] = all_md[md_key]
|
||||||
|
if ("location" in requested_metadata_keys) and any(
|
||||||
|
term in md_key_lower and len(all_md[md_key]) for term in location_key_terms
|
||||||
|
):
|
||||||
|
specified_md[md_key] = all_md[md_key]
|
||||||
|
# if the metadata value is requested directly
|
||||||
|
if md_key_lower in requested_metadata_keys or md_key in requested_metadata_keys and len(all_md[md_key]):
|
||||||
|
specified_md[md_key] = all_md[md_key]
|
||||||
|
return specified_md
|
||||||
|
|||||||
@@ -24,8 +24,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
self.use_docker = os.environ.get("WACZ_ENABLE_DOCKER") or not os.environ.get("RUNNING_IN_DOCKER")
|
self.use_docker = os.environ.get("WACZ_ENABLE_DOCKER") or not os.environ.get("RUNNING_IN_DOCKER")
|
||||||
self.docker_in_docker = os.environ.get("WACZ_ENABLE_DOCKER") and os.environ.get("RUNNING_IN_DOCKER")
|
self.docker_in_docker = os.environ.get("WACZ_ENABLE_DOCKER") and os.environ.get("RUNNING_IN_DOCKER")
|
||||||
|
|
||||||
self.crawl_id = random_str(8)
|
self.cwd_dind = f"/crawls/crawls{random_str(8)}"
|
||||||
self.cwd_dind = f"/crawls/crawls{self.crawl_id}"
|
|
||||||
self.browsertrix_home_host = os.environ.get("BROWSERTRIX_HOME_HOST")
|
self.browsertrix_home_host = os.environ.get("BROWSERTRIX_HOME_HOST")
|
||||||
self.browsertrix_home_container = os.environ.get("BROWSERTRIX_HOME_CONTAINER") or self.browsertrix_home_host
|
self.browsertrix_home_container = os.environ.get("BROWSERTRIX_HOME_CONTAINER") or self.browsertrix_home_host
|
||||||
# create crawls folder if not exists, so it can be safely removed in cleanup
|
# create crawls folder if not exists, so it can be safely removed in cleanup
|
||||||
@@ -51,7 +50,8 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
|
|
||||||
url = to_enrich.get_url()
|
url = to_enrich.get_url()
|
||||||
|
|
||||||
collection = self.crawl_id
|
crawl_id = random_str(8)
|
||||||
|
collection = crawl_id
|
||||||
browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir)
|
browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir)
|
||||||
browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host
|
browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host
|
||||||
|
|
||||||
@@ -83,8 +83,10 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
# "--blockAds" # note: this has been known to cause issues on cloudflare protected sites
|
# "--blockAds" # note: this has been known to cause issues on cloudflare protected sites
|
||||||
]
|
]
|
||||||
|
|
||||||
|
crawl_cwd_dind = os.path.join(self.cwd_dind, crawl_id)
|
||||||
if self.docker_in_docker:
|
if self.docker_in_docker:
|
||||||
cmd.extend(["--cwd", self.cwd_dind])
|
os.makedirs(crawl_cwd_dind, exist_ok=True)
|
||||||
|
cmd.extend(["--cwd", crawl_cwd_dind])
|
||||||
|
|
||||||
if self.auth_for_site(url):
|
if self.auth_for_site(url):
|
||||||
# there's an auth for this site, but browsertrix only supports username/password auth
|
# there's an auth for this site, but browsertrix only supports username/password auth
|
||||||
@@ -109,7 +111,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
] + cmd
|
] + cmd
|
||||||
|
|
||||||
if self.profile:
|
if self.profile:
|
||||||
profile_file = f"profile-{self.crawl_id}.tar.gz"
|
profile_file = f"profile-{crawl_id}.tar.gz"
|
||||||
profile_fn = os.path.join(browsertrix_home_container, profile_file)
|
profile_fn = os.path.join(browsertrix_home_container, profile_file)
|
||||||
logger.debug(f"Copying {self.profile} to {profile_fn}")
|
logger.debug(f"Copying {self.profile} to {profile_fn}")
|
||||||
shutil.copyfile(self.profile, profile_fn)
|
shutil.copyfile(self.profile, profile_fn)
|
||||||
@@ -137,7 +139,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
if self.docker_in_docker:
|
if self.docker_in_docker:
|
||||||
wacz_fn = os.path.join(self.cwd_dind, "collections", collection, f"{collection}.wacz")
|
wacz_fn = os.path.join(crawl_cwd_dind, "collections", collection, f"{collection}.wacz")
|
||||||
elif self.use_docker:
|
elif self.use_docker:
|
||||||
wacz_fn = os.path.join(browsertrix_home_container, "collections", collection, f"{collection}.wacz")
|
wacz_fn = os.path.join(browsertrix_home_container, "collections", collection, f"{collection}.wacz")
|
||||||
else:
|
else:
|
||||||
@@ -152,7 +154,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
|||||||
self.extract_media_from_wacz(to_enrich, wacz_fn)
|
self.extract_media_from_wacz(to_enrich, wacz_fn)
|
||||||
|
|
||||||
if self.docker_in_docker:
|
if self.docker_in_docker:
|
||||||
jsonl_fn = os.path.join(self.cwd_dind, "collections", collection, "pages", "pages.jsonl")
|
jsonl_fn = os.path.join(crawl_cwd_dind, "collections", collection, "pages", "pages.jsonl")
|
||||||
elif self.use_docker:
|
elif self.use_docker:
|
||||||
jsonl_fn = os.path.join(browsertrix_home_container, "collections", collection, "pages", "pages.jsonl")
|
jsonl_fn = os.path.join(browsertrix_home_container, "collections", collection, "pages", "pages.jsonl")
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -2,6 +2,13 @@ from loguru import logger
|
|||||||
import json
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
def type_serializer(obj):
|
||||||
|
"""Fallback function for objects json can't handle."""
|
||||||
|
if isinstance(obj, type):
|
||||||
|
return obj.__name__
|
||||||
|
return str(obj)
|
||||||
|
|
||||||
|
|
||||||
def extract_location(record, short=False):
|
def extract_location(record, short=False):
|
||||||
"""Extracts the file name, function name, and line number from the log record."""
|
"""Extracts the file name, function name, and line number from the log record."""
|
||||||
if short:
|
if short:
|
||||||
@@ -35,11 +42,11 @@ def serialize_for_console(record):
|
|||||||
subset.pop("time", None)
|
subset.pop("time", None)
|
||||||
if not subset:
|
if not subset:
|
||||||
return ""
|
return ""
|
||||||
return json.dumps(subset, ensure_ascii=False)
|
return json.dumps(subset, ensure_ascii=False, default=type_serializer)
|
||||||
|
|
||||||
|
|
||||||
def serialize(record):
|
def serialize(record):
|
||||||
return json.dumps(extract_log_data(record), ensure_ascii=False)
|
return json.dumps(extract_log_data(record), ensure_ascii=False, default=type_serializer)
|
||||||
|
|
||||||
|
|
||||||
def patching(record):
|
def patching(record):
|
||||||
|
|||||||
273
src/auto_archiver/utils/deletion_detection.py
Normal file
273
src/auto_archiver/utils/deletion_detection.py
Normal file
@@ -0,0 +1,273 @@
|
|||||||
|
"""
|
||||||
|
Deletion Detection Utilities
|
||||||
|
|
||||||
|
Provides a best-effort detection of deleted, missing, or unavailable content
|
||||||
|
across various social media platforms based on presence of expected keywords.
|
||||||
|
|
||||||
|
This module helps identify removed content, helps to:
|
||||||
|
- Document content that existed but was deleted
|
||||||
|
- Track patterns of content removal
|
||||||
|
- Preserve metadata about missing content
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Optional, Dict, List
|
||||||
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
|
||||||
|
class DeletionIndicators:
|
||||||
|
"""
|
||||||
|
Platform-specific indicators that content has been deleted or is unavailable, alongside generic indicators.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Twitter/X deletion indicators
|
||||||
|
TWITTER = [
|
||||||
|
"Hmm...this page doesn't exist",
|
||||||
|
"Try searching for something else",
|
||||||
|
"This Tweet is unavailable",
|
||||||
|
"This account doesn't exist",
|
||||||
|
"This Tweet has been deleted",
|
||||||
|
"This account has been suspended",
|
||||||
|
"Sorry, that page doesn't exist",
|
||||||
|
"The Tweet you're looking for isn't available",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Facebook deletion indicators
|
||||||
|
FACEBOOK = [
|
||||||
|
"This content isn't available",
|
||||||
|
"Sorry, this content isn't available",
|
||||||
|
"This content is no longer available",
|
||||||
|
"The link you followed may be broken",
|
||||||
|
"Page Not Found",
|
||||||
|
"Content Not Found",
|
||||||
|
"This content is no longer on Facebook",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Instagram deletion indicators
|
||||||
|
INSTAGRAM = [
|
||||||
|
"Sorry, this page isn't available",
|
||||||
|
"The link you followed may be broken",
|
||||||
|
"Media not found or unavailable",
|
||||||
|
"This post is no longer available",
|
||||||
|
"This account is private",
|
||||||
|
]
|
||||||
|
|
||||||
|
# TikTok deletion indicators
|
||||||
|
TIKTOK = [
|
||||||
|
"Couldn't find this account",
|
||||||
|
"This video is no longer available",
|
||||||
|
"This video is currently unavailable",
|
||||||
|
"Video not found",
|
||||||
|
"This video may have been deleted",
|
||||||
|
]
|
||||||
|
|
||||||
|
# YouTube deletion indicators
|
||||||
|
YOUTUBE = [
|
||||||
|
"This video isn't available anymore",
|
||||||
|
"Video unavailable",
|
||||||
|
"This video has been removed",
|
||||||
|
"This video is no longer available",
|
||||||
|
"This video is private",
|
||||||
|
"This video has been removed by the uploader",
|
||||||
|
"This video has been deleted",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Reddit deletion indicators
|
||||||
|
REDDIT = [
|
||||||
|
"this post has been removed",
|
||||||
|
"this comment has been removed",
|
||||||
|
"[removed]",
|
||||||
|
"[deleted]",
|
||||||
|
"page not found",
|
||||||
|
"there doesn't seem to be anything here",
|
||||||
|
]
|
||||||
|
|
||||||
|
# VK deletion indicators
|
||||||
|
VK = [
|
||||||
|
"Post deleted",
|
||||||
|
"Page not found",
|
||||||
|
"Content unavailable",
|
||||||
|
"Access denied",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Telegram deletion indicators
|
||||||
|
TELEGRAM = [
|
||||||
|
"Message not found",
|
||||||
|
"Deleted message",
|
||||||
|
"Channel is private",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Generic indicators (work across platforms)
|
||||||
|
GENERIC = [
|
||||||
|
"has been removed",
|
||||||
|
"no longer available",
|
||||||
|
"content removed",
|
||||||
|
"access denied",
|
||||||
|
"page not found",
|
||||||
|
]
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def all_indicators(cls) -> List[str]:
|
||||||
|
"""Returns all deletion indicators from all platforms."""
|
||||||
|
return (
|
||||||
|
cls.TWITTER
|
||||||
|
+ cls.FACEBOOK
|
||||||
|
+ cls.INSTAGRAM
|
||||||
|
+ cls.TIKTOK
|
||||||
|
+ cls.YOUTUBE
|
||||||
|
+ cls.REDDIT
|
||||||
|
+ cls.VK
|
||||||
|
+ cls.TELEGRAM
|
||||||
|
+ cls.GENERIC
|
||||||
|
)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def for_url(cls, url: str) -> List[str]:
|
||||||
|
"""Returns platform-specific indicators based on URL domain."""
|
||||||
|
platform = _extract_platform(url)
|
||||||
|
|
||||||
|
indicators_map = {
|
||||||
|
"twitter": cls.TWITTER + cls.GENERIC,
|
||||||
|
"facebook": cls.FACEBOOK + cls.GENERIC,
|
||||||
|
"instagram": cls.INSTAGRAM + cls.GENERIC,
|
||||||
|
"tiktok": cls.TIKTOK + cls.GENERIC,
|
||||||
|
"youtube": cls.YOUTUBE + cls.GENERIC,
|
||||||
|
"reddit": cls.REDDIT + cls.GENERIC,
|
||||||
|
"vk": cls.VK + cls.GENERIC,
|
||||||
|
"telegram": cls.TELEGRAM + cls.GENERIC,
|
||||||
|
}
|
||||||
|
return indicators_map.get(platform, cls.GENERIC)
|
||||||
|
|
||||||
|
|
||||||
|
def detect_deletion(
    html_content: Optional[str] = None,
    page_title: Optional[str] = None,
    error_message: Optional[str] = None,
    url: Optional[str] = None,
    video_data: Optional[dict] = None,
) -> Optional[Dict[str, object]]:
    """
    Best-effort deletion detection across multiple signals.

    Checks HTML content, page titles, error messages, and video metadata for
    indicators that content has been deleted or is unavailable. The signals are
    checked in that order and the first match wins. All text comparisons are
    case-insensitive substring checks.

    Args:
        html_content: Raw HTML source of the page
        page_title: Browser page title
        error_message: Any error message from the extractor (converted with str())
        url: The URL being archived (for platform-specific detection). When it
            is missing, the indicators of every platform are used and the
            platform is reported as "unknown".
        video_data: Video metadata from yt-dlp or other extractors

    Returns:
        Dictionary with deletion details if detected, None otherwise.
        Format: {
            "is_deleted": True,
            "indicator": "specific text that was found",
            "source": "html_content|page_title|error_message|video_metadata|video_metadata_<key>",
            "platform": "twitter|facebook|etc, or unknown"
        }
    """

    # Determine indicators to check based on URL
    if url:
        indicators = DeletionIndicators.for_url(url)
        platform = _extract_platform(url)
    else:
        indicators = DeletionIndicators.all_indicators()
        platform = "unknown"

    # Plain-text signals share the same scan; each entry is
    # (text to scan, label used in the log line, value reported as "source").
    text_signals = (
        (html_content, "HTML", "html_content"),
        (page_title, "page title", "page_title"),
        (str(error_message) if error_message else None, "error", "error_message"),
    )
    for text, label, source in text_signals:
        if not text:
            continue
        indicator = _first_matching_indicator(text, indicators)
        if indicator is not None:
            logger.info(f"Deletion detected in {label}: '{indicator}' found for {url}")
            return {"is_deleted": True, "indicator": indicator, "source": source, "platform": platform}

    # Check video metadata (from yt-dlp)
    if video_data:
        # An explicit availability flag takes precedence over text matching.
        availability = video_data.get("availability")
        if availability in ("unavailable", "private", "deleted"):
            logger.info(f"Deletion detected in metadata: availability={availability}")
            return {
                "is_deleted": True,
                "indicator": f"availability: {availability}",
                "source": "video_metadata",
                "platform": platform,
            }

        # Check description/title for deletion indicators
        for key in ("title", "description", "fulltitle"):
            if key not in video_data:
                continue
            indicator = _first_matching_indicator(str(video_data[key]), indicators)
            if indicator is not None:
                logger.info(f"Deletion detected in {key}: '{indicator}'")
                return {
                    "is_deleted": True,
                    "indicator": indicator,
                    "source": f"video_metadata_{key}",
                    "platform": platform,
                }

    return None


def _first_matching_indicator(text: str, indicators: List[str]) -> Optional[str]:
    """Return the first indicator contained in ``text`` (case-insensitive), else None."""
    # Lower-case the (possibly very large) text once instead of once per indicator.
    haystack = text.lower()
    return next((indicator for indicator in indicators if indicator.lower() in haystack), None)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_platform(url: str) -> str:
|
||||||
|
"""Extracts platform name from URL."""
|
||||||
|
parsed = urlparse(url)
|
||||||
|
domain = parsed.netloc
|
||||||
|
|
||||||
|
if "twitter.com" in domain or "x.com" in domain:
|
||||||
|
return "twitter"
|
||||||
|
elif "facebook.com" in domain or "fb.com" in domain:
|
||||||
|
return "facebook"
|
||||||
|
elif "instagram.com" in domain:
|
||||||
|
return "instagram"
|
||||||
|
elif "tiktok.com" in domain:
|
||||||
|
return "tiktok"
|
||||||
|
elif "youtube.com" in domain or "youtu.be" in domain:
|
||||||
|
return "youtube"
|
||||||
|
elif "reddit.com" in domain:
|
||||||
|
return "reddit"
|
||||||
|
elif "vk.com" in domain:
|
||||||
|
return "vk"
|
||||||
|
elif "t.me" in domain:
|
||||||
|
return "telegram"
|
||||||
|
return "unknown"
|
||||||
|
|
||||||
|
|
||||||
|
def flag_as_deleted(metadata, deletion_info: Dict[str, object]) -> None:
    """
    Flags metadata object as deleted/unavailable.
    Adds tentative deletion information to the metadata object.

    Sets the "deletion_detected", "deletion_indicator", "deletion_source" and
    "deletion_platform" entries through ``metadata.set`` and changes
    ``metadata.status`` to "deleted_or_unavailable".

    Args:
        metadata: Metadata object to update
        deletion_info: Dictionary from detect_deletion(); keys that are absent
            are stored as None.
    """
    # Read everything up front so a malformed deletion_info fails before the
    # metadata object has been partially updated.
    indicator = deletion_info.get("indicator")
    source = deletion_info.get("source")
    platform = deletion_info.get("platform")

    metadata.set("deletion_detected", True)
    metadata.set("deletion_indicator", indicator)
    metadata.set("deletion_source", source)
    metadata.set("deletion_platform", platform)
    metadata.status = "deleted_or_unavailable"

    logger.debug(
        f"Content marked as deleted/unavailable: "
        f"platform={platform}, "
        f"indicator='{indicator}', "
        f"source={source}"
    )
|
||||||
@@ -56,6 +56,19 @@ def test_enrich_sets_metadata(enricher, mocker):
|
|||||||
assert metadata.media == [media1, media2]
|
assert metadata.media == [media1, media2]
|
||||||
|
|
||||||
|
|
||||||
|
def test_enrich_no_metadata_selection(enricher, mocker):
    """Enrich with a ``look_for_keys`` entry that is absent from the stubbed metadata.

    The first media item (whose stub returns one unrelated key) is expected to
    receive an empty metadata dict; the second (whose stub returns nothing) is
    expected to be left untouched. The media list itself must be unchanged.
    """
    first_media, second_media = (mocker.Mock(filename=name) for name in ("img1.jpg", "img2.jpg"))
    item = mocker.Mock()
    item.media = [first_media, second_media]

    def stub_get_metadata(f):
        # Only img1.jpg has any metadata, and none of it matches look_for_keys.
        if f == "img1.jpg":
            return {"key": "value"}
        return {}

    enricher.get_metadata = stub_get_metadata
    enricher.look_for_keys = ["no-key"]

    enricher.enrich(item)

    first_media.set.assert_called_once_with("metadata", {})
    second_media.set.assert_not_called()
    assert item.media == [first_media, second_media]
|
||||||
|
|
||||||
|
|
||||||
def test_enrich_empty_media(enricher, mocker):
|
def test_enrich_empty_media(enricher, mocker):
|
||||||
metadata = mocker.Mock()
|
metadata = mocker.Mock()
|
||||||
metadata.media = []
|
metadata.media = []
|
||||||
@@ -71,7 +84,9 @@ def test_get_metadata_error_handling(enricher, mocker):
|
|||||||
assert "Error occurred: " in mock_log.call_args[0][0]
|
assert "Error occurred: " in mock_log.call_args[0][0]
|
||||||
|
|
||||||
|
|
||||||
def test_metadata_pickle(enricher, unpickle, mocker):
|
# TODO depends on the expected functionality
|
||||||
|
"""
|
||||||
|
def test_default_metadata_pickle(enricher, unpickle, mocker):
|
||||||
mock_run = mocker.patch("subprocess.run")
|
mock_run = mocker.patch("subprocess.run")
|
||||||
# Uses pickled values
|
# Uses pickled values
|
||||||
mock_run.return_value = unpickle("metadata_enricher_exif.pickle")
|
mock_run.return_value = unpickle("metadata_enricher_exif.pickle")
|
||||||
@@ -79,6 +94,39 @@ def test_metadata_pickle(enricher, unpickle, mocker):
|
|||||||
expected = unpickle("metadata_enricher_ytshort_expected.pickle")
|
expected = unpickle("metadata_enricher_ytshort_expected.pickle")
|
||||||
enricher.enrich(metadata)
|
enricher.enrich(metadata)
|
||||||
expected_media = expected.media
|
expected_media = expected.media
|
||||||
|
print(expected_media)
|
||||||
actual_media = metadata.media
|
actual_media = metadata.media
|
||||||
|
|
||||||
assert len(expected_media) == len(actual_media)
|
assert len(expected_media) == len(actual_media)
|
||||||
assert actual_media[0].properties.get("metadata") == expected_media[0].properties.get("metadata")
|
assert actual_media[0].properties.get("metadata") == expected_media[0].properties.get("metadata")
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def test_metadata_pickle_megapixel(enricher, unpickle, mocker):
    """Restricting ``look_for_keys`` to "megapixels" keeps only the Megapixels entry."""
    # subprocess.run is replaced by a mock that returns the pickled result.
    patched_run = mocker.patch("subprocess.run")
    patched_run.return_value = unpickle("metadata_enricher_exif.pickle")
    item = unpickle("metadata_enricher_ytshort_input.pickle")

    enricher.look_for_keys = ["megapixels"]
    enricher.enrich(item)

    stored = item.media[0].properties.get("metadata")
    assert stored == {"Megapixels": "0.922"}
|
||||||
|
|
||||||
|
|
||||||
|
def test_metadata_specify_datetime_and_metapixels(enricher, unpickle, mocker):
    """Several ``look_for_keys`` entries select every matching metadata field.

    With "datetime", "megapixels" and "image height" requested, the stored
    metadata is expected to hold the three date/time fields plus the
    Megapixels and Image Height entries.
    """
    # subprocess.run is replaced by a mock that returns the pickled result.
    patched_run = mocker.patch("subprocess.run")
    patched_run.return_value = unpickle("metadata_enricher_exif.pickle")
    item = unpickle("metadata_enricher_ytshort_input.pickle")

    enricher.look_for_keys = ["datetime", "megapixels", "image height"]
    enricher.enrich(item)

    expected = {
        "File Modification Date/Time": "2025:02:18 19:42:50+00:00",
        "File Access Date/Time": "2025:02:18 19:42:50+00:00",
        "File Inode Change Date/Time": "2025:02:18 19:42:50+00:00",
        "Megapixels": "0.922",
        "Image Height": "720",
    }
    stored = item.media[0].properties.get("metadata")
    assert stored == expected
|
||||||
|
|||||||
@@ -60,7 +60,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
|||||||
"https://en.wikipedia.org/wiki/Western_barn_owl",
|
"https://en.wikipedia.org/wiki/Western_barn_owl",
|
||||||
"western barn owl",
|
"western barn owl",
|
||||||
"Tyto alba",
|
"Tyto alba",
|
||||||
4,
|
5,
|
||||||
0,
|
0,
|
||||||
False,
|
False,
|
||||||
),
|
),
|
||||||
|
|||||||
147
tests/utils/test_deletion_detection.py
Normal file
147
tests/utils/test_deletion_detection.py
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
"""
|
||||||
|
Tests for deletion detection utilities.
|
||||||
|
|
||||||
|
These tests verify the current best-effort by the auto-archiver
|
||||||
|
to detect when content has been deleted or is unavailable across
|
||||||
|
various platforms.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted, DeletionIndicators
|
||||||
|
from auto_archiver.core.metadata import Metadata
|
||||||
|
|
||||||
|
|
||||||
|
class TestDeletionIndicators:
    """Checks on the per-platform indicator lists and their URL-based lookup."""

    def test_twitter_indicators(self):
        """The Twitter list contains the known "missing tweet" phrases."""
        expected_phrases = (
            "Hmm...this page doesn't exist",
            "Try searching for something else",
            "This Tweet is unavailable",
        )
        for phrase in expected_phrases:
            assert phrase in DeletionIndicators.TWITTER

    def test_platform_specific_indicators(self):
        """for_url() returns indicators that belong to the URL's platform."""
        cases = (
            ("https://twitter.com/user/status/123", "page doesn't exist"),
            ("https://instagram.com/p/ABC123", "page isn't available"),
        )
        for url, fragment in cases:
            lowered = [entry.lower() for entry in DeletionIndicators.for_url(url)]
            assert any(fragment in entry for entry in lowered)
|
||||||
|
|
||||||
|
|
||||||
|
class TestDetectDeletion:
    """Exercises detect_deletion() with one input signal at a time."""

    def test_detect_deletion_in_html_twitter(self):
        """Twitter's "page doesn't exist" HTML is reported as a deletion."""
        outcome = detect_deletion(
            html_content="<html><body>Hmm...this page doesn't exist. Try searching for something else.</body></html>",
            url="https://twitter.com/user/status/123",
        )

        assert outcome is not None
        assert outcome["is_deleted"] is True
        assert outcome["platform"] == "twitter"
        assert outcome["source"] == "html_content"
        assert "page doesn't exist" in outcome["indicator"].lower()

    def test_detect_deletion_in_page_title(self):
        """A "Page Not Found" title alone is enough to report a deletion."""
        outcome = detect_deletion(page_title="Page Not Found", url="https://facebook.com/post/123")

        assert outcome is not None
        assert outcome["is_deleted"] is True
        assert outcome["source"] == "page_title"

    def test_detect_deletion_in_error_message(self):
        """An extractor error mentioning unavailability is reported as a deletion."""
        outcome = detect_deletion(
            error_message="yt_dlp.utils.DownloadError: This video is no longer available",
            url="https://youtube.com/watch?v=abc123",
        )

        assert outcome is not None
        assert outcome["is_deleted"] is True
        assert outcome["platform"] == "youtube"
        assert outcome["source"] == "error_message"

    def test_detect_deletion_in_video_metadata(self):
        """An "unavailable" availability flag in the video metadata is reported."""
        info = {"availability": "unavailable", "title": "Private video"}

        outcome = detect_deletion(video_data=info, url="https://youtube.com/watch?v=test123")

        assert outcome is not None
        assert outcome["is_deleted"] is True
        assert outcome["source"] == "video_metadata"
        assert "availability" in outcome["indicator"]

    def test_no_deletion_detected(self):
        """Ordinary page content must not be flagged."""
        outcome = detect_deletion(
            html_content="<html><body><h1>Welcome to my page</h1><p>This is normal content.</p></body></html>",
            page_title="My Normal Page",
            url="https://example.com/page",
        )

        assert outcome is None

    def test_instagram_media_not_found(self):
        """Instagram's "media not found" error is attributed to instagram."""
        outcome = detect_deletion(
            error_message="Media not found or unavailable",
            url="https://instagram.com/p/ABC123",
        )

        assert outcome is not None
        assert outcome["platform"] == "instagram"
        assert "not found" in outcome["indicator"].lower()

    def test_reddit_removed_content(self):
        """A Reddit "[removed]" marker in the HTML is reported for reddit."""
        outcome = detect_deletion(
            html_content="<div class='comment'>[removed]</div>",
            url="https://reddit.com/r/test/comments/abc123",
        )

        assert outcome is not None
        assert outcome["platform"] == "reddit"
|
||||||
|
|
||||||
|
|
||||||
|
class TestFlagAsDeleted:
    """Checks what flag_as_deleted() writes onto a Metadata object."""

    def test_flag_metadata_as_deleted(self):
        """Every deletion field and the status are copied onto the metadata."""
        item = Metadata()

        flag_as_deleted(
            item,
            {
                "is_deleted": True,
                "indicator": "This Tweet is unavailable",
                "source": "html_content",
                "platform": "twitter",
            },
        )

        assert item.get("deletion_detected") is True
        expected_fields = {
            "deletion_indicator": "This Tweet is unavailable",
            "deletion_source": "html_content",
            "deletion_platform": "twitter",
        }
        for field, value in expected_fields.items():
            assert item.get(field) == value
        assert item.status == "deleted_or_unavailable"

    def test_metadata_contains_deletion_context(self):
        """The matched indicator text is kept on the metadata for later review."""
        item = Metadata()
        info = {
            "is_deleted": True,
            "indicator": "Video has been removed by the uploader",
            "source": "error_message",
            "platform": "youtube",
        }

        flag_as_deleted(item, info)

        assert "deletion_indicator" in item.metadata
        assert "uploader" in item.get("deletion_indicator")
|
||||||
Reference in New Issue
Block a user