mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-07-03 23:28:36 +03:00
Compare commits
11 Commits
v1.2.6
...
dependabot
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d8d7c2335f | ||
|
|
afbe4fac50 | ||
|
|
e633be1721 | ||
|
|
bc06de8e5c | ||
|
|
20fddce3a3 | ||
|
|
6efa439cdb | ||
|
|
ef77d1fc86 | ||
|
|
a57a5ee005 | ||
|
|
2582f567ac | ||
|
|
4e5c1a6218 | ||
|
|
12d9c469b2 |
@@ -4,15 +4,14 @@ ENV RUNNING_IN_DOCKER=1 \
|
||||
LANG=C.UTF-8 \
|
||||
LC_ALL=C.UTF-8 \
|
||||
PYTHONDONTWRITEBYTECODE=1 \
|
||||
PYTHONFAULTHANDLER=1 \
|
||||
PATH="/root/.local/bin:$PATH"
|
||||
PYTHONFAULTHANDLER=1
|
||||
|
||||
|
||||
ARG TARGETARCH
|
||||
|
||||
# Installing system dependencies
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends gcc ffmpeg fonts-noto exiftool python3-tk
|
||||
apt-get install -y --no-install-recommends gcc ffmpeg fonts-noto exiftool python3-tk
|
||||
|
||||
# Poetry and runtime
|
||||
FROM base AS runtime
|
||||
|
||||
643
poetry.lock
generated
643
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[project]
|
||||
name = "auto-archiver"
|
||||
version = "1.2.6"
|
||||
version = "1.2.7"
|
||||
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
|
||||
|
||||
requires-python = ">=3.10,<3.13"
|
||||
@@ -27,20 +27,20 @@ dependencies = [
|
||||
"bs4 (>=0.0.0)",
|
||||
"loguru (>=0.0.0)",
|
||||
"ffmpeg-python (>=0.0.0)",
|
||||
"telethon (>=0.0.0)",
|
||||
"google-api-python-client (>=0.0.0)",
|
||||
"google-auth-httplib2 (>=0.0.0)",
|
||||
"google-auth-oauthlib (>=0.0.0)",
|
||||
"telethon (>=1.44.0)",
|
||||
"google-api-python-client (>=2.198.0)",
|
||||
"google-auth-httplib2 (>=0.4.0)",
|
||||
"google-auth-oauthlib (>=1.4.0)",
|
||||
"oauth2client (>=0.0.0)",
|
||||
"pdqhash (>=0.0.0)",
|
||||
"pillow (>=0.0.0)",
|
||||
"pillow (>=12.3.0)",
|
||||
"python-slugify (>=0.0.0)",
|
||||
"dateparser (>=0.0.0)",
|
||||
"dateparser (>=1.4.1)",
|
||||
"python-twitter-v2 (>=0.0.0)",
|
||||
"instaloader (>=0.0.0)",
|
||||
"tqdm (>=0.0.0)",
|
||||
"tqdm (>=4.68.3)",
|
||||
"jinja2 (>=0.0.0)",
|
||||
"boto3 (>=1.28.0,<2.0.0)",
|
||||
"boto3 (>=1.43.39,<2.0.0)",
|
||||
"dataclasses-json (>=0.0.0)",
|
||||
"numpy (==2.1.3)",
|
||||
"requests[socks] (>=0.0.0)",
|
||||
@@ -48,13 +48,13 @@ dependencies = [
|
||||
"jsonlines (>=0.0.0)",
|
||||
"pysubs2 (>=0.0.0)",
|
||||
"retrying (>=0.0.0)",
|
||||
"rich-argparse (>=1.6.0,<2.0.0)",
|
||||
"rich-argparse (>=1.8.0,<2.0.0)",
|
||||
"ruamel-yaml (>=0.18.10,<0.19.0)",
|
||||
"rfc3161-client (>=1.0.5)",
|
||||
"cryptography (>=46.0.3)",
|
||||
"cryptography (>=49.0.0)",
|
||||
"opentimestamps (>=0.4.5,<0.5.0)",
|
||||
"bgutil-ytdlp-pot-provider (>=1.0.0)",
|
||||
"yt-dlp[curl-cffi,default] (>=2025.5.22)",
|
||||
"yt-dlp[curl-cffi,default] (>=2026.6.9)",
|
||||
"secretstorage (>=3.3.3,<4.0.0)",
|
||||
"seleniumbase (>=4.36.4,<5.0.0)",
|
||||
"pyautogui (>=0.9.54,<0.10.0)",
|
||||
@@ -64,15 +64,15 @@ dependencies = [
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
pytest = "^8.3.4"
|
||||
autopep8 = "^2.3.1"
|
||||
pytest-loguru = "^0.4.0"
|
||||
pytest-loguru = "^0.4.1"
|
||||
pytest-mock = "^3.14.0"
|
||||
ruff = "^0.15.2"
|
||||
ruff = "^0.15.20"
|
||||
pre-commit = "^4.1.0"
|
||||
|
||||
[tool.poetry.group.docs.dependencies]
|
||||
sphinx = "^8.1.3"
|
||||
sphinx-autoapi = "^3.4.0"
|
||||
sphinxcontrib-mermaid = "^1.0.0"
|
||||
sphinxcontrib-mermaid = "^2.0.2"
|
||||
sphinx-autobuild = "^2024.10.3"
|
||||
sphinx-copybutton = "^0.5.2"
|
||||
myst-parser = "^4.0.0"
|
||||
|
||||
@@ -11,6 +11,7 @@ Key Functionalities:
|
||||
|
||||
from __future__ import annotations
|
||||
import hashlib
|
||||
import os
|
||||
from typing import Any, List, Union, Dict
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses_json import dataclass_json
|
||||
@@ -186,6 +187,9 @@ class Metadata:
|
||||
continue
|
||||
h = m.get("hash")
|
||||
if not h:
|
||||
if not os.path.exists(m.filename):
|
||||
logger.warning(f"Skipping missing media file: {m.filename}")
|
||||
continue
|
||||
h = calculate_hash_in_chunks(hashlib.sha256(), int(1.6e7), m.filename)
|
||||
if len(h) and h in media_hashes:
|
||||
continue
|
||||
|
||||
@@ -467,7 +467,11 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
return self.setup_complete_parser(basic_config, yaml_config, unused_args)
|
||||
|
||||
def check_for_updates(self):
|
||||
response = requests.get("https://pypi.org/pypi/auto-archiver/json").json()
|
||||
try:
|
||||
response = requests.get("https://pypi.org/pypi/auto-archiver/json", timeout=10).json()
|
||||
except Exception as e:
|
||||
logger.debug(f"Unable to check for updates: {e}")
|
||||
return
|
||||
latest_version = version.parse(response["info"]["version"])
|
||||
current_version = version.parse(__version__)
|
||||
# check version compared to current version
|
||||
|
||||
@@ -575,6 +575,8 @@ class GenericExtractor(Extractor):
|
||||
"--live-from-start" if self.live_from_start else "--no-live-from-start",
|
||||
"--postprocessor-args",
|
||||
"ffmpeg:-bitexact", # ensure bitexact output to avoid mismatching hashes for same video
|
||||
"--js-runtimes",
|
||||
"node", # yt-dlp defaults to deno-only; node is available in the base image
|
||||
]
|
||||
|
||||
# proxy handling
|
||||
|
||||
@@ -120,6 +120,9 @@ def ydl_entry_to_filename(ydl, entry: dict) -> str:
|
||||
directory = os.path.dirname(base_filename) # '/get/path/to'
|
||||
basename = os.path.basename(base_filename) # 'file'
|
||||
for f in os.listdir(directory):
|
||||
# skip incomplete downloads left behind by yt-dlp
|
||||
if f.endswith(".part"):
|
||||
continue
|
||||
if (
|
||||
f.startswith(basename)
|
||||
or (entry_url and os.path.splitext(f)[0] in entry_url)
|
||||
|
||||
@@ -86,6 +86,22 @@ def test_media_management(basic_metadata, media_file):
|
||||
assert basic_metadata.get_media_by_id("m1") == media1
|
||||
|
||||
|
||||
def test_remove_duplicate_skips_missing_files(basic_metadata, media_file, tmp_path):
|
||||
"""Missing files should be dropped instead of crashing with FileNotFoundError."""
|
||||
real_file = tmp_path / "exists.txt"
|
||||
real_file.write_text("content")
|
||||
valid = media_file(filename=str(real_file), hash_value="abc")
|
||||
missing = media_file(filename="/nonexistent/path/gone.mp4")
|
||||
|
||||
basic_metadata.add_media(valid, "valid")
|
||||
basic_metadata.add_media(missing, "missing")
|
||||
|
||||
assert len(basic_metadata.media) == 2
|
||||
basic_metadata.remove_duplicate_media_by_hash()
|
||||
assert len(basic_metadata.media) == 1
|
||||
assert basic_metadata.get_media_by_id("valid") == valid
|
||||
|
||||
|
||||
def test_success():
|
||||
m = Metadata()
|
||||
assert not m.is_success()
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import pytest
|
||||
from argparse import ArgumentParser, ArgumentTypeError
|
||||
from requests.exceptions import SSLError
|
||||
from auto_archiver.core.orchestrator import ArchivingOrchestrator
|
||||
from auto_archiver.version import __version__
|
||||
from auto_archiver.core.config import read_yaml, store_yaml
|
||||
@@ -256,3 +257,34 @@ def test_load_failed_extractor_cleanup(test_args, mocker, caplog):
|
||||
assert "Error during setup of modules: Test exception" in caplog.text
|
||||
# make sure the 'cleanup' is called
|
||||
assert "cleanup" in caplog.text
|
||||
|
||||
|
||||
def test_check_for_updates_ssl_error(orchestrator, mocker):
|
||||
"""check_for_updates should not raise when the HTTP request fails."""
|
||||
mocker.patch(
|
||||
"auto_archiver.core.orchestrator.requests.get",
|
||||
side_effect=SSLError("SSL handshake failed"),
|
||||
)
|
||||
# should not raise
|
||||
orchestrator.check_for_updates()
|
||||
|
||||
|
||||
def test_check_for_updates_timeout(orchestrator, mocker):
|
||||
"""check_for_updates should not raise on connection timeout."""
|
||||
from requests.exceptions import ConnectionError
|
||||
|
||||
mocker.patch(
|
||||
"auto_archiver.core.orchestrator.requests.get",
|
||||
side_effect=ConnectionError("Connection refused"),
|
||||
)
|
||||
orchestrator.check_for_updates()
|
||||
|
||||
|
||||
def test_check_for_updates_new_version_available(orchestrator, mocker):
|
||||
"""check_for_updates should not raise when a newer version exists."""
|
||||
mocker.patch(
|
||||
"auto_archiver.core.orchestrator.requests.get",
|
||||
return_value=mocker.Mock(json=lambda: {"info": {"version": "99.0.0"}}),
|
||||
)
|
||||
# should complete without error
|
||||
orchestrator.check_for_updates()
|
||||
|
||||
@@ -14,6 +14,7 @@ from auto_archiver.utils.misc import (
|
||||
calculate_file_hash,
|
||||
random_str,
|
||||
get_timestamp,
|
||||
ydl_entry_to_filename,
|
||||
)
|
||||
|
||||
|
||||
@@ -139,3 +140,47 @@ class TestMiscUtils:
|
||||
|
||||
def test_invalid_timestamp_returns_none(self):
|
||||
assert get_timestamp("invalid-date") is None
|
||||
|
||||
|
||||
class TestYdlEntryToFilename:
|
||||
"""Tests for ydl_entry_to_filename, especially .part file filtering."""
|
||||
|
||||
def _make_mock_ydl(self, prepared_filename):
|
||||
class MockYDL:
|
||||
def prepare_filename(self, entry):
|
||||
return prepared_filename
|
||||
|
||||
return MockYDL()
|
||||
|
||||
def test_returns_exact_file_if_exists(self, tmp_path):
|
||||
video = tmp_path / "video.mp4"
|
||||
video.write_bytes(b"data")
|
||||
ydl = self._make_mock_ydl(str(video))
|
||||
assert ydl_entry_to_filename(ydl, {}) == str(video)
|
||||
|
||||
def test_skips_part_file_returns_complete(self, tmp_path):
|
||||
"""Simulates yt-dlp leaving a .part file from a failed format
|
||||
while a complete .webm exists."""
|
||||
(tmp_path / "f5U3IKfoSYs.f399.mp4.part").write_bytes(b"incomplete")
|
||||
webm = tmp_path / "f5U3IKfoSYs.webm"
|
||||
webm.write_bytes(b"complete video")
|
||||
|
||||
# ydl.prepare_filename returns the expected .mp4 which doesn't exist
|
||||
ydl = self._make_mock_ydl(str(tmp_path / "f5U3IKfoSYs.mp4"))
|
||||
result = ydl_entry_to_filename(ydl, {})
|
||||
|
||||
assert result == str(webm)
|
||||
assert not result.endswith(".part")
|
||||
|
||||
def test_skips_part_file_returns_false_if_no_other_match(self, tmp_path):
|
||||
"""Only a .part file exists — should return False."""
|
||||
(tmp_path / "video.f399.mp4.part").write_bytes(b"incomplete")
|
||||
|
||||
ydl = self._make_mock_ydl(str(tmp_path / "video.mp4"))
|
||||
assert ydl_entry_to_filename(ydl, {}) is False
|
||||
|
||||
def test_returns_false_when_no_files_match(self, tmp_path):
|
||||
(tmp_path / "unrelated.txt").write_bytes(b"data")
|
||||
|
||||
ydl = self._make_mock_ydl(str(tmp_path / "video.mp4"))
|
||||
assert ydl_entry_to_filename(ydl, {}) is False
|
||||
|
||||
Reference in New Issue
Block a user