mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-10 04:08:28 +03:00
Compare commits
83 Commits
v1.1.1
...
feat-one-c
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ac4c09810b | ||
|
|
3194fee95d | ||
|
|
0040810e2e | ||
|
|
23a88e3cf4 | ||
|
|
3cac160cc1 | ||
|
|
e9a92272c5 | ||
|
|
5d6c5ac2b1 | ||
|
|
f1de07c9aa | ||
|
|
1e1e060a77 | ||
|
|
b43d229326 | ||
|
|
077b03fc61 | ||
|
|
cf77cfa64d | ||
|
|
bc66dd4f2a | ||
|
|
139d647197 | ||
|
|
f465b570cd | ||
|
|
52a7cabaf1 | ||
|
|
a739361e12 | ||
|
|
9a97fede43 | ||
|
|
2d13077fad | ||
|
|
8a4a314cf9 | ||
|
|
75e8b788ae | ||
|
|
defe2315bf | ||
|
|
b9ab26ed5a | ||
|
|
ba0dffdd5e | ||
|
|
a09927c507 | ||
|
|
6c938c489a | ||
|
|
0e39768da9 | ||
|
|
1e5d6ec4a6 | ||
|
|
3385d004cf | ||
|
|
7f27f7fce0 | ||
|
|
a6e3240af1 | ||
|
|
bf4c196cc2 | ||
|
|
c640cc898a | ||
|
|
3e2c0b564b | ||
|
|
5fd23baa55 | ||
|
|
8a450310c7 | ||
|
|
bef8a14089 | ||
|
|
cd0b093e7a | ||
|
|
096c9d09ef | ||
|
|
df3521e9ca | ||
|
|
a89d0193e4 | ||
|
|
536cbd905f | ||
|
|
a936921c4e | ||
|
|
68f672a4fa | ||
|
|
4ee0ad1cf8 | ||
|
|
bac809451c | ||
|
|
53dc9904ce | ||
|
|
c1f312d42a | ||
|
|
23c9dfe717 | ||
|
|
d02e7e0f02 | ||
|
|
56526a9ac7 | ||
|
|
3a22cc28c0 | ||
|
|
dbb3dfa04f | ||
|
|
01bdb35f5d | ||
|
|
43cbc6ac56 | ||
|
|
9c7cab1ae2 | ||
|
|
a9a0bae083 | ||
|
|
97d133ce79 | ||
|
|
432ee3dcfd | ||
|
|
94e0803fb3 | ||
|
|
794b4f6052 | ||
|
|
965d7d41dd | ||
|
|
e73faa70cc | ||
|
|
80beab9f23 | ||
|
|
200cea4e12 | ||
|
|
1256fde159 | ||
|
|
65e222e177 | ||
|
|
f2eb9ef784 | ||
|
|
2081c16555 | ||
|
|
d3efd7121c | ||
|
|
9d3cd5774b | ||
|
|
80d61e8b85 | ||
|
|
d36cdbfa87 | ||
|
|
c1506ee1cf | ||
|
|
3a34a49822 | ||
|
|
37c6d97275 | ||
|
|
7234eda85f | ||
|
|
a8c1ef3912 | ||
|
|
52ed8196a5 | ||
|
|
2051e8e491 | ||
|
|
21255db86a | ||
|
|
eae0da08b3 | ||
|
|
0d1447117c |
6
.github/workflows/docker-publish.yaml
vendored
6
.github/workflows/docker-publish.yaml
vendored
@@ -22,7 +22,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
@@ -33,14 +33,14 @@ jobs:
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772
|
||||
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_PASSWORD }}
|
||||
|
||||
- name: Extract metadata (tags, labels) for Docker
|
||||
id: meta
|
||||
uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804
|
||||
uses: docker/metadata-action@c299e40c65443455700f0fdfc63efafe5b349051
|
||||
with:
|
||||
images: bellingcat/auto-archiver
|
||||
|
||||
|
||||
4
.github/workflows/python-publish.yaml
vendored
4
.github/workflows/python-publish.yaml
vendored
@@ -22,10 +22,10 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout Repository
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
python-version-file: pyproject.toml
|
||||
|
||||
|
||||
6
.github/workflows/ruff.yaml
vendored
6
.github/workflows/ruff.yaml
vendored
@@ -20,11 +20,11 @@ jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v6
|
||||
- name: Install Python
|
||||
uses: actions/setup-python@v5
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
python-version: "3.11"
|
||||
python-version: "3.12"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
|
||||
6
.github/workflows/tests-core.yaml
vendored
6
.github/workflows/tests-core.yaml
vendored
@@ -26,13 +26,13 @@ jobs:
|
||||
working-directory: ./
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Install ffmpeg
|
||||
run: sudo apt-get update && sudo apt-get install -y ffmpeg
|
||||
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v5
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
|
||||
@@ -40,7 +40,7 @@ jobs:
|
||||
run: pipx install poetry
|
||||
|
||||
- name: Cache Poetry and pip artifacts
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
~/.cache/pypoetry
|
||||
|
||||
29
.github/workflows/tests-deploy.yaml
vendored
Normal file
29
.github/workflows/tests-deploy.yaml
vendored
Normal file
@@ -0,0 +1,29 @@
|
||||
name: Deploy Tests
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
paths:
|
||||
- deploy/**
|
||||
pull_request:
|
||||
paths:
|
||||
- deploy/**
|
||||
|
||||
jobs:
|
||||
tests:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Set up Python 3.12
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
python-version: "3.12"
|
||||
|
||||
- name: Install dependencies
|
||||
run: pip install pytest fastapi httpx python-multipart pyyaml
|
||||
|
||||
- name: Run Deploy Tests
|
||||
working-directory: deploy
|
||||
run: python -m pytest tests/ -v
|
||||
6
.github/workflows/tests-download.yaml
vendored
6
.github/workflows/tests-download.yaml
vendored
@@ -20,13 +20,13 @@ jobs:
|
||||
working-directory: ./
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Install ffmpeg
|
||||
run: sudo apt-get update && sudo apt-get install -y ffmpeg
|
||||
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v5
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
|
||||
@@ -34,7 +34,7 @@ jobs:
|
||||
run: pipx install poetry
|
||||
|
||||
- name: Cache Poetry and pip artifacts
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
~/.cache/pypoetry
|
||||
|
||||
12
Dockerfile
12
Dockerfile
@@ -1,4 +1,4 @@
|
||||
FROM webrecorder/browsertrix-crawler:1.6.3 AS base
|
||||
FROM webrecorder/browsertrix-crawler:1.11.4 AS base
|
||||
|
||||
ENV RUNNING_IN_DOCKER=1 \
|
||||
LANG=C.UTF-8 \
|
||||
@@ -41,11 +41,21 @@ COPY ./src/ .
|
||||
RUN /poetry-venv/bin/poetry install --only main --no-cache
|
||||
|
||||
|
||||
# Run as non-root user to avoid permission issues with mounted volumes (see #342)
|
||||
# The base image already has an 'ubuntu' user at UID/GID 1000.
|
||||
# Ensure directories that need write access at runtime are writable.
|
||||
RUN chown 1000:1000 /app && \
|
||||
chown -R 1000:1000 /app/.venv/lib/python3.12/site-packages/seleniumbase/drivers/ && \
|
||||
mkdir -p /app/local_archive /app/secrets /tmp/archive && \
|
||||
chown -R 1000:1000 /app/local_archive /app/secrets /tmp/archive
|
||||
|
||||
# Update PATH to include virtual environment binaries
|
||||
# Allowing entry point to run the application directly with Python
|
||||
ENV VIRTUAL_ENV=/app/.venv \
|
||||
PATH="/app/.venv/bin:$PATH"
|
||||
|
||||
USER 1000
|
||||
|
||||
ENTRYPOINT ["python3", "-m", "auto_archiver"]
|
||||
|
||||
# should be executed with 2 volumes (3 if local_storage is used)
|
||||
|
||||
35
README.md
35
README.md
@@ -22,7 +22,40 @@ Auto Archiver is a Python tool to automatically archive content on the web in a
|
||||
Read the [article about Auto Archiver on bellingcat.com](https://www.bellingcat.com/resources/2022/09/22/preserve-vital-online-content-with-bellingcats-auto-archiver-tool/).
|
||||
|
||||
|
||||
## Installation
|
||||
## One-Click Cloud Deploy
|
||||
|
||||
Deploy your own Auto Archiver instance to the cloud — no coding required:
|
||||
|
||||
[![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/new/template?template=https://github.com/bellingcat/auto-archiver&envs=AUTH_PASSWORD,GSHEET_URL,GOOGLE_SERVICE_ACCOUNT_JSON,POLL_INTERVAL,S3_BUCKET,S3_KEY,S3_SECRET,S3_REGION,TELEGRAM_API_ID,TELEGRAM_API_HASH,TELEGRAM_BOT_TOKEN,ENABLE_SCREENSHOTS,LOG_LEVEL&optionalEnvs=GSHEET_URL,GOOGLE_SERVICE_ACCOUNT_JSON,POLL_INTERVAL,S3_BUCKET,S3_KEY,S3_SECRET,S3_REGION,TELEGRAM_API_ID,TELEGRAM_API_HASH,TELEGRAM_BOT_TOKEN,ENABLE_SCREENSHOTS,LOG_LEVEL&AUTH_PASSWORDDesc=Password+to+access+your+archiver+web+interface&GSHEET_URLDesc=Google+Sheet+URL+to+monitor+for+new+URLs+(leave+empty+to+disable)&POLL_INTERVALDesc=Seconds+between+Google+Sheet+checks+(min+60)&POLL_INTERVALDefault=300&S3_BUCKETDesc=S3+bucket+name+for+storage+(leave+empty+for+local+only)&S3_REGIONDefault=us-east-1&LOG_LEVELDefault=INFO)
|
||||
|
||||
**What you get:** A web interface where you can paste URLs and archive them instantly. Optionally connect a Google Sheet for automated monitoring, S3 for cloud storage, and Telegram for archiving channels.
|
||||
|
||||
**Only required setting:** `AUTH_PASSWORD` — everything else is optional and can be configured later via the Railway dashboard.
|
||||
|
||||
<details>
|
||||
<summary>📋 Environment variables reference</summary>
|
||||
|
||||
| Variable | Required | Description |
|
||||
|----------|----------|-------------|
|
||||
| `AUTH_PASSWORD` | **Yes** | Password to access the web interface |
|
||||
| `GSHEET_URL` | No | Google Sheet URL to monitor for new URLs ([use this template](https://docs.google.com/spreadsheets/d/1NJZo_XZUBKTI1Ghlgi4nTPVvCfb0HXAs6j5tNGas72k/edit?gid=0#gid=0)) |
|
||||
| `GOOGLE_SERVICE_ACCOUNT_JSON` | No | Google service account JSON; required when `GSHEET_URL` is set ([follow these instructions](https://auto-archiver.readthedocs.io/en/v1.0.1/how_to/gsheets_setup.html)) |
|
||||
| `POLL_INTERVAL` | No | Seconds between Sheet checks (default: 300) |
|
||||
| `S3_BUCKET` | No | S3 bucket name for archived content. Optional, but recommended if you want your archives hosted in the cloud; any S3-compatible storage works |
|
||||
| `S3_KEY` / `S3_SECRET` | No | S3 credentials |
|
||||
| `S3_REGION` | No | S3 region (default: us-east-1) |
|
||||
| `S3_ENDPOINT` | No | S3 endpoint URL |
|
||||
| `TELEGRAM_API_ID` / `TELEGRAM_API_HASH` | No | Telegram API credentials |
|
||||
| `TELEGRAM_BOT_TOKEN` | No | Telegram bot token |
|
||||
| `ENABLE_SCREENSHOTS` | No | Set to `true` for full-page screenshots |
|
||||
| `ENABLE_THUMBNAILS` | No | Set to `true` for video thumbnails |
|
||||
| `ENABLE_CSV_DB` | No | Set to `true` for CSV logging |
|
||||
| `LOG_LEVEL` | No | DEBUG, INFO, WARNING, ERROR (default: INFO) |
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
## Traditional Installation
|
||||
|
||||
View the [Installation Guide](https://auto-archiver.readthedocs.io/en/latest/installation/installation.html) for full instructions
|
||||
|
||||
|
||||
34
deploy/Dockerfile
Normal file
34
deploy/Dockerfile
Normal file
@@ -0,0 +1,34 @@
|
||||
# ── Cloud Deploy ──────────────────────────────────────────────────────
|
||||
# Thin web UI + config generator layer on top of the published
|
||||
# auto-archiver Docker image. Used by the Railway one-click deploy.
|
||||
#
|
||||
# Build:
|
||||
# docker build -f deploy/Dockerfile -t auto-archiver-deploy .
|
||||
#
|
||||
# Run:
|
||||
# docker run -p 8080:8080 -e PORT=8080 -e AUTH_PASSWORD=secret auto-archiver-deploy
|
||||
# ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
FROM bellingcat/auto-archiver:latest
|
||||
|
||||
USER root
|
||||
|
||||
# Install the lightweight web layer dependencies
|
||||
RUN pip install --no-cache-dir fastapi uvicorn[standard] python-multipart pyyaml
|
||||
|
||||
# Copy deploy scripts into the image
|
||||
COPY deploy/ /app/deploy/
|
||||
|
||||
# Ensure writable dirs exist
|
||||
RUN mkdir -p /app/local_archive /app/secrets && \
|
||||
chown -R 1000:1000 /app/local_archive /app/secrets /app/deploy
|
||||
|
||||
USER 1000
|
||||
|
||||
# Railway sets PORT; default to 8080
|
||||
ENV PORT=8080
|
||||
|
||||
EXPOSE ${PORT}
|
||||
|
||||
# Override the CLI entrypoint with the web server
|
||||
ENTRYPOINT ["python3", "-m", "deploy.start"]
|
||||
1
deploy/__init__.py
Normal file
1
deploy/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Cloud deployment layer for auto-archiver
|
||||
163
deploy/generate_config.py
Normal file
163
deploy/generate_config.py
Normal file
@@ -0,0 +1,163 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Generates orchestration.yaml from environment variables.
|
||||
|
||||
This script bridges Railway's env-var-based configuration with
|
||||
auto-archiver's YAML-based configuration system. It runs at container
|
||||
startup before the web UI server starts.
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
|
||||
CONFIG_PATH = Path("/app/secrets/orchestration.yaml")
|
||||
SECRETS_DIR = Path("/app/secrets")
|
||||
|
||||
|
||||
def build_config() -> dict:
    """Build an orchestration config dict from environment variables.

    The base pipeline (CLI feeder, generic extractor, hash enricher,
    console DB, local storage, HTML formatter) is always present.
    Optional modules are appended to ``config["steps"]`` and given their
    own section only when the matching environment variables are set.

    An environment variable that is set but blank (empty or whitespace
    only) is treated exactly like an unset one, so a blank value never
    overrides a default. Values are stripped of surrounding whitespace.

    Side effect: when ``GOOGLE_SERVICE_ACCOUNT_JSON`` is provided, its
    content is written to ``SECRETS_DIR / "service_account.json"``.

    Returns:
        The configuration dictionary, ready to be dumped as YAML.
    """

    def env_value(name: str, default: str = "") -> str:
        """Return env var *name* stripped, or *default* if unset or blank."""
        return os.environ.get(name, "").strip() or default

    def env_flag(name: str) -> bool:
        """Return True only when env var *name* is ``true`` (any case)."""
        return env_value(name).lower() == "true"

    # -- Base config: always present ------------------------------------
    config = {
        "steps": {
            "feeders": ["cli_feeder"],
            "extractors": ["generic_extractor"],
            "enrichers": ["hash_enricher"],
            "databases": ["console_db"],
            "storages": ["local_storage"],
            "formatters": ["html_formatter"],
        },
        "logging": {
            "level": env_value("LOG_LEVEL", "INFO"),
        },
        "local_storage": {
            "save_to": "/app/local_archive",
            "path_generator": "flat",
            "filename_generator": "static",
        },
        "generic_extractor": {
            "subtitles": env_flag("SUBTITLES"),
            "comments": False,
            "livestreams": False,
            "live_from_start": False,
            "end_means_success": True,
            "allow_playlist": False,
        },
        "hash_enricher": {
            "algorithm": "SHA-256",
        },
        "html_formatter": {
            "detect_thumbnails": True,
        },
        "authentication": {},
    }

    # -- Google Sheets feeder (optional) --------------------------------
    gsheet_url = env_value("GSHEET_URL")
    if gsheet_url:
        config["steps"]["feeders"].append("gsheet_feeder")
        config["steps"]["databases"].append("gsheet_db")
        config["gsheet_feeder"] = {
            "sheet": gsheet_url,
            "header": 1,
            "service_account": str(SECRETS_DIR / "service_account.json"),
            "use_sheet_names_in_stored_paths": False,
            "columns": {
                "url": "link",
                "status": "archive status",
                "folder": "destination folder",
                "archive": "archive location",
                "date": "archive date",
                "thumbnail": "thumbnail",
                "timestamp": "upload timestamp",
                "title": "upload title",
                "text": "textual content",
                "screenshot": "screenshot",
                "hash": "hash",
                "pdq_hash": "perceptual hashes",
            },
        }

    # -- Google service account JSON (optional) -------------------------
    sa_json = env_value("GOOGLE_SERVICE_ACCOUNT_JSON")
    if sa_json:
        SECRETS_DIR.mkdir(parents=True, exist_ok=True)
        sa_path = SECRETS_DIR / "service_account.json"
        # The file holds credentials: create it readable by the owner only.
        fd = os.open(sa_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w", encoding="utf-8") as sa_file:
            sa_file.write(sa_json)
        print(f"[deploy] Wrote Google service account to {sa_path}")
    elif gsheet_url:
        # The feeder stays enabled (credentials may be provided some other
        # way), but make the likely misconfiguration visible in the logs.
        print(
            "[deploy] WARNING: GSHEET_URL is set but GOOGLE_SERVICE_ACCOUNT_JSON is empty; "
            f"gsheet_feeder expects credentials at {SECRETS_DIR / 'service_account.json'}"
        )

    # -- S3 storage (optional) ------------------------------------------
    s3_bucket = env_value("S3_BUCKET")
    if s3_bucket:
        config["steps"]["storages"].append("s3_storage")
        config["s3_storage"] = {
            "bucket": s3_bucket,
            "region": env_value("S3_REGION", "us-east-1"),
            "key": env_value("S3_KEY"),
            "secret": env_value("S3_SECRET"),
            # The {placeholders} are passed through literally, not formatted here.
            "endpoint_url": env_value("S3_ENDPOINT", "https://s3.{region}.amazonaws.com"),
            "cdn_url": env_value(
                "S3_CDN_URL",
                "https://{bucket}.s3.{region}.amazonaws.com/{key}",
            ),
            "private": env_flag("S3_PRIVATE"),
            "random_no_duplicate": True,
            "key_path": "random",
        }

    # -- Telegram extractor (optional) ----------------------------------
    # Both the API id and the API hash are needed to enable the extractor.
    tg_api_id = env_value("TELEGRAM_API_ID")
    tg_api_hash = env_value("TELEGRAM_API_HASH")
    if tg_api_id and tg_api_hash:
        config["steps"]["extractors"].append("telegram_extractor")
        config["telegram_extractor"] = {
            "api_id": tg_api_id,
            "api_hash": tg_api_hash,
        }
        bot_token = env_value("TELEGRAM_BOT_TOKEN")
        if bot_token:
            config["telegram_extractor"]["bot_token"] = bot_token

    # -- Screenshot enricher (optional) ---------------------------------
    if env_flag("ENABLE_SCREENSHOTS"):
        config["steps"]["enrichers"].append("screenshot_enricher")
        config["screenshot_enricher"] = {
            "width": 1280,
            "height": 7200,
            "save_to_pdf": True,
        }

    # -- Thumbnail enricher (optional) ----------------------------------
    if env_flag("ENABLE_THUMBNAILS"):
        config["steps"]["enrichers"].append("thumbnail_enricher")
        config["thumbnail_enricher"] = {
            "thumbnails_per_minute": 60,
            "max_thumbnails": 16,
        }

    # -- CSV database (optional) ----------------------------------------
    if env_flag("ENABLE_CSV_DB"):
        config["steps"]["databases"].append("csv_db")
        config["csv_db"] = {
            "csv_file": "/app/local_archive/db.csv",
        }

    return config
|
||||
|
||||
|
||||
def main():
    """Generate the orchestration config and write it to ``CONFIG_PATH``.

    Builds the config from the environment with ``build_config()``,
    creates the parent directory if needed, dumps the config as YAML
    (block style, insertion order preserved) and prints a short summary.
    """
    config = build_config()

    CONFIG_PATH.parent.mkdir(parents=True, exist_ok=True)
    # The config can embed S3 and Telegram credentials, so a newly created
    # file is made readable by the owner only.
    fd = os.open(CONFIG_PATH, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w", encoding="utf-8") as f:
        yaml.dump(config, f, default_flow_style=False, sort_keys=False)

    print(f"[deploy] Generated config at {CONFIG_PATH}")
    print(f"[deploy] Active steps: {config['steps']}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
71
deploy/gsheet_poller.py
Normal file
71
deploy/gsheet_poller.py
Normal file
@@ -0,0 +1,71 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Background Google Sheets poller for auto-archiver cloud deployments.
|
||||
|
||||
When GSHEET_URL is set, periodically runs auto-archiver with gsheet_feeder
|
||||
to check for new URLs in the configured spreadsheet. Runs as a daemon thread
|
||||
alongside the web UI.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import threading
|
||||
import time
|
||||
|
||||
logger = logging.getLogger("gsheet_poller")
|
||||
|
||||
CONFIG_PATH = "/app/secrets/orchestration.yaml"
|
||||
|
||||
|
||||
def _poll_once():
    """Run a single auto-archiver pass against the generated config.

    The archiver is launched as a child process from ``/app`` with a
    10 minute timeout. Its output is captured; on a non-zero exit the last
    500 characters of stderr are logged. Failures are logged, never raised.
    """
    logger.info("Polling Google Sheet for new URLs...")
    command = ["python3", "-m", "auto_archiver", "--config", CONFIG_PATH]
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            cwd="/app",
            timeout=600,  # 10 minute timeout per poll
        )
        if completed.returncode != 0:
            logger.warning(
                "Sheet poll exited with code %d: %s",
                completed.returncode,
                completed.stderr[-500:],
            )
        else:
            logger.info("Sheet poll completed successfully.")
    except subprocess.TimeoutExpired:
        logger.error("Sheet poll timed out after 600s")
    except Exception:
        # Log with traceback so one failed poll is not propagated to the caller.
        logger.exception("Sheet poll failed")
|
||||
|
||||
|
||||
def _poll_loop(interval: int):
    """Poll forever, sleeping ``interval`` seconds after each poll.

    Never returns; intended to be used as a thread target.
    """
    logger.info("Google Sheets poller started (interval=%ds)", interval)
    pause_seconds = interval
    while True:
        _poll_once()
        time.sleep(pause_seconds)
|
||||
|
||||
|
||||
def start_poller():
    """Start the Google Sheets poller as a daemon thread if GSHEET_URL is set.

    Call this once at application startup. Does nothing (apart from a log
    line) when ``GSHEET_URL`` is unset or blank.

    The polling period comes from ``POLL_INTERVAL`` (seconds). A missing,
    blank or non-numeric value falls back to 300 instead of raising, and
    anything below 60 is raised to 60.
    """
    gsheet_url = os.environ.get("GSHEET_URL", "").strip()
    if not gsheet_url:
        logger.info("GSHEET_URL not set – Sheet poller disabled.")
        return

    default_interval = 300
    raw_interval = os.environ.get("POLL_INTERVAL", "").strip()
    try:
        interval = int(raw_interval) if raw_interval else default_interval
    except ValueError:
        # A bad value must not abort startup: this runs before the web server starts.
        logger.warning(
            "Invalid POLL_INTERVAL %r; falling back to %d seconds.",
            raw_interval,
            default_interval,
        )
        interval = default_interval
    interval = max(interval, 60)  # minimum 1 minute

    thread = threading.Thread(
        target=_poll_loop,
        args=(interval,),
        daemon=True,
        name="gsheet-poller",
    )
    thread.start()
    logger.info("Google Sheets poller thread started.")
|
||||
2
deploy/pytest.ini
Normal file
2
deploy/pytest.ini
Normal file
@@ -0,0 +1,2 @@
|
||||
[pytest]
|
||||
testpaths = tests
|
||||
37
deploy/start.py
Normal file
37
deploy/start.py
Normal file
@@ -0,0 +1,37 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Startup entrypoint for cloud deployments.
|
||||
|
||||
1. Generates orchestration.yaml from environment variables
|
||||
2. Starts the Google Sheets poller (if GSHEET_URL is set)
|
||||
3. Starts the FastAPI web UI
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
|
||||
# Logging is configured before the deploy modules are imported, which is
# why the imports below sit mid-file and carry E402 suppressions.
LOG_FORMAT = "%(asctime)s [%(name)s] %(levelname)s: %(message)s"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

# Step 1: write orchestration.yaml from the environment.
from deploy.generate_config import main as generate_config  # noqa: E402

generate_config()

# Step 2: launch the background sheet poller (returns immediately when
# GSHEET_URL is not set).
from deploy.gsheet_poller import start_poller  # noqa: E402

start_poller()

# Step 3: serve the web UI on all interfaces, on $PORT (default 8080).
import uvicorn  # noqa: E402

port = int(os.environ.get("PORT", "8080"))
uvicorn.run("deploy.web_ui:app", host="0.0.0.0", port=port, log_level="info")
|
||||
0
deploy/tests/__init__.py
Normal file
0
deploy/tests/__init__.py
Normal file
354
deploy/tests/test_generate_config.py
Normal file
354
deploy/tests/test_generate_config.py
Normal file
@@ -0,0 +1,354 @@
|
||||
"""Tests for deploy/generate_config.py – config generation from env vars."""
|
||||
|
||||
import json
|
||||
import os
|
||||
from unittest.mock import patch
|
||||
|
||||
import yaml
|
||||
|
||||
from deploy.generate_config import build_config, main
|
||||
|
||||
|
||||
# ── Helpers ───────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _env(**overrides):
    """Return a copy of the host environment without deploy variables.

    Every variable read by the config generator is removed so host
    settings cannot leak into a test, then ``overrides`` are applied.
    """
    deploy_vars = (
        "LOG_LEVEL",
        "SUBTITLES",
        "GSHEET_URL",
        "GOOGLE_SERVICE_ACCOUNT_JSON",
        "S3_BUCKET",
        "S3_KEY",
        "S3_SECRET",
        "S3_REGION",
        "S3_ENDPOINT",
        "S3_CDN_URL",
        "S3_PRIVATE",
        "TELEGRAM_API_ID",
        "TELEGRAM_API_HASH",
        "TELEGRAM_BOT_TOKEN",
        "ENABLE_SCREENSHOTS",
        "ENABLE_THUMBNAILS",
        "ENABLE_CSV_DB",
    )
    snapshot = dict(os.environ)
    for name in deploy_vars:
        snapshot.pop(name, None)
    snapshot.update(overrides)
    return snapshot
|
||||
|
||||
|
||||
# ── Base config (no optional env vars) ────────────────────────────────
|
||||
|
||||
|
||||
class TestBaseConfig:
    """Checks the config built when no optional module is enabled."""

    @staticmethod
    def _build(**overrides):
        """Run build_config() with only ``overrides`` as deploy variables."""
        with patch.dict(os.environ, _env(**overrides), clear=True):
            return build_config()

    def test_base_steps(self):
        expected = {
            "feeders": ["cli_feeder"],
            "extractors": ["generic_extractor"],
            "enrichers": ["hash_enricher"],
            "databases": ["console_db"],
            "storages": ["local_storage"],
            "formatters": ["html_formatter"],
        }
        steps = self._build()["steps"]
        for step_type, modules in expected.items():
            assert steps[step_type] == modules

    def test_base_has_required_module_configs(self):
        cfg = self._build()
        for section in ("local_storage", "generic_extractor", "hash_enricher", "html_formatter"):
            assert section in cfg

    def test_default_log_level_is_info(self):
        assert self._build()["logging"]["level"] == "INFO"

    def test_custom_log_level(self):
        assert self._build(LOG_LEVEL="DEBUG")["logging"]["level"] == "DEBUG"

    def test_authentication_present_and_empty(self):
        assert self._build()["authentication"] == {}

    def test_local_storage_defaults(self):
        local = self._build()["local_storage"]
        assert local["save_to"] == "/app/local_archive"
        assert local["path_generator"] == "flat"
        assert local["filename_generator"] == "static"

    def test_subtitles_default_false(self):
        assert self._build()["generic_extractor"]["subtitles"] is False

    def test_subtitles_enabled(self):
        assert self._build(SUBTITLES="true")["generic_extractor"]["subtitles"] is True

    def test_subtitles_case_insensitive(self):
        assert self._build(SUBTITLES="True")["generic_extractor"]["subtitles"] is True

    def test_no_optional_modules_present(self):
        """Optional module sections must be absent when their env vars are."""
        cfg = self._build()
        optional_sections = (
            "gsheet_feeder",
            "s3_storage",
            "telegram_extractor",
            "screenshot_enricher",
            "thumbnail_enricher",
            "csv_db",
        )
        for section in optional_sections:
            assert section not in cfg

    def test_config_is_valid_yaml(self):
        """Dumping to YAML and loading it back must give an equal dict."""
        cfg = self._build()
        round_tripped = yaml.safe_load(yaml.dump(cfg))
        assert round_tripped == cfg
|
||||
|
||||
|
||||
# ── Google Sheets ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestGSheetConfig:
    """Checks the Google Sheets feeder/database wiring and credentials file."""

    @staticmethod
    def _build(**overrides):
        """Run build_config() with only ``overrides`` as deploy variables."""
        with patch.dict(os.environ, _env(**overrides), clear=True):
            return build_config()

    def test_gsheet_adds_feeder_and_db(self):
        steps = self._build(GSHEET_URL="https://docs.google.com/spreadsheets/d/abc")["steps"]
        assert "gsheet_feeder" in steps["feeders"]
        assert "gsheet_db" in steps["databases"]

    def test_gsheet_feeder_config(self):
        url = "https://docs.google.com/spreadsheets/d/abc123"
        feeder = self._build(GSHEET_URL=url)["gsheet_feeder"]
        assert feeder["sheet"] == url
        assert feeder["header"] == 1
        assert "service_account" in feeder
        columns = feeder["columns"]
        assert columns["url"] == "link"
        assert columns["status"] == "archive status"

    def test_gsheet_preserves_cli_feeder(self):
        """Adding the sheet feeder must not drop cli_feeder."""
        feeders = self._build(GSHEET_URL="https://example.com/sheet")["steps"]["feeders"]
        assert "cli_feeder" in feeders

    def test_service_account_json_written(self, tmp_path):
        """GOOGLE_SERVICE_ACCOUNT_JSON is written to service_account.json."""
        payload = json.dumps({"type": "service_account", "project_id": "test"})
        secrets_dir = tmp_path / "secrets"
        patched_env = patch.dict(os.environ, _env(GOOGLE_SERVICE_ACCOUNT_JSON=payload), clear=True)
        # Redirect the secrets directory so nothing is written outside tmp_path.
        patched_dir = patch("deploy.generate_config.SECRETS_DIR", secrets_dir)
        with patched_env, patched_dir:
            build_config()
        written = secrets_dir / "service_account.json"
        assert written.exists()
        assert json.loads(written.read_text())["project_id"] == "test"
|
||||
|
||||
|
||||
# ── S3 storage ────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestS3Config:
    """Checks the s3_storage section and its defaults."""

    @staticmethod
    def _build(**overrides):
        """Run build_config() with only ``overrides`` as deploy variables."""
        with patch.dict(os.environ, _env(**overrides), clear=True):
            return build_config()

    def test_s3_adds_storage(self):
        storages = self._build(S3_BUCKET="my-bucket")["steps"]["storages"]
        assert "s3_storage" in storages
        assert "local_storage" in storages  # local storage is kept alongside S3

    def test_s3_config_values(self):
        s3 = self._build(
            S3_BUCKET="my-bucket",
            S3_KEY="AKID",
            S3_SECRET="shhh",
            S3_REGION="eu-west-1",
        )["s3_storage"]
        assert s3["bucket"] == "my-bucket"
        assert s3["key"] == "AKID"
        assert s3["secret"] == "shhh"
        assert s3["region"] == "eu-west-1"
        assert s3["private"] is False
        assert s3["random_no_duplicate"] is True

    def test_s3_defaults(self):
        s3 = self._build(S3_BUCKET="b")["s3_storage"]
        assert s3["region"] == "us-east-1"
        assert "{region}" in s3["endpoint_url"]

    def test_s3_private_flag(self):
        cfg = self._build(S3_BUCKET="b", S3_PRIVATE="true")
        assert cfg["s3_storage"]["private"] is True

    def test_s3_custom_endpoint(self):
        endpoint = "https://nyc3.digitaloceanspaces.com"
        cfg = self._build(S3_BUCKET="b", S3_ENDPOINT=endpoint)
        assert cfg["s3_storage"]["endpoint_url"] == endpoint
|
||||
|
||||
|
||||
# ── Telegram ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestTelegramConfig:
    """Checks that the Telegram extractor needs both API id and API hash."""

    @staticmethod
    def _build(**overrides):
        """Run build_config() with only ``overrides`` as deploy variables."""
        with patch.dict(os.environ, _env(**overrides), clear=True):
            return build_config()

    def test_telegram_added_when_both_set(self):
        cfg = self._build(TELEGRAM_API_ID="12345", TELEGRAM_API_HASH="abc")
        assert "telegram_extractor" in cfg["steps"]["extractors"]
        telegram = cfg["telegram_extractor"]
        assert telegram["api_id"] == "12345"
        assert telegram["api_hash"] == "abc"

    def test_telegram_not_added_if_only_id(self):
        extractors = self._build(TELEGRAM_API_ID="12345")["steps"]["extractors"]
        assert "telegram_extractor" not in extractors

    def test_telegram_not_added_if_only_hash(self):
        extractors = self._build(TELEGRAM_API_HASH="abc")["steps"]["extractors"]
        assert "telegram_extractor" not in extractors

    def test_telegram_bot_token_optional(self):
        cfg = self._build(
            TELEGRAM_API_ID="12345",
            TELEGRAM_API_HASH="abc",
            TELEGRAM_BOT_TOKEN="bot:tok",
        )
        assert cfg["telegram_extractor"]["bot_token"] == "bot:tok"

    def test_telegram_no_bot_token(self):
        cfg = self._build(TELEGRAM_API_ID="12345", TELEGRAM_API_HASH="abc")
        assert "bot_token" not in cfg["telegram_extractor"]
|
||||
|
||||
|
||||
# ── Optional enrichers / databases ────────────────────────────────────
|
||||
|
||||
|
||||
class TestOptionalModules:
    """Checks the ENABLE_* switches for optional enrichers and databases."""

    @staticmethod
    def _build(**overrides):
        """Run build_config() with only ``overrides`` as deploy variables."""
        with patch.dict(os.environ, _env(**overrides), clear=True):
            return build_config()

    def test_screenshots_disabled_by_default(self):
        enrichers = self._build()["steps"]["enrichers"]
        assert "screenshot_enricher" not in enrichers

    def test_screenshots_enabled(self):
        cfg = self._build(ENABLE_SCREENSHOTS="true")
        assert "screenshot_enricher" in cfg["steps"]["enrichers"]
        assert cfg["screenshot_enricher"]["width"] == 1280

    def test_thumbnails_enabled(self):
        cfg = self._build(ENABLE_THUMBNAILS="true")
        assert "thumbnail_enricher" in cfg["steps"]["enrichers"]
        assert cfg["thumbnail_enricher"]["max_thumbnails"] == 16

    def test_csv_db_enabled(self):
        cfg = self._build(ENABLE_CSV_DB="true")
        assert "csv_db" in cfg["steps"]["databases"]
        assert cfg["csv_db"]["csv_file"] == "/app/local_archive/db.csv"

    def test_case_insensitive_boolean(self):
        enrichers = self._build(ENABLE_SCREENSHOTS="TRUE")["steps"]["enrichers"]
        assert "screenshot_enricher" in enrichers
|
||||
|
||||
|
||||
# ── Combined / full config ────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestCombinedConfig:
|
||||
def test_all_optional_modules_together(self):
|
||||
"""Enable everything at once and verify no conflicts."""
|
||||
env = _env(
|
||||
GSHEET_URL="https://example.com/sheet",
|
||||
S3_BUCKET="bucket",
|
||||
S3_KEY="key",
|
||||
S3_SECRET="secret",
|
||||
TELEGRAM_API_ID="123",
|
||||
TELEGRAM_API_HASH="abc",
|
||||
TELEGRAM_BOT_TOKEN="tok",
|
||||
ENABLE_SCREENSHOTS="true",
|
||||
ENABLE_THUMBNAILS="true",
|
||||
ENABLE_CSV_DB="true",
|
||||
)
|
||||
with patch.dict(os.environ, env, clear=True):
|
||||
cfg = build_config()
|
||||
|
||||
steps = cfg["steps"]
|
||||
assert "gsheet_feeder" in steps["feeders"]
|
||||
assert "telegram_extractor" in steps["extractors"]
|
||||
assert "screenshot_enricher" in steps["enrichers"]
|
||||
assert "thumbnail_enricher" in steps["enrichers"]
|
||||
assert "csv_db" in steps["databases"]
|
||||
assert "gsheet_db" in steps["databases"]
|
||||
assert "s3_storage" in steps["storages"]
|
||||
assert "local_storage" in steps["storages"]
|
||||
|
||||
# All module configs present
|
||||
for key in [
|
||||
"gsheet_feeder",
|
||||
"s3_storage",
|
||||
"telegram_extractor",
|
||||
"screenshot_enricher",
|
||||
"thumbnail_enricher",
|
||||
"csv_db",
|
||||
]:
|
||||
assert key in cfg, f"{key} config missing"
|
||||
|
||||
def test_full_config_valid_yaml(self):
|
||||
env = _env(
|
||||
GSHEET_URL="https://example.com/sheet",
|
||||
S3_BUCKET="bucket",
|
||||
TELEGRAM_API_ID="123",
|
||||
TELEGRAM_API_HASH="abc",
|
||||
ENABLE_SCREENSHOTS="true",
|
||||
ENABLE_CSV_DB="true",
|
||||
)
|
||||
with patch.dict(os.environ, env, clear=True):
|
||||
cfg = build_config()
|
||||
dumped = yaml.dump(cfg)
|
||||
reloaded = yaml.safe_load(dumped)
|
||||
assert reloaded == cfg
|
||||
|
||||
|
||||
# ── main() writes file ───────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestMainFunction:
|
||||
def test_main_writes_config_file(self, tmp_path):
|
||||
config_path = tmp_path / "orchestration.yaml"
|
||||
with patch.dict(os.environ, _env(), clear=True), patch("deploy.generate_config.CONFIG_PATH", config_path):
|
||||
main()
|
||||
assert config_path.exists()
|
||||
cfg = yaml.safe_load(config_path.read_text())
|
||||
assert cfg["steps"]["feeders"] == ["cli_feeder"]
|
||||
|
||||
def test_main_creates_parent_dirs(self, tmp_path):
|
||||
config_path = tmp_path / "nested" / "dir" / "orchestration.yaml"
|
||||
with patch.dict(os.environ, _env(), clear=True), patch("deploy.generate_config.CONFIG_PATH", config_path):
|
||||
main()
|
||||
assert config_path.exists()
|
||||
124
deploy/tests/test_gsheet_poller.py
Normal file
124
deploy/tests/test_gsheet_poller.py
Normal file
@@ -0,0 +1,124 @@
|
||||
"""Tests for deploy/gsheet_poller.py – background Google Sheets polling."""
|
||||
|
||||
import os
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
|
||||
from deploy.gsheet_poller import start_poller, _poll_once
|
||||
|
||||
|
||||
# ── start_poller ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestStartPoller:
    """Tests for start_poller(): thread creation and poll-interval handling.

    ``threading.Thread`` is patched in every test, so no real thread is ever
    started; the tests inspect the arguments the thread would be created with.
    """

    def test_disabled_when_no_gsheet_url(self):
        """No thread should be started when GSHEET_URL is empty."""
        with (
            patch.dict(os.environ, {"GSHEET_URL": ""}, clear=False),
            patch("deploy.gsheet_poller.threading.Thread") as mock_thread,
        ):
            start_poller()
            mock_thread.assert_not_called()

    def test_disabled_when_gsheet_url_absent(self):
        """No thread should be started when GSHEET_URL is not set at all."""
        env = {k: v for k, v in os.environ.items() if k != "GSHEET_URL"}
        with patch.dict(os.environ, env, clear=True), patch("deploy.gsheet_poller.threading.Thread") as mock_thread:
            start_poller()
            mock_thread.assert_not_called()

    def test_starts_thread_when_gsheet_url_set(self):
        """A named daemon thread is created and started when GSHEET_URL is set."""
        with (
            patch.dict(os.environ, {"GSHEET_URL": "https://example.com/sheet"}, clear=False),
            patch("deploy.gsheet_poller.threading.Thread") as mock_thread,
        ):
            mock_instance = MagicMock()
            mock_thread.return_value = mock_instance
            start_poller()
            mock_thread.assert_called_once()
            assert mock_thread.call_args.kwargs["daemon"] is True
            assert mock_thread.call_args.kwargs["name"] == "gsheet-poller"
            mock_instance.start.assert_called_once()

    def test_default_interval_300(self):
        """Without POLL_INTERVAL the thread is given an interval of 300."""
        # Start from the real environment minus POLL_INTERVAL so the default applies.
        clean_env = {k: v for k, v in os.environ.items() if k != "POLL_INTERVAL"}
        clean_env["GSHEET_URL"] = "https://example.com/sheet"
        with (
            patch.dict(os.environ, clean_env, clear=True),
            patch("deploy.gsheet_poller.threading.Thread") as mock_thread,
        ):
            mock_thread.return_value = MagicMock()
            start_poller()
            # The interval is passed through Thread(args=...).
            assert mock_thread.call_args.kwargs["args"] == (300,)

    def test_custom_interval(self):
        """POLL_INTERVAL overrides the default interval."""
        with (
            patch.dict(os.environ, {"GSHEET_URL": "x", "POLL_INTERVAL": "600"}, clear=False),
            patch("deploy.gsheet_poller.threading.Thread") as mock_thread,
        ):
            mock_thread.return_value = MagicMock()
            start_poller()
            assert mock_thread.call_args.kwargs["args"] == (600,)

    def test_interval_minimum_enforced(self):
        """Intervals below 60 should be clamped to 60."""
        with (
            patch.dict(os.environ, {"GSHEET_URL": "x", "POLL_INTERVAL": "10"}, clear=False),
            patch("deploy.gsheet_poller.threading.Thread") as mock_thread,
        ):
            mock_thread.return_value = MagicMock()
            start_poller()
            assert mock_thread.call_args.kwargs["args"] == (60,)
|
||||
|
||||
|
||||
# ── _poll_once ────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestPollOnce:
|
||||
def test_calls_subprocess_with_config(self):
|
||||
with patch("deploy.gsheet_poller.subprocess.run") as mock_run:
|
||||
mock_run.return_value = MagicMock(returncode=0, stderr="")
|
||||
_poll_once()
|
||||
mock_run.assert_called_once()
|
||||
cmd = mock_run.call_args[0][0]
|
||||
assert "auto_archiver" in " ".join(cmd)
|
||||
assert "--config" in cmd
|
||||
|
||||
def test_handles_nonzero_exit(self):
|
||||
"""Should not raise on non-zero exit, just log a warning."""
|
||||
with patch("deploy.gsheet_poller.subprocess.run") as mock_run:
|
||||
mock_run.return_value = MagicMock(returncode=1, stderr="some error")
|
||||
_poll_once() # should not raise
|
||||
|
||||
def test_handles_timeout(self):
|
||||
"""Should not raise on timeout, just log."""
|
||||
import subprocess
|
||||
|
||||
with patch("deploy.gsheet_poller.subprocess.run") as mock_run:
|
||||
mock_run.side_effect = subprocess.TimeoutExpired(cmd="test", timeout=600)
|
||||
_poll_once() # should not raise
|
||||
|
||||
def test_handles_exception(self):
|
||||
"""Should not raise on arbitrary exceptions."""
|
||||
with patch("deploy.gsheet_poller.subprocess.run") as mock_run:
|
||||
mock_run.side_effect = OSError("broken")
|
||||
_poll_once() # should not raise
|
||||
|
||||
def test_uses_correct_config_path(self):
|
||||
with patch("deploy.gsheet_poller.subprocess.run") as mock_run:
|
||||
mock_run.return_value = MagicMock(returncode=0, stderr="")
|
||||
_poll_once()
|
||||
cmd = mock_run.call_args[0][0]
|
||||
config_idx = cmd.index("--config")
|
||||
assert cmd[config_idx + 1] == "/app/secrets/orchestration.yaml"
|
||||
|
||||
def test_timeout_set(self):
|
||||
with patch("deploy.gsheet_poller.subprocess.run") as mock_run:
|
||||
mock_run.return_value = MagicMock(returncode=0, stderr="")
|
||||
_poll_once()
|
||||
assert mock_run.call_args[1]["timeout"] == 600
|
||||
310
deploy/tests/test_web_ui.py
Normal file
310
deploy/tests/test_web_ui.py
Normal file
@@ -0,0 +1,310 @@
|
||||
"""Tests for deploy/web_ui.py – FastAPI web interface."""
|
||||
|
||||
from unittest.mock import patch, AsyncMock
|
||||
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
|
||||
# ── Fixtures ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _reset_state():
    """Empty the web UI's in-memory session and job stores around each test."""
    import deploy.web_ui as web_ui

    def _wipe():
        web_ui._valid_sessions.clear()
        web_ui._jobs.clear()

    _wipe()  # start every test from a clean slate
    yield
    _wipe()  # and leave nothing behind for the next one
|
||||
|
||||
|
||||
@pytest.fixture
def client_no_auth():
    """Test client with auth disabled (no AUTH_PASSWORD).

    AUTH_PASSWORD is patched to the empty string for the lifetime of the
    fixture, so the patch is still active while the test body runs.
    """
    import deploy.web_ui as web_ui

    with patch.object(web_ui, "AUTH_PASSWORD", ""):
        # raise_server_exceptions=False: server errors surface as HTTP responses.
        yield TestClient(web_ui.app, raise_server_exceptions=False)
|
||||
|
||||
|
||||
@pytest.fixture
def client_with_auth():
    """Test client with auth enabled.

    AUTH_PASSWORD is patched to "secret123" for the lifetime of the fixture,
    so the patch is still active while the test body runs.
    """
    import deploy.web_ui as web_ui

    with patch.object(web_ui, "AUTH_PASSWORD", "secret123"):
        # raise_server_exceptions=False: server errors surface as HTTP responses.
        yield TestClient(web_ui.app, raise_server_exceptions=False)
|
||||
|
||||
|
||||
def _login(client, password="secret123"):
    """Helper: POST the password to /login and return the session cookie value.

    Redirects are not followed, so the cookie is read from the login response
    itself. Returns None when the response carries no ``aa_session`` cookie.
    """
    login_response = client.post(
        "/login",
        data={"password": password},
        follow_redirects=False,
    )
    session_cookie = login_response.cookies.get("aa_session")
    return session_cookie
|
||||
|
||||
|
||||
# ── Health check ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestHealthCheck:
|
||||
def test_status_returns_ok(self, client_no_auth):
|
||||
resp = client_no_auth.get("/status")
|
||||
assert resp.status_code == 200
|
||||
assert resp.json() == {"status": "ok"}
|
||||
|
||||
def test_status_no_auth_required(self, client_with_auth):
|
||||
resp = client_with_auth.get("/status")
|
||||
assert resp.status_code == 200
|
||||
assert resp.json() == {"status": "ok"}
|
||||
|
||||
|
||||
# ── Auth disabled ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestNoAuth:
|
||||
def test_index_accessible(self, client_no_auth):
|
||||
resp = client_no_auth.get("/")
|
||||
assert resp.status_code == 200
|
||||
assert "Auto Archiver" in resp.text
|
||||
|
||||
def test_login_page_redirects_to_index(self, client_no_auth):
|
||||
resp = client_no_auth.get("/login", follow_redirects=False)
|
||||
assert resp.status_code == 302
|
||||
assert resp.headers["location"] == "/"
|
||||
|
||||
def test_login_post_redirects_to_index(self, client_no_auth):
|
||||
resp = client_no_auth.post("/login", data={"password": "anything"}, follow_redirects=False)
|
||||
assert resp.status_code == 302
|
||||
|
||||
def test_no_logout_link_shown(self, client_no_auth):
|
||||
resp = client_no_auth.get("/")
|
||||
assert "Logout" not in resp.text
|
||||
|
||||
|
||||
# ── Auth enabled ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestAuth:
|
||||
def test_index_redirects_to_login(self, client_with_auth):
|
||||
resp = client_with_auth.get("/", follow_redirects=False)
|
||||
assert resp.status_code == 307
|
||||
assert resp.headers["location"] == "/login"
|
||||
|
||||
def test_login_page_renders(self, client_with_auth):
|
||||
resp = client_with_auth.get("/login")
|
||||
assert resp.status_code == 200
|
||||
assert "Password" in resp.text
|
||||
|
||||
def test_wrong_password_returns_401(self, client_with_auth):
|
||||
resp = client_with_auth.post("/login", data={"password": "wrong"})
|
||||
assert resp.status_code == 401
|
||||
assert "Wrong password" in resp.text
|
||||
|
||||
def test_correct_password_sets_cookie(self, client_with_auth):
|
||||
resp = client_with_auth.post("/login", data={"password": "secret123"}, follow_redirects=False)
|
||||
assert resp.status_code == 302
|
||||
assert "aa_session" in resp.cookies
|
||||
|
||||
def test_authenticated_access(self, client_with_auth):
|
||||
cookie = _login(client_with_auth)
|
||||
client_with_auth.cookies.set("aa_session", cookie)
|
||||
resp = client_with_auth.get("/")
|
||||
assert resp.status_code == 200
|
||||
assert "Auto Archiver" in resp.text
|
||||
|
||||
def test_logout_clears_session(self, client_with_auth):
|
||||
cookie = _login(client_with_auth)
|
||||
client_with_auth.cookies.set("aa_session", cookie)
|
||||
resp = client_with_auth.get("/logout", follow_redirects=False)
|
||||
assert resp.status_code == 302
|
||||
# After logout, index should redirect to login again
|
||||
client_with_auth.cookies.clear()
|
||||
resp = client_with_auth.get("/", follow_redirects=False)
|
||||
assert resp.status_code == 307
|
||||
|
||||
def test_logout_link_shown_when_auth_enabled(self, client_with_auth):
|
||||
cookie = _login(client_with_auth)
|
||||
client_with_auth.cookies.set("aa_session", cookie)
|
||||
resp = client_with_auth.get("/")
|
||||
assert "Logout" in resp.text
|
||||
|
||||
def test_results_requires_auth(self, client_with_auth):
|
||||
resp = client_with_auth.get("/results", follow_redirects=False)
|
||||
assert resp.status_code == 307
|
||||
|
||||
def test_invalid_session_rejected(self, client_with_auth):
|
||||
client_with_auth.cookies.set("aa_session", "bogus-token")
|
||||
resp = client_with_auth.get("/", follow_redirects=False)
|
||||
assert resp.status_code == 307
|
||||
|
||||
|
||||
# ── Archive submission ────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestArchive:
|
||||
def test_archive_creates_job(self, client_no_auth):
|
||||
with patch("deploy.web_ui._run_archive", new_callable=AsyncMock):
|
||||
resp = client_no_auth.post(
|
||||
"/archive",
|
||||
data={"urls": "https://example.com\nhttps://example.org"},
|
||||
follow_redirects=False,
|
||||
)
|
||||
assert resp.status_code == 303
|
||||
assert resp.headers["location"] == "/"
|
||||
|
||||
from deploy.web_ui import _jobs
|
||||
|
||||
assert len(_jobs) == 1
|
||||
assert _jobs[0]["urls"] == ["https://example.com", "https://example.org"]
|
||||
assert _jobs[0]["status"] == "running"
|
||||
|
||||
def test_archive_empty_urls_returns_400(self, client_no_auth):
|
||||
resp = client_no_auth.post("/archive", data={"urls": " \n \n"})
|
||||
assert resp.status_code == 400
|
||||
|
||||
def test_archive_strips_whitespace(self, client_no_auth):
|
||||
with patch("deploy.web_ui._run_archive", new_callable=AsyncMock):
|
||||
client_no_auth.post(
|
||||
"/archive",
|
||||
data={"urls": " https://example.com \n\n https://example.org \n"},
|
||||
follow_redirects=False,
|
||||
)
|
||||
from deploy.web_ui import _jobs
|
||||
|
||||
assert _jobs[0]["urls"] == ["https://example.com", "https://example.org"]
|
||||
|
||||
def test_archive_requires_auth(self, client_with_auth):
|
||||
resp = client_with_auth.post(
|
||||
"/archive",
|
||||
data={"urls": "https://example.com"},
|
||||
follow_redirects=False,
|
||||
)
|
||||
assert resp.status_code == 307
|
||||
|
||||
|
||||
# ── Results page ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestResults:
|
||||
def test_results_empty(self, client_no_auth, tmp_path):
|
||||
with patch("deploy.web_ui.ARCHIVE_DIR", tmp_path):
|
||||
resp = client_no_auth.get("/results")
|
||||
assert resp.status_code == 200
|
||||
assert "No archived files yet" in resp.text
|
||||
|
||||
def test_results_lists_files(self, client_no_auth, tmp_path):
|
||||
(tmp_path / "test.html").write_text("<html>archived</html>")
|
||||
(tmp_path / "video.mp4").write_bytes(b"\x00" * 10)
|
||||
with patch("deploy.web_ui.ARCHIVE_DIR", tmp_path):
|
||||
resp = client_no_auth.get("/results")
|
||||
assert resp.status_code == 200
|
||||
assert "test.html" in resp.text
|
||||
assert "video.mp4" in resp.text
|
||||
|
||||
def test_results_nonexistent_dir(self, client_no_auth, tmp_path):
|
||||
with patch("deploy.web_ui.ARCHIVE_DIR", tmp_path / "nonexistent"):
|
||||
resp = client_no_auth.get("/results")
|
||||
assert resp.status_code == 200
|
||||
assert "No archived files yet" in resp.text
|
||||
|
||||
|
||||
# ── File serving ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestFileServing:
|
||||
def test_serve_existing_file(self, client_no_auth, tmp_path):
|
||||
(tmp_path / "report.html").write_text("<html>done</html>")
|
||||
with patch("deploy.web_ui.ARCHIVE_DIR", tmp_path):
|
||||
resp = client_no_auth.get("/files/report.html")
|
||||
assert resp.status_code == 200
|
||||
|
||||
def test_serve_nonexistent_file(self, client_no_auth, tmp_path):
|
||||
with patch("deploy.web_ui.ARCHIVE_DIR", tmp_path):
|
||||
resp = client_no_auth.get("/files/nope.txt")
|
||||
assert resp.status_code == 404
|
||||
|
||||
def test_path_traversal_blocked(self, client_no_auth, tmp_path):
|
||||
# Create a file outside the archive dir
|
||||
outside = tmp_path / "outside"
|
||||
outside.mkdir()
|
||||
(outside / "secret.txt").write_text("secret")
|
||||
archive = tmp_path / "archive"
|
||||
archive.mkdir()
|
||||
# Symlink into archive pointing outside
|
||||
(archive / "escape").symlink_to(outside / "secret.txt")
|
||||
with patch("deploy.web_ui.ARCHIVE_DIR", archive):
|
||||
resp = client_no_auth.get("/files/escape")
|
||||
assert resp.status_code == 403
|
||||
|
||||
|
||||
# ── Job rendering ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestJobRendering:
|
||||
def test_no_jobs_shows_message(self, client_no_auth):
|
||||
resp = client_no_auth.get("/")
|
||||
assert "No archiving jobs yet" in resp.text
|
||||
|
||||
def test_jobs_shown_in_table(self, client_no_auth):
|
||||
from deploy.web_ui import _jobs
|
||||
|
||||
_jobs.append(
|
||||
{
|
||||
"id": 1,
|
||||
"urls": ["https://example.com"],
|
||||
"status": "done",
|
||||
"started": "2026-01-01 00:00 UTC",
|
||||
"output": "",
|
||||
}
|
||||
)
|
||||
resp = client_no_auth.get("/")
|
||||
assert "example.com" in resp.text
|
||||
assert "done" in resp.text
|
||||
|
||||
def test_many_urls_truncated(self, client_no_auth):
|
||||
from deploy.web_ui import _jobs
|
||||
|
||||
_jobs.append(
|
||||
{
|
||||
"id": 1,
|
||||
"urls": [f"https://example.com/{i}" for i in range(10)],
|
||||
"status": "running",
|
||||
"started": "2026-01-01 00:00 UTC",
|
||||
"output": "",
|
||||
}
|
||||
)
|
||||
resp = client_no_auth.get("/")
|
||||
assert "+7 more" in resp.text
|
||||
|
||||
|
||||
# ── HTML template rendering ──────────────────────────────────────────
|
||||
|
||||
|
||||
class TestTemplates:
|
||||
"""Verify HTML templates can be .format()-ed without KeyError."""
|
||||
|
||||
def test_login_html_renders(self):
|
||||
from deploy.web_ui import LOGIN_HTML
|
||||
|
||||
result = LOGIN_HTML.format(error="")
|
||||
assert "Auto Archiver" in result
|
||||
|
||||
def test_login_html_renders_with_error(self):
|
||||
from deploy.web_ui import LOGIN_HTML
|
||||
|
||||
result = LOGIN_HTML.format(error='<p class="err">Nope</p>')
|
||||
assert "Nope" in result
|
||||
|
||||
def test_main_html_renders(self):
|
||||
from deploy.web_ui import MAIN_HTML
|
||||
|
||||
result = MAIN_HTML.format(logout="", jobs_html="")
|
||||
assert "Auto Archiver" in result
|
||||
|
||||
def test_results_html_renders(self):
|
||||
from deploy.web_ui import RESULTS_HTML
|
||||
|
||||
result = RESULTS_HTML.format(file_list="<p>empty</p>")
|
||||
assert "Archived Files" in result
|
||||
269
deploy/web_ui.py
Normal file
269
deploy/web_ui.py
Normal file
@@ -0,0 +1,269 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Minimal web UI for auto-archiver cloud deployments.
|
||||
|
||||
Provides:
|
||||
- GET / → HTML form to submit URLs for archiving
|
||||
- POST /archive → Runs auto-archiver on submitted URLs
|
||||
- GET /results → Lists archived files available for download
|
||||
- GET /files/{path} → Serves archived files
|
||||
- GET /status → Health check
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import html
|
||||
import os
|
||||
import secrets
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import Depends, FastAPI, Form, HTTPException, Request, status
|
||||
from fastapi.responses import FileResponse, HTMLResponse, RedirectResponse
|
||||
|
||||
AUTH_PASSWORD = os.environ.get("AUTH_PASSWORD", "")
|
||||
ARCHIVE_DIR = Path("/app/local_archive")
|
||||
CONFIG_PATH = Path("/app/secrets/orchestration.yaml")
|
||||
COOKIE_NAME = "aa_session"
|
||||
|
||||
# In-memory session tokens (reset on restart, which is fine for this use case)
|
||||
_valid_sessions: set[str] = set()
|
||||
# In-memory job log
|
||||
_jobs: list[dict] = []
|
||||
|
||||
app = FastAPI(title="Auto Archiver", docs_url=None, redoc_url=None)
|
||||
|
||||
|
||||
# ── Auth helpers ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _check_auth(request: Request):
    """Dependency: send the client to /login unless it holds a valid session.

    Does nothing when AUTH_PASSWORD is empty (auth disabled). Otherwise the
    session cookie must be one of the tokens in ``_valid_sessions``.

    Raises:
        HTTPException: 307 with a ``Location: /login`` header when the cookie
            is missing or unknown.
    """
    auth_enabled = bool(AUTH_PASSWORD)
    if auth_enabled and request.cookies.get(COOKIE_NAME, "") not in _valid_sessions:
        raise HTTPException(
            status_code=status.HTTP_307_TEMPORARY_REDIRECT,
            headers={"Location": "/login"},
        )
|
||||
|
||||
|
||||
# ── Pages ─────────────────────────────────────────────────────────────
|
||||
|
||||
LOGIN_HTML = """<!DOCTYPE html>
|
||||
<html lang="en"><head><meta charset="utf-8"><meta name="viewport" content="width=device-width,initial-scale=1">
|
||||
<title>Auto Archiver – Login</title>
|
||||
<style>
|
||||
body {{ font-family: system-ui, sans-serif; max-width: 420px; margin: 80px auto; padding: 0 1rem; }}
|
||||
h1 {{ font-size: 1.4rem; }}
|
||||
input[type=password], button {{ font-size: 1rem; padding: .5rem .8rem; }}
|
||||
input[type=password] {{ width: 100%; box-sizing: border-box; margin: .5rem 0; }}
|
||||
button {{ cursor: pointer; background: #2563eb; color: #fff; border: none; border-radius: 4px; }}
|
||||
.err {{ color: #dc2626; }}
|
||||
</style></head><body>
|
||||
<h1>🔐 Auto Archiver</h1>
|
||||
<form method="POST" action="/login">
|
||||
<label>Password<br><input type="password" name="password" autofocus required></label><br>
|
||||
<button type="submit">Log in</button>
|
||||
{error}
|
||||
</form></body></html>"""
|
||||
|
||||
|
||||
MAIN_HTML = """<!DOCTYPE html>
|
||||
<html lang="en"><head><meta charset="utf-8"><meta name="viewport" content="width=device-width,initial-scale=1">
|
||||
<title>Auto Archiver</title>
|
||||
<style>
|
||||
body {{ font-family: system-ui, sans-serif; max-width: 700px; margin: 2rem auto; padding: 0 1rem; line-height: 1.6; }}
|
||||
h1 {{ font-size: 1.5rem; }}
|
||||
textarea {{ width: 100%; box-sizing: border-box; font-size: .95rem; font-family: monospace; }}
|
||||
button {{ font-size: 1rem; padding: .5rem 1.2rem; cursor: pointer; background: #2563eb; color: #fff; border: none; border-radius: 4px; margin-top: .5rem; }}
|
||||
table {{ border-collapse: collapse; width: 100%; margin-top: 1rem; }}
|
||||
th, td {{ border: 1px solid #e5e7eb; padding: .4rem .6rem; text-align: left; font-size: .9rem; }}
|
||||
th {{ background: #f9fafb; }}
|
||||
.status {{ padding: 2px 8px; border-radius: 4px; font-size: .85rem; }}
|
||||
.running {{ background: #fef3c7; color: #92400e; }}
|
||||
.done {{ background: #d1fae5; color: #065f46; }}
|
||||
.failed {{ background: #fee2e2; color: #991b1b; }}
|
||||
a {{ color: #2563eb; }}
|
||||
.info {{ color: #6b7280; font-size: .9rem; }}
|
||||
nav {{ display: flex; gap: 1rem; align-items: center; }}
|
||||
nav a {{ text-decoration: none; }}
|
||||
</style></head><body>
|
||||
<nav>
|
||||
<h1>📦 Auto Archiver</h1>
|
||||
<a href="/results">Browse files</a>
|
||||
{logout}
|
||||
</nav>
|
||||
<form method="POST" action="/archive">
|
||||
<label for="urls"><strong>URLs to archive</strong> (one per line)</label><br>
|
||||
<textarea id="urls" name="urls" rows="5" placeholder="https://example.com/post https://youtube.com/watch?v=..." required></textarea><br>
|
||||
<button type="submit">Archive</button>
|
||||
</form>
|
||||
{jobs_html}
|
||||
</body></html>"""
|
||||
|
||||
|
||||
RESULTS_HTML = """<!DOCTYPE html>
|
||||
<html lang="en"><head><meta charset="utf-8"><meta name="viewport" content="width=device-width,initial-scale=1">
|
||||
<title>Auto Archiver – Files</title>
|
||||
<style>
|
||||
body {{ font-family: system-ui, sans-serif; max-width: 700px; margin: 2rem auto; padding: 0 1rem; }}
|
||||
h1 {{ font-size: 1.4rem; }}
|
||||
a {{ color: #2563eb; }}
|
||||
li {{ margin: .3rem 0; font-family: monospace; font-size: .9rem; }}
|
||||
</style></head><body>
|
||||
<h1>📁 Archived Files</h1>
|
||||
<p><a href="/">← Back</a></p>
|
||||
{file_list}
|
||||
</body></html>"""
|
||||
|
||||
|
||||
# ── Routes ────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@app.get("/login", response_class=HTMLResponse)
async def login_page():
    """Render the login form, or redirect to the index when auth is disabled."""
    if AUTH_PASSWORD:
        return LOGIN_HTML.format(error="")
    # No password configured: there is nothing to log in to.
    return RedirectResponse("/", status_code=302)
|
||||
|
||||
|
||||
@app.post("/login")
async def login_submit(password: str = Form(...)):
    """Check the submitted password and start a session on success.

    Args:
        password: The ``password`` form field.

    Returns:
        A 302 redirect to ``/`` when auth is disabled or the password matches
        (in the latter case with a fresh session cookie set), otherwise the
        login page with an error message and status 401.
    """
    if not AUTH_PASSWORD:
        return RedirectResponse("/", status_code=302)

    # Constant-time comparison so response timing does not leak how much of the
    # password matched. Compare bytes: compare_digest rejects non-ASCII str.
    password_ok = secrets.compare_digest(
        password.encode("utf-8"),
        AUTH_PASSWORD.encode("utf-8"),
    )
    if not password_ok:
        return HTMLResponse(
            LOGIN_HTML.format(error='<p class="err">Wrong password.</p>'),
            status_code=401,
        )

    token = secrets.token_urlsafe(32)
    _valid_sessions.add(token)
    resp = RedirectResponse("/", status_code=302)
    # Cookie lifetime: 30 days, expressed in seconds.
    resp.set_cookie(COOKIE_NAME, token, httponly=True, samesite="lax", max_age=86400 * 30)
    return resp
|
||||
|
||||
|
||||
@app.get("/", response_class=HTMLResponse)
async def index(request: Request, _=Depends(_check_auth)):
    """Render the main page: the submission form plus the recent-jobs table."""
    # The logout link only makes sense when a password is configured.
    logout_link = ""
    if AUTH_PASSWORD:
        logout_link = '<a href="/logout">Logout</a>'
    return MAIN_HTML.format(logout=logout_link, jobs_html=_render_jobs())
|
||||
|
||||
|
||||
@app.post("/archive")
async def archive(request: Request, urls: str = Form(...), _=Depends(_check_auth)):
    """Queue an archiving job for the submitted URLs and redirect to the index.

    Args:
        urls: The ``urls`` form field, one URL per line. Surrounding whitespace
            and blank lines are ignored.

    Returns:
        A 303 redirect to ``/``; the job itself runs in the background.

    Raises:
        HTTPException: 400 when no URL is left after stripping, or when an
            entry starts with ``-``.
    """
    url_list = [u for line in urls.splitlines() if (u := line.strip())]
    if not url_list:
        raise HTTPException(400, "No URLs provided")
    # The entries are appended to the auto_archiver command line, so anything
    # starting with "-" would be parsed as an option instead of a URL.
    if any(u.startswith("-") for u in url_list):
        raise HTTPException(400, "URLs must not start with '-'")

    job = {
        "id": len(_jobs) + 1,
        "urls": url_list,
        "status": "running",
        "started": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC"),
        "output": "",
    }
    _jobs.insert(0, job)  # newest first

    # Run in background so the user sees the page immediately. The event loop
    # only keeps weak references to tasks, so hold a strong one on the job
    # until the task finishes.
    task = asyncio.create_task(_run_archive(job))
    job["_task"] = task
    task.add_done_callback(lambda _finished: job.pop("_task", None))
    return RedirectResponse("/", status_code=303)
|
||||
|
||||
|
||||
@app.get("/results", response_class=HTMLResponse)
async def results(request: Request, _=Depends(_check_auth)):
    """List archived files, newest first, as links to ``/files/<relative path>``.

    At most 200 files are listed. A placeholder message is rendered when
    ARCHIVE_DIR does not exist or contains no files.
    """
    from urllib.parse import quote  # local import: only this route needs it

    no_files = "<p>No archived files yet.</p>"
    if not ARCHIVE_DIR.exists():
        return RESULTS_HTML.format(file_list=no_files)

    def _mtime(p: Path) -> float:
        # A file may vanish between the directory walk and the stat call;
        # sort it last instead of failing the whole page.
        try:
            return p.stat().st_mtime
        except OSError:
            return 0.0

    # Filter before sorting so directories and broken symlinks are never stat'ed.
    files = sorted(
        (p for p in ARCHIVE_DIR.rglob("*") if p.is_file()),
        key=_mtime,
        reverse=True,
    )
    if not files:
        return RESULTS_HTML.format(file_list=no_files)

    items = []
    for f in files[:200]:  # cap listing
        rel = f.relative_to(ARCHIVE_DIR)
        # File names are attacker-influenced (derived from archived content):
        # percent-encode for the URL, then escape for the HTML attribute.
        href = html.escape("/files/" + quote(rel.as_posix()), quote=True)
        items.append(f'<li><a href="{href}">{html.escape(str(rel))}</a></li>')

    return RESULTS_HTML.format(file_list="<ul>" + "\n".join(items) + "</ul>")
|
||||
|
||||
|
||||
@app.get("/files/{path:path}")
async def serve_file(path: str, request: Request, _=Depends(_check_auth)):
    """Serve one archived file from ARCHIVE_DIR.

    Args:
        path: Path of the file relative to ARCHIVE_DIR, taken from the URL.

    Raises:
        HTTPException: 403 when the resolved path lies outside ARCHIVE_DIR
            (``..`` segments or a symlink pointing elsewhere); 404 when the
            path cannot be resolved or is not an existing regular file.
    """
    full = ARCHIVE_DIR / path
    try:
        resolved = full.resolve()
    except (OSError, RuntimeError, ValueError):
        # Unresolvable input (embedded NUL, symlink loop, ...) is just "not found".
        raise HTTPException(404, "File not found") from None

    # Security: check containment BEFORE existence, so the 403/404 distinction
    # cannot be used to probe which files exist outside the archive.
    if not resolved.is_relative_to(ARCHIVE_DIR.resolve()):
        raise HTTPException(403, "Forbidden")
    if not full.is_file():  # also False when the path does not exist
        raise HTTPException(404, "File not found")
    return FileResponse(full)
|
||||
|
||||
|
||||
@app.get("/status")
async def health():
    """Health check: always answers with a fixed OK payload, no auth needed."""
    payload = {"status": "ok"}
    return payload
|
||||
|
||||
|
||||
@app.get("/logout")
async def logout(request: Request):
    """Invalidate the caller's session token and redirect to the login page."""
    # discard() is a no-op for unknown tokens, so a missing cookie is harmless.
    _valid_sessions.discard(request.cookies.get(COOKIE_NAME, ""))
    response = RedirectResponse("/login", status_code=302)
    response.delete_cookie(COOKIE_NAME)
    return response
|
||||
|
||||
|
||||
# ── Helpers ───────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def _run_archive(job: dict):
    """Run auto-archiver as a subprocess for the job's URLs and record the result.

    Updates ``job`` in place: ``output`` receives the tail of the combined
    stdout/stderr (or the exception text), and ``status`` becomes "done" on
    exit code 0 and "failed" otherwise.
    """
    argv = ["python3", "-m", "auto_archiver", "--config", str(CONFIG_PATH)] + job["urls"]

    try:
        process = await asyncio.create_subprocess_exec(
            *argv,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.STDOUT,  # fold stderr into stdout
            cwd="/app",
        )
        combined_output, _ = await process.communicate()
        # Keep only the last 5k characters.
        job["output"] = combined_output.decode(errors="replace")[-5000:]
        if process.returncode == 0:
            job["status"] = "done"
        else:
            job["status"] = "failed"
    except Exception as exc:
        # Any failure to launch or read the subprocess marks the job as failed.
        job["output"] = str(exc)
        job["status"] = "failed"
|
||||
|
||||
|
||||
def _render_jobs() -> str:
    """Build the "Recent Jobs" HTML table from the in-memory job log.

    Only the first 50 entries of ``_jobs`` are rendered, and each row shows at
    most three URLs followed by a "(+N more)" suffix. Returns a placeholder
    paragraph when there are no jobs.
    """
    if not _jobs:
        return '<p class="info">No archiving jobs yet. Submit URLs above to get started.</p>'

    def _row(job: dict) -> str:
        # URLs are user input, so they are escaped before going into the page.
        shown = html.escape(", ".join(job["urls"][:3]))
        hidden = len(job["urls"]) - 3
        if hidden > 0:
            shown += f" (+{hidden} more)"
        state = job["status"]  # doubles as the CSS class of the status badge
        return (
            f"<tr><td>{job['id']}</td>"
            f"<td>{shown}</td>"
            f'<td><span class="status {state}">{state}</span></td>'
            f"<td>{job['started']}</td></tr>"
        )

    table_body = "\n".join(_row(job) for job in _jobs[:50])
    return (
        "<h2>Recent Jobs</h2>"
        "<table><thead><tr><th>#</th><th>URLs</th><th>Status</th><th>Started</th></tr></thead>"
        "<tbody>" + table_body + "</tbody></table>"
    )
|
||||
@@ -6,6 +6,9 @@ services:
|
||||
context: .
|
||||
dockerfile: Dockerfile
|
||||
container_name: auto-archiver
|
||||
# Override user to match host UID/GID and avoid permission issues on volumes.
|
||||
# Set USER_ID and GROUP_ID env vars, or defaults to 1000:1000.
|
||||
user: "${USER_ID:-1000}:${GROUP_ID:-1000}"
|
||||
volumes:
|
||||
- ./secrets:/app/secrets
|
||||
- ./local_archive:/app/local_archive
|
||||
|
||||
@@ -21,7 +21,7 @@ This allows you to run the auto-archiver without the `poetry run` prefix.
|
||||
### Optional Development Packages
|
||||
|
||||
Install development packages (used for unit tests etc.) using:
|
||||
`poetry install -with dev`
|
||||
`poetry install --with dev`
|
||||
|
||||
|
||||
```{toctree}
|
||||
@@ -33,4 +33,4 @@ docs
|
||||
release
|
||||
settings_page
|
||||
style_guide
|
||||
```
|
||||
```
|
||||
|
||||
@@ -50,7 +50,7 @@ Note not all warnings can be fixed automatically.
|
||||
|
||||
Most fixes are safe, but some non-standard practices such as dynamic loading are not picked up by linters. Ensure you check any modifications by this before committing them.
|
||||
```shell
|
||||
make ruff-fix
|
||||
make ruff-clean
|
||||
```
|
||||
|
||||
**Changing Configurations ⚙️**
|
||||
@@ -67,4 +67,4 @@ One example is to extend the selected rules for linting the `pyproject.toml` fil
|
||||
extend-select = ["B"]
|
||||
```
|
||||
|
||||
Then re-run the `make ruff-check` command to see the new rules in action.
|
||||
Then re-run the `make ruff-check` command to see the new rules in action.
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
## Running Tests
|
||||
|
||||
1. Make sure you've installed the dev dependencies with `pytest install --with dev`
|
||||
1. Make sure you've installed the dev dependencies with `poetry install --with dev`
|
||||
2. Tests can be run as follows:
|
||||
```{code} bash
|
||||
#### Command prefix of 'poetry run' removed here for simplicity
|
||||
@@ -26,7 +26,7 @@ pytest -ra -v tests/test_file.py
|
||||
pytest -ra -v tests/test_file.py::test_function_name
|
||||
```
|
||||
|
||||
3. Some tests require environment variables to be set. You can use the example `.env.test.example` file as a template. Copy it to `.env.test` and fill in the required values. This file will be loaded automatically by `pytest`.
|
||||
3. Some tests require environment variables to be set. You can use the example `tests/.env.test.example` file as a template. Copy it to `tests/.env.test` and fill in the required values. This file will be loaded automatically by `pytest`.
|
||||
```{code} bash
|
||||
cp .env.test.example .env.test
|
||||
```
|
||||
cp tests/.env.test.example tests/.env.test
|
||||
```
|
||||
|
||||
@@ -4,8 +4,9 @@ Extractor modules are used to extract the content of a given URL. Typically, one
|
||||
|
||||
Extractors that are able to extract content from a wide range of websites include:
|
||||
1. Generic Extractor: parses videos and images on sites using the powerful yt-dlp library.
|
||||
2. Wayback Machine Extractor: sends pages to the Wayback machine for archiving, and stores the link.
|
||||
3. WACZ Extractor: runs a web browser to 'browse' the URL and save a copy of the page in WACZ format.
|
||||
2. Antibot Extractor: uses a headless browser to bypass bot detection and extract content.
|
||||
3. WACZ Extractor: runs a web browser to 'browse' the URL and save a copy of the page in WACZ format.
|
||||
4. Wayback Machine Extractor: sends pages to the Wayback machine for archiving, and stores the archived link.
|
||||
|
||||
```{include} autogen/extractor.md
|
||||
```
|
||||
|
||||
2947
poetry.lock
generated
2947
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[project]
|
||||
name = "auto-archiver"
|
||||
version = "1.1.1"
|
||||
version = "1.2.5"
|
||||
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
|
||||
|
||||
requires-python = ">=3.10,<3.13"
|
||||
@@ -50,14 +50,15 @@ dependencies = [
|
||||
"retrying (>=0.0.0)",
|
||||
"rich-argparse (>=1.6.0,<2.0.0)",
|
||||
"ruamel-yaml (>=0.18.10,<0.19.0)",
|
||||
"rfc3161-client (==1.0.3)",
|
||||
"cryptography (>44.0.1,<45.0.0)",
|
||||
"rfc3161-client (>=1.0.5)",
|
||||
"cryptography (>=46.0.3)",
|
||||
"opentimestamps (>=0.4.5,<0.5.0)",
|
||||
"bgutil-ytdlp-pot-provider (>=1.0.0)",
|
||||
"yt-dlp[curl-cffi,default] (>=2025.5.22,<2026.0.0)",
|
||||
"yt-dlp[curl-cffi,default] (>=2025.5.22)",
|
||||
"secretstorage (>=3.3.3,<4.0.0)",
|
||||
"seleniumbase (>=4.36.4,<5.0.0)",
|
||||
"pyautogui (>=0.9.54,<0.10.0)",
|
||||
"pyperclip (>=1.9.0)",
|
||||
]
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
@@ -65,7 +66,7 @@ pytest = "^8.3.4"
|
||||
autopep8 = "^2.3.1"
|
||||
pytest-loguru = "^0.4.0"
|
||||
pytest-mock = "^3.14.0"
|
||||
ruff = "^0.9.10"
|
||||
ruff = "^0.15.2"
|
||||
pre-commit = "^4.1.0"
|
||||
|
||||
[tool.poetry.group.docs.dependencies]
|
||||
|
||||
99
railway.json
Normal file
99
railway.json
Normal file
@@ -0,0 +1,99 @@
|
||||
{
|
||||
"$schema": "https://railway.app/railway.schema.json",
|
||||
"build": {
|
||||
"dockerfilePath": "deploy/Dockerfile"
|
||||
},
|
||||
"deploy": {
|
||||
"startCommand": "python3 -m deploy.start",
|
||||
"healthcheckPath": "/status",
|
||||
"healthcheckTimeout": 30,
|
||||
"restartPolicyType": "ON_FAILURE",
|
||||
"restartPolicyMaxRetries": 5
|
||||
},
|
||||
"variables": {
|
||||
"AUTH_PASSWORD": {
|
||||
"description": "Password to access your archiver web interface",
|
||||
"required": true
|
||||
},
|
||||
"GSHEET_URL": {
|
||||
"description": "Google Sheet URL to monitor for new URLs (leave empty to disable)",
|
||||
"required": false,
|
||||
"default": ""
|
||||
},
|
||||
"GOOGLE_SERVICE_ACCOUNT_JSON": {
|
||||
"description": "Full JSON contents of your Google service account key (required for Sheets)",
|
||||
"required": false,
|
||||
"default": ""
|
||||
},
|
||||
"POLL_INTERVAL": {
|
||||
"description": "Seconds between Google Sheet checks (min 60)",
|
||||
"required": false,
|
||||
"default": "300"
|
||||
},
|
||||
"S3_BUCKET": {
|
||||
"description": "S3 bucket name for storage (leave empty for local-only)",
|
||||
"required": false,
|
||||
"default": ""
|
||||
},
|
||||
"S3_KEY": {
|
||||
"description": "S3 access key ID",
|
||||
"required": false,
|
||||
"default": ""
|
||||
},
|
||||
"S3_SECRET": {
|
||||
"description": "S3 secret access key",
|
||||
"required": false,
|
||||
"default": ""
|
||||
},
|
||||
"S3_REGION": {
|
||||
"description": "S3 region (e.g. us-east-1, nyc3 for DO Spaces)",
|
||||
"required": false,
|
||||
"default": "us-east-1"
|
||||
},
|
||||
"S3_ENDPOINT": {
|
||||
"description": "S3 endpoint URL template",
|
||||
"required": false,
|
||||
"default": "https://s3.{region}.amazonaws.com"
|
||||
},
|
||||
"S3_CDN_URL": {
|
||||
"description": "Public CDN URL template for archived files",
|
||||
"required": false,
|
||||
"default": "https://{bucket}.s3.{region}.amazonaws.com/{key}"
|
||||
},
|
||||
"TELEGRAM_API_ID": {
|
||||
"description": "Telegram API ID from https://my.telegram.org",
|
||||
"required": false,
|
||||
"default": ""
|
||||
},
|
||||
"TELEGRAM_API_HASH": {
|
||||
"description": "Telegram API hash from https://my.telegram.org",
|
||||
"required": false,
|
||||
"default": ""
|
||||
},
|
||||
"TELEGRAM_BOT_TOKEN": {
|
||||
"description": "Telegram bot token from @BotFather",
|
||||
"required": false,
|
||||
"default": ""
|
||||
},
|
||||
"ENABLE_SCREENSHOTS": {
|
||||
"description": "Set to true to capture full-page screenshots",
|
||||
"required": false,
|
||||
"default": "false"
|
||||
},
|
||||
"ENABLE_THUMBNAILS": {
|
||||
"description": "Set to true to generate video thumbnails",
|
||||
"required": false,
|
||||
"default": "false"
|
||||
},
|
||||
"ENABLE_CSV_DB": {
|
||||
"description": "Set to true to save a CSV log of archived items",
|
||||
"required": false,
|
||||
"default": "false"
|
||||
},
|
||||
"LOG_LEVEL": {
|
||||
"description": "Logging level: DEBUG, INFO, WARNING, ERROR",
|
||||
"required": false,
|
||||
"default": "INFO"
|
||||
}
|
||||
}
|
||||
}
|
||||
1108
scripts/settings/package-lock.json
generated
1108
scripts/settings/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -181,6 +181,9 @@ class Metadata:
|
||||
media_hashes = set()
|
||||
new_media = []
|
||||
for m in self.media:
|
||||
if not m.filename:
|
||||
new_media.append(m)
|
||||
continue
|
||||
h = m.get("hash")
|
||||
if not h:
|
||||
h = calculate_hash_in_chunks(hashlib.sha256(), int(1.6e7), m.filename)
|
||||
|
||||
@@ -16,6 +16,7 @@ from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
|
||||
from auto_archiver.modules.antibot_extractor_enricher.dropins.default import DefaultDropin
|
||||
from auto_archiver.utils.misc import random_str
|
||||
from auto_archiver.utils.url import is_relevant_url
|
||||
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
|
||||
|
||||
|
||||
class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
@@ -72,6 +73,7 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
if self.enrich(result):
|
||||
result.status = "antibot"
|
||||
return result
|
||||
return False
|
||||
|
||||
def _prepare_user_data_dir(self):
|
||||
if self.user_data_dir:
|
||||
@@ -81,11 +83,24 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
os.makedirs(self.user_data_dir, exist_ok=True)
|
||||
|
||||
def enrich(self, to_enrich: Metadata, custom_data_dir: bool = True) -> bool:
|
||||
if to_enrich.get_media_by_id("html_source_code"):
|
||||
logger.info("Antibot has already been executed, skipping.")
|
||||
return True
|
||||
using_user_data_dir = self.user_data_dir if custom_data_dir else None
|
||||
url = to_enrich.get_url()
|
||||
|
||||
# Use xvfb in Docker environments where no display is available
|
||||
use_xvfb = bool(os.environ.get("RUNNING_IN_DOCKER"))
|
||||
|
||||
try:
|
||||
with SB(uc=True, agent=self.agent, headed=None, user_data_dir=using_user_data_dir, proxy=self.proxy) as sb:
|
||||
with SB(
|
||||
uc=True,
|
||||
agent=self.agent,
|
||||
headed=None,
|
||||
user_data_dir=using_user_data_dir,
|
||||
proxy=self.proxy,
|
||||
xvfb=use_xvfb,
|
||||
) as sb:
|
||||
logger.info(f"Selenium browser is up with agent {self.agent}, opening url...")
|
||||
sb.uc_open_with_reconnect(url, 4)
|
||||
|
||||
@@ -94,16 +109,33 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit, for now we separate here but can have dropins deciding this in the future
|
||||
|
||||
dropin = self._get_suitable_dropin(url, sb)
|
||||
dropin.open_page(url)
|
||||
if not dropin.open_page(url):
|
||||
# Check for deletion indicators
|
||||
page_title = sb.get_title()
|
||||
html_source = sb.get_page_source()
|
||||
deletion_info = detect_deletion(html_content=html_source, page_title=page_title, url=url)
|
||||
if deletion_info:
|
||||
flag_as_deleted(to_enrich, deletion_info)
|
||||
return to_enrich
|
||||
logger.warning("Failed to open drop-in page (not detected as deleted)")
|
||||
return False
|
||||
|
||||
if self.detect_auth_wall and self._hit_auth_wall(sb):
|
||||
if self.detect_auth_wall and (dropin.hit_auth_wall() and self._hit_auth_wall(sb)):
|
||||
logger.warning("Skipping since auth wall or CAPTCHA was detected")
|
||||
return False
|
||||
|
||||
sb.wait_for_ready_state_complete()
|
||||
sb.sleep(1) # margin for the page to load completely
|
||||
|
||||
to_enrich.set_title(sb.get_title())
|
||||
page_title = sb.get_title()
|
||||
html_source = sb.get_page_source()
|
||||
|
||||
# Check if the page indicates content was deleted
|
||||
deletion_info = detect_deletion(html_content=html_source, page_title=page_title, url=url)
|
||||
if deletion_info:
|
||||
flag_as_deleted(to_enrich, deletion_info)
|
||||
|
||||
to_enrich.set_title(page_title)
|
||||
self._enrich_html_source_code(sb, to_enrich)
|
||||
|
||||
self._enrich_full_page_screenshot(sb, to_enrich)
|
||||
@@ -274,8 +306,14 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
||||
return
|
||||
url = to_enrich.get_url()
|
||||
all_urls = set()
|
||||
logger.debug(f"Extracting media for {js_css_selector=}")
|
||||
|
||||
try:
|
||||
sources = sb.execute_script(js_css_selector)
|
||||
except selenium.common.exceptions.JavascriptException as e:
|
||||
logger.error(f"Error executing JavaScript selector {js_css_selector}: {e}")
|
||||
return
|
||||
|
||||
sources = sb.execute_script(js_css_selector)
|
||||
# js_for_css_selectors
|
||||
for src in sources:
|
||||
if len(all_urls) >= max_media:
|
||||
|
||||
1
src/auto_archiver/modules/antibot_extractor_enricher/captcha_services/.gitignore
vendored
Normal file
1
src/auto_archiver/modules/antibot_extractor_enricher/captcha_services/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
*.py
|
||||
@@ -1,3 +1,4 @@
|
||||
import json
|
||||
import os
|
||||
import traceback
|
||||
from typing import Mapping
|
||||
@@ -74,8 +75,11 @@ class Dropin:
|
||||
|
||||
You can overwrite this instead of `images_selector` for more control over scraped images.
|
||||
"""
|
||||
if not self.images_selectors():
|
||||
return "return [];"
|
||||
safe_selector = json.dumps(self.images_selectors())
|
||||
return f"""
|
||||
return Array.from(document.querySelectorAll("{self.images_selectors()}")).map(el => el.src || el.href).filter(Boolean);
|
||||
return Array.from(document.querySelectorAll({safe_selector})).map(el => el.src || el.href).filter(Boolean);
|
||||
"""
|
||||
|
||||
def js_for_video_css_selectors(self) -> str:
|
||||
@@ -84,8 +88,11 @@ class Dropin:
|
||||
|
||||
You can overwrite this instead of `video_selector` for more control over scraped videos.
|
||||
"""
|
||||
if not self.video_selectors():
|
||||
return "return [];"
|
||||
safe_selector = json.dumps(self.video_selectors())
|
||||
return f"""
|
||||
return Array.from(document.querySelectorAll("{self.video_selectors()}")).map(el => el.src || el.href).filter(Boolean);
|
||||
return Array.from(document.querySelectorAll({safe_selector})).map(el => el.src || el.href).filter(Boolean);
|
||||
"""
|
||||
|
||||
def open_page(self, url) -> bool:
|
||||
@@ -103,6 +110,12 @@ class Dropin:
|
||||
"""
|
||||
return 0, 0
|
||||
|
||||
def hit_auth_wall(self) -> bool:
    """
    Drop-in specific pre-check for authentication walls.

    Returning True (the default) defers the decision to the default global auth wall detector.
    Returning False means no auth wall is detected and the page is considered open.
    """
    return True
|
||||
|
||||
def _get_username_password(self, site) -> tuple[str, str]:
|
||||
"""
|
||||
Get the username and password for the site from the extractor's auth data.
|
||||
|
||||
@@ -0,0 +1,56 @@
|
||||
from contextlib import suppress
|
||||
from typing import Mapping
|
||||
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
|
||||
|
||||
|
||||
class TikTokDropin(Dropin):
    """
    A class to handle TikTok drop-in functionality for the antibot extractor enricher module.

    Images are scraped through a CSS selector; videos are deliberately not
    scraped here (see `video_selectors`).
    """

    @staticmethod
    def documentation() -> Mapping[str, str]:
        """Return the name, description and site of this drop-in.

        Declared as a static method because it takes no ``self``; without the
        decorator, calling it on an instance would raise a ``TypeError``.
        """
        return {
            "name": "TikTok Dropin",
            "description": "Handles TikTok posts and works without authentication.\nNOTE: This dropin is highly susceptible to TikTok's bot detection mechanisms and may not work reliably if you reuse the same IP. The GenericExtractor is recommended for TikTok posts, as it handles video/image download more reliable. In the future we plan to implement better anti captcha measures for this dropin.",
            "site": "tiktok.com",
        }

    @staticmethod
    def suitable(url: str) -> bool:
        """Return True when ``url`` contains the substring ``tiktok.com``."""
        return "tiktok.com" in url

    @staticmethod
    def images_selectors() -> str:
        """Return the CSS selector used to find the images of a photo post."""
        return '[data-e2e="detail-photo"] img'

    @staticmethod
    def video_selectors() -> str | None:
        """Return None: no video selector is provided by this drop-in."""
        return None  # TikTok videos should be handled by the generic extractor

    def open_page(self, url) -> bool:
        """Wait for the page to load and report whether it opened as expected.

        Returns False when the browser's current URL differs from ``url`` or
        when the text "Video currently unavailable" is visible; True otherwise.
        """
        self.sb.wait_for_ready_state_complete()
        self._close_cookies_banner()
        # TODO: implement login logic
        if url != self.sb.get_current_url():
            return False
        if self.sb.is_text_visible("Video currently unavailable"):
            logger.debug("Video may have been removed or is private.")
            return False
        return True

    def hit_auth_wall(self) -> bool:
        """Return False so that no auth wall is reported for TikTok pages."""
        return False  # TikTok does not require authentication for public posts

    def _close_cookies_banner(self):
        """Best-effort dismissal of overlays; failures are deliberately ignored."""
        # Each step is suppressed on its own so that a failing banner lookup
        # does not prevent the second attempt from running.
        with suppress(Exception):  # selenium.common.exceptions.JavascriptException
            self.sb.execute_script("""
                document
                    .querySelector("tiktok-cookie-banner")
                    .shadowRoot.querySelector("faceplate-dialog")
                    .querySelector("button")
                    .click()
                """)
        # NOTE(review): "Skip" is passed as a selector, not as visible text;
        # confirm it matches the intended element.
        with suppress(Exception):
            self.sb.click_if_visible("Skip")
|
||||
@@ -60,6 +60,10 @@ If you are having issues with the extractor, you can review the version of `yt-d
|
||||
"default": "",
|
||||
"help": "http/https/socks proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port",
|
||||
},
|
||||
"proxy_on_failure_only": {
|
||||
"default": True,
|
||||
"help": "Applies only if a proxy is set. In that case if this setting is True, the extractor will only use the proxy if the initial request fails; if it is False, the extractor will always use the proxy.",
|
||||
},
|
||||
"end_means_success": {
|
||||
"default": True,
|
||||
"help": "if True, any archived content will mean a 'success', if False this extractor will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent extractors can retrieve.",
|
||||
|
||||
@@ -39,12 +39,18 @@ class Bluesky(GenericDropin):
|
||||
media_url = "https://bsky.social/xrpc/com.atproto.sync.getBlob?cid={}&did={}"
|
||||
for image_media in image_medias:
|
||||
url = media_url.format(image_media["image"]["ref"]["$link"], post["author"]["did"])
|
||||
image_media = archiver.download_from_url(url)
|
||||
media.append(Media(image_media))
|
||||
filename = archiver.download_from_url(url)
|
||||
if filename:
|
||||
media.append(Media(filename))
|
||||
else:
|
||||
logger.warning(f"Failed to download Bluesky image from {url}")
|
||||
for video_media in video_medias:
|
||||
url = media_url.format(video_media["ref"]["$link"], post["author"]["did"])
|
||||
video_media = archiver.download_from_url(url)
|
||||
media.append(Media(video_media))
|
||||
filename = archiver.download_from_url(url)
|
||||
if filename:
|
||||
media.append(Media(filename))
|
||||
else:
|
||||
logger.warning(f"Failed to download Bluesky video from {url}")
|
||||
return media
|
||||
|
||||
def _get_post_data(self, post: dict) -> dict:
|
||||
|
||||
@@ -34,7 +34,7 @@ def _extract_metadata(self, webpage, video_id):
|
||||
...,
|
||||
"attachments",
|
||||
...,
|
||||
lambda k, v: (k == "media" and str(v["id"]) == video_id and v["__typename"] == "Video"),
|
||||
lambda k, v: k == "media" and str(v["id"]) == video_id and v["__typename"] == "Video",
|
||||
),
|
||||
expected_type=dict,
|
||||
)
|
||||
|
||||
@@ -4,6 +4,7 @@ import datetime
|
||||
import os
|
||||
import importlib
|
||||
import subprocess
|
||||
import traceback
|
||||
import zipfile
|
||||
|
||||
from typing import Generator, Type
|
||||
@@ -20,6 +21,7 @@ from auto_archiver.core.extractor import Extractor
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.utils import get_datetime_from_str
|
||||
from auto_archiver.utils.misc import ydl_entry_to_filename
|
||||
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
|
||||
from .dropin import GenericDropin
|
||||
|
||||
|
||||
@@ -202,8 +204,11 @@ class GenericExtractor(Extractor):
|
||||
if thumbnail_url:
|
||||
try:
|
||||
cover_image_path = self.download_from_url(thumbnail_url)
|
||||
media = Media(cover_image_path)
|
||||
metadata.add_media(media, id="cover")
|
||||
if cover_image_path:
|
||||
media = Media(cover_image_path)
|
||||
metadata.add_media(media, id="cover")
|
||||
else:
|
||||
logger.warning(f"Failed to download cover image from {thumbnail_url}")
|
||||
except Exception as e:
|
||||
logger.error(f"Could not download cover image {thumbnail_url}: {e}")
|
||||
|
||||
@@ -305,9 +310,9 @@ class GenericExtractor(Extractor):
|
||||
result.set_url(url)
|
||||
|
||||
if "description" in video_data and not result.get("content"):
|
||||
result.set_content(video_data.get("description"))
|
||||
result.set_content(video_data.pop("description"))
|
||||
# extract comments if enabled
|
||||
if self.comments and video_data.get("comments", []) is not None:
|
||||
if self.comments and video_data.get("comments", None) is not None:
|
||||
result.set(
|
||||
"comments",
|
||||
[
|
||||
@@ -353,7 +358,7 @@ class GenericExtractor(Extractor):
|
||||
if not dropin:
|
||||
# TODO: add a proper link to 'how to create your own dropin'
|
||||
logger.debug(f"""Could not find valid dropin for {info_extractor.ie_key()}.
|
||||
Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""")
|
||||
Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/modules/autogen/extractor/generic_extractor.html#dropins""")
|
||||
return False
|
||||
|
||||
post_data = dropin.extract_post(url, ie_instance)
|
||||
@@ -406,9 +411,9 @@ class GenericExtractor(Extractor):
|
||||
logger.error(f"Error loading subtitle file {val.get('filepath')}: {e}")
|
||||
result.add_media(new_media)
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing entry {entry}: {e}")
|
||||
logger.error(f"Error processing entry {str(entry)[:256]}: {e} {traceback.format_exc()}")
|
||||
if not len(result.media):
|
||||
logger.info(f"No media found for entry {entry}, skipping.")
|
||||
logger.info(f"No media found for entry {str(entry)[:256]}, skipping.")
|
||||
return False
|
||||
|
||||
return self.add_metadata(data, info_extractor, url, result)
|
||||
@@ -483,6 +488,13 @@ class GenericExtractor(Extractor):
|
||||
# don't download since it can be a live stream
|
||||
data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
|
||||
|
||||
# Check for deletion indicators in video data
|
||||
deletion_info = detect_deletion(video_data=data, url=url)
|
||||
if deletion_info:
|
||||
result = Metadata()
|
||||
flag_as_deleted(result, deletion_info)
|
||||
return result
|
||||
|
||||
result = _helper_for_successful_extract_info(data, info_extractor, url, ydl)
|
||||
|
||||
except MaxDownloadsReached:
|
||||
@@ -502,6 +514,16 @@ class GenericExtractor(Extractor):
|
||||
try:
|
||||
result = self.get_metadata_for_post(info_extractor, url, ydl)
|
||||
except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
|
||||
# Check if the error indicates deletion
|
||||
deletion_info = detect_deletion(error_message=str(post_e), url=url)
|
||||
if deletion_info:
|
||||
result = Metadata()
|
||||
flag_as_deleted(result, deletion_info)
|
||||
return result
|
||||
|
||||
if "NSFW tweet requires authentication." in str(post_e):
|
||||
logger.warning(str(post_e))
|
||||
return False
|
||||
logger.error("Error downloading metadata for post: {error}", error=str(post_e))
|
||||
return False
|
||||
except Exception as generic_e:
|
||||
@@ -513,7 +535,7 @@ class GenericExtractor(Extractor):
|
||||
)
|
||||
return False
|
||||
|
||||
if result:
|
||||
if result and not result.is_success():
|
||||
extractor_name = "yt-dlp"
|
||||
if info_extractor:
|
||||
extractor_name += f"_{info_extractor.ie_key()}"
|
||||
@@ -525,7 +547,7 @@ class GenericExtractor(Extractor):
|
||||
|
||||
return result
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
def download(self, item: Metadata, skip_proxy: bool = False) -> Metadata:
|
||||
url = item.get_url()
|
||||
|
||||
# TODO: this is a temporary hack until this issue is closed: https://github.com/yt-dlp/yt-dlp/issues/11025
|
||||
@@ -533,6 +555,16 @@ class GenericExtractor(Extractor):
|
||||
url = url.replace("https://ya.ru", "https://yandex.ru")
|
||||
item.set("replaced_url", url)
|
||||
|
||||
# proxy_on_failure_only logic
|
||||
if self.proxy and self.proxy_on_failure_only and not skip_proxy:
|
||||
# when proxy_on_failure_only is True, we first try to download without a proxy and only continue with execution if that fails
|
||||
try:
|
||||
if without_proxy := self.download(item, skip_proxy=True):
|
||||
logger.info("Downloaded successfully without proxy.")
|
||||
return without_proxy
|
||||
except Exception:
|
||||
logger.debug("Download without proxy failed, trying with proxy...")
|
||||
|
||||
ydl_options = [
|
||||
"-o",
|
||||
os.path.join(self.tmp_dir, "%(id)s.%(ext)s"),
|
||||
@@ -546,7 +578,7 @@ class GenericExtractor(Extractor):
|
||||
]
|
||||
|
||||
# proxy handling
|
||||
if self.proxy:
|
||||
if self.proxy and not skip_proxy:
|
||||
ydl_options.extend(["--proxy", self.proxy])
|
||||
|
||||
# max_downloads handling
|
||||
@@ -591,9 +623,9 @@ class GenericExtractor(Extractor):
|
||||
validated_options
|
||||
) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
|
||||
|
||||
result: Metadata = None
|
||||
for info_extractor in self.suitable_extractors(url):
|
||||
result = self.download_for_extractor(info_extractor, url, ydl)
|
||||
if result:
|
||||
return result
|
||||
|
||||
return False
|
||||
local_result: Metadata = self.download_for_extractor(info_extractor, url, ydl)
|
||||
if local_result:
|
||||
result = result.merge(local_result) if result else local_result
|
||||
return result if result else False
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import re
|
||||
import requests
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
@@ -14,12 +15,16 @@ class Tiktok(GenericDropin):
|
||||
It's useful for capturing content that requires a login, like sensitive content.
|
||||
"""
|
||||
|
||||
# Regex pattern to match TikTok photo post URLs
|
||||
PHOTO_URL_REGEX = r"https?://(?:www\.)?tiktok\.com/@[\w\.-]+/photo/\d+"
|
||||
TIKWM_ENDPOINT = "https://www.tikwm.com/api/?url={url}"
|
||||
|
||||
def suitable(self, url, info_extractor) -> bool:
|
||||
"""This dropin (which uses Tikwm) is suitable for *all* Tiktok type URLs - videos, lives, VMs, and users.
|
||||
Return the 'suitable' method from the TikTokIE class."""
|
||||
return any(extractor().suitable(url) for extractor in (TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE))
|
||||
return any(extractor().suitable(url) for extractor in (TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE)) or (
|
||||
re.match(self.PHOTO_URL_REGEX, url) is not None
|
||||
)
|
||||
|
||||
def extract_post(self, url: str, ie_instance):
|
||||
logger.debug("Using Tikwm API to attempt to download tiktok video")
|
||||
@@ -28,56 +33,91 @@ class Tiktok(GenericDropin):
|
||||
|
||||
r = requests.get(endpoint)
|
||||
if r.status_code != 200:
|
||||
raise ValueError(f"unexpected status code '{r.status_code}' from tikwm.com for {url=}:")
|
||||
raise ValueError(f"Unexpected status code '{r.status_code}' from tikwm.com")
|
||||
|
||||
try:
|
||||
json_response = r.json()
|
||||
except ValueError:
|
||||
raise ValueError(f"failed to parse JSON response from tikwm.com for {url=}")
|
||||
raise ValueError("Failed to parse JSON response from tikwm.com")
|
||||
|
||||
if not json_response.get("msg") == "success" or not (api_data := json_response.get("data", {})):
|
||||
raise ValueError(f"failed to get a valid response from tikwm.com for {url=}: {repr(json_response)}")
|
||||
raise ValueError(f"Unable to download with tikwm.com: {repr(json_response)}")
|
||||
|
||||
# tries to get the non-watermarked version first
|
||||
video_url = api_data.pop("play", api_data.pop("wmplay", None))
|
||||
if not video_url:
|
||||
raise ValueError(f"no valid video URL found in response from tikwm.com for {url=}")
|
||||
|
||||
api_data["video_url"] = video_url
|
||||
play_url = api_data.pop("play", api_data.pop("wmplay", None))
|
||||
if play_url and "mime_type=audio" in play_url:
|
||||
play_url = None
|
||||
if play_url:
|
||||
api_data["video_url"] = play_url
|
||||
return api_data
|
||||
|
||||
def keys_to_clean(self, video_data: dict, info_extractor):
|
||||
return ["video_url", "title", "create_time", "author", "cover", "origin_cover", "ai_dynamic_cover", "duration"]
|
||||
return [
|
||||
"video_url",
|
||||
"title",
|
||||
"create_time",
|
||||
"author",
|
||||
"cover",
|
||||
"origin_cover",
|
||||
"ai_dynamic_cover",
|
||||
"duration",
|
||||
"size",
|
||||
"wm_size",
|
||||
"music",
|
||||
"music_info",
|
||||
"play_count",
|
||||
"digg_count",
|
||||
"comment_count",
|
||||
"share_count",
|
||||
"download_count",
|
||||
"collect_count",
|
||||
"anchors",
|
||||
"anchors_extras",
|
||||
"is_ad",
|
||||
"commerce_info",
|
||||
"commercial_video_info",
|
||||
"item_comment_settings",
|
||||
"mentioned_users",
|
||||
] # all of these will be added via api_data in a single metadata field vs individual ones in the generic extractor
|
||||
|
||||
def create_metadata(self, post: dict, ie_instance, archiver, url):
|
||||
# prepare result, start by downloading video
|
||||
result = Metadata()
|
||||
video_url = post.pop("video_url")
|
||||
|
||||
is_success = False
|
||||
# get the cover if possible
|
||||
cover_url = post.pop("origin_cover", post.pop("cover", post.pop("ai_dynamic_cover", None)))
|
||||
if cover_url and (cover_downloaded := archiver.download_from_url(cover_url)):
|
||||
result.add_media(Media(cover_downloaded))
|
||||
|
||||
# get the video or fail
|
||||
video_downloaded = archiver.download_from_url(video_url, f"vid_{post.get('id', '')}")
|
||||
if not video_downloaded:
|
||||
logger.error("Failed to download video")
|
||||
return False
|
||||
video_media = Media(video_downloaded)
|
||||
if duration := post.get("duration", None):
|
||||
video_media.set("duration", duration)
|
||||
result.add_media(video_media)
|
||||
for image_url in post.pop("images", []):
|
||||
if image_downloaded := archiver.download_from_url(image_url):
|
||||
result.add_media(Media(image_downloaded))
|
||||
is_success = True # this is an images post and we got it/them
|
||||
|
||||
# get the video if present, could be an image post
|
||||
if video_url := post.pop("video_url", None):
|
||||
video_downloaded = archiver.download_from_url(video_url, f"vid_{post.get('id', '')}")
|
||||
if not video_downloaded:
|
||||
logger.error("Failed to download video")
|
||||
return False
|
||||
video_media = Media(video_downloaded)
|
||||
if duration := post.pop("duration", None):
|
||||
video_media.set("duration", duration)
|
||||
result.add_media(video_media)
|
||||
is_success = True # this is a video post and we got it
|
||||
|
||||
# add remaining metadata
|
||||
result.set_title(post.get("title", ""))
|
||||
result.set_title(post.pop("title", ""))
|
||||
|
||||
if created_at := post.get("create_time", None):
|
||||
if created_at := post.pop("create_time", None):
|
||||
result.set_timestamp(datetime.fromtimestamp(created_at, tz=timezone.utc))
|
||||
|
||||
if author := post.get("author", None):
|
||||
if author := post.pop("author", None):
|
||||
result.set("author", author)
|
||||
|
||||
result.set("api_data", post)
|
||||
|
||||
result.set("api_data", {k: v for k, v in post.items() if v})
|
||||
if is_success:
|
||||
result.success("yt-dlp_TikTok")
|
||||
else:
|
||||
raise ValueError("Unable to download any media from TikTok post, possibly deleted or private.")
|
||||
return result
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from typing import Type
|
||||
|
||||
from auto_archiver.utils import traverse_obj
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
from auto_archiver.core.metadata import Metadata, Media
|
||||
from auto_archiver.core.extractor import Extractor
|
||||
from yt_dlp.extractor.common import InfoExtractor
|
||||
@@ -58,6 +59,9 @@ class Truth(GenericDropin):
|
||||
# add the media
|
||||
for media in post.get("media_attachments", []):
|
||||
filename = archiver.download_from_url(media["url"])
|
||||
if not filename:
|
||||
logger.warning(f"Failed to download media from {media['url']}")
|
||||
continue
|
||||
result.add_media(Media(filename), id=media.get("id"))
|
||||
|
||||
return result
|
||||
|
||||
@@ -7,7 +7,10 @@ from slugify import slugify
|
||||
from auto_archiver.core.metadata import Metadata, Media
|
||||
from auto_archiver.utils import url as UrlUtil, get_datetime_from_str
|
||||
from auto_archiver.core.extractor import Extractor
|
||||
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted
|
||||
from auto_archiver.modules.generic_extractor.dropin import GenericDropin, InfoExtractor
|
||||
import requests
|
||||
from retrying import retry
|
||||
|
||||
|
||||
class Twitter(GenericDropin):
|
||||
@@ -28,7 +31,85 @@ class Twitter(GenericDropin):
|
||||
|
||||
def extract_post(self, url: str, ie_instance: InfoExtractor):
|
||||
twid = ie_instance._match_valid_url(url).group("id")
|
||||
return ie_instance._extract_status(twid=twid)
|
||||
try:
|
||||
post_data = ie_instance._extract_status(twid=twid)
|
||||
if not post_data or not post_data.get("user") or not post_data.get("created_at"):
|
||||
raise ValueError("Error retrieving post with twitter dropin")
|
||||
return post_data
|
||||
except Exception as e:
|
||||
logger.debug(f"yt-dlp twitter extraction failed: {e}")
|
||||
# try fxtwitter API as fallback
|
||||
return self._fetch_fxtwitter(twid)
|
||||
|
||||
def _fetch_fxtwitter(self, twid: str) -> dict:
    """Fetch tweet data from fxtwitter API and convert to expected format.

    The returned dict mimics the subset of the tweet structure the rest of
    this dropin reads: ``user.name``, ``created_at``, ``full_text`` and
    ``entities.media`` (photos, videos and animated gifs).

    Args:
        twid: the tweet/status id to look up.

    Returns:
        A dict with the keys ``user``, ``created_at``, ``full_text`` and
        ``entities``.

    Raises:
        ValueError: if the API does not answer with HTTP 200 or the response
            carries no tweet payload (after 3 attempts).
    """
    fxtwitter_url = f"https://api.fxtwitter.com/status/{twid}"
    logger.info(f"Falling back to fxtwitter API for tweet extraction: {fxtwitter_url}")

    @retry(wait_random_min=500, wait_random_max=2000, stop_max_attempt_number=3)
    def fetch_fxtwitter_data(url):
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"}
        resp = requests.get(url, headers=headers, timeout=15)
        if resp.status_code != 200:
            raise ValueError(f"Failed to retrieve tweet from fxtwitter API: {resp.status_code}")
        data = resp.json()
        # a JSON `null` tweet (or a non-object body) is as useless as a missing key
        tweet_data = data.get("tweet") if isinstance(data, dict) else None
        if not tweet_data:
            message = data.get("message", "Unknown error") if isinstance(data, dict) else "Unknown error"
            raise ValueError(f"No tweet data in fxtwitter response: {message}")
        return tweet_data

    def convert_variants(item: dict) -> list:
        """Map one fxtwitter media item's variants to the expected variant format."""
        return [
            {
                "url": var.get("url", ""),
                "content_type": var.get("content_type", "video/mp4"),
                "bitrate": var.get("bitrate", 0),
            }
            for var in item.get("variants") or []
        ]

    tweet = fetch_fxtwitter_data(fxtwitter_url)

    # Convert fxtwitter format to expected format.
    # `or {}` / `or []` guard against keys that are present but null.
    author = (tweet.get("author") or {}).get("name", "")
    created_at = tweet.get("created_at", "")  # Format: "Sun Feb 08 18:45:00 +0000 2026"
    full_text = tweet.get("text", "") or tweet.get("raw_text", "")

    # Convert media format
    fx_media = tweet.get("media") or {}

    # Handle photos
    media = [{"type": "photo", "media_url_https": photo.get("url", "")} for photo in fx_media.get("photos") or []]

    # Handle videos
    for video in fx_media.get("videos") or []:
        if converted_variants := convert_variants(video):
            media.append({"type": "video", "video_info": {"variants": converted_variants}})

    # Handle animated gifs (fxtwitter may include these in videos)
    # NOTE(review): if a gif is listed both in "videos" and in "all" it is emitted twice
    # (once as "video", once as "animated_gif") - confirm against the API response shape.
    for item in fx_media.get("all") or []:
        if item.get("type") == "gif" and (converted_variants := convert_variants(item)):
            media.append({"type": "animated_gif", "video_info": {"variants": converted_variants}})

    return {
        "user": {"name": author},
        "created_at": created_at,
        "full_text": full_text,
        "entities": {"media": media},
    }
|
||||
|
||||
def keys_to_clean(self, video_data, info_extractor):
    """Return the tweet keys to clean; neither argument is consulted."""
    removable = ("user", "created_at", "entities", "favorited", "translator_type")
    return list(removable)
|
||||
@@ -37,7 +118,15 @@ class Twitter(GenericDropin):
|
||||
result = Metadata()
|
||||
try:
|
||||
if not tweet.get("user") or not tweet.get("created_at"):
|
||||
raise ValueError("Error retreiving post. Are you sure it exists?")
|
||||
# Check for deletion indicators
|
||||
deletion_info = detect_deletion(
|
||||
video_data=tweet, url=url, error_message="Missing user or created_at fields"
|
||||
)
|
||||
if deletion_info:
|
||||
flag_as_deleted(result, deletion_info)
|
||||
return result
|
||||
|
||||
raise ValueError("Error retrieving post. Are you sure it exists?")
|
||||
timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
|
||||
except (ValueError, KeyError) as ex:
|
||||
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
|
||||
@@ -68,5 +157,8 @@ class Twitter(GenericDropin):
|
||||
mimetype = variant["content_type"]
|
||||
ext = mimetypes.guess_extension(mimetype)
|
||||
media.filename = archiver.download_from_url(media.get("src"), f"{slugify(url)}_{i}{ext}")
|
||||
if not media.filename:
|
||||
logger.warning(f"Failed to download media from {media.get('src')}")
|
||||
continue
|
||||
result.add_media(media)
|
||||
return result
|
||||
|
||||
@@ -32,26 +32,37 @@ class GsheetsFeederDB(Feeder, Database):
|
||||
if not self.sheet and not self.sheet_id:
|
||||
raise ValueError("You need to define either a 'sheet' name or a 'sheet_id' in your manifest.")
|
||||
|
||||
def open_sheet(self):
|
||||
@retry(
|
||||
wait_exponential_multiplier=1,
|
||||
stop_max_attempt_number=6,
|
||||
)
|
||||
def open_sheet(self) -> gspread.Spreadsheet:
|
||||
if self.sheet:
|
||||
return self.gsheets_client.open(self.sheet)
|
||||
else:
|
||||
return self.gsheets_client.open_by_key(self.sheet_id)
|
||||
|
||||
# NOTE(review): `retrying` interprets wait_exponential_multiplier in milliseconds,
# so a multiplier of 1 waits only a few ms between attempts - confirm whether 1000 was intended.
@retry(
    wait_exponential_multiplier=1,
    stop_max_attempt_number=6,
)
def enumerate_sheets(self, sheet) -> Iterator[gspread.Worksheet]:
    """Return an iterator over the worksheets of ``sheet``.

    The worksheets are fetched eagerly inside this call. Written as a
    generator, the body (and therefore ``sheet.worksheets()``) would only run
    on the first ``next()``, i.e. after the ``@retry`` wrapper had already
    returned, so a failing call would never be retried.

    Args:
        sheet: the opened spreadsheet whose ``worksheets()`` are listed.

    Returns:
        An iterator yielding each worksheet in the order ``worksheets()`` reports them.
    """
    return iter(sheet.worksheets())
|
||||
|
||||
def __iter__(self) -> Iterator[Metadata]:
|
||||
sh = self.open_sheet()
|
||||
for ii, worksheet in enumerate(sh.worksheets()):
|
||||
if not self.should_process_sheet(worksheet.title):
|
||||
logger.debug(f"Skipped worksheet '{worksheet.title}' due to allow/block rules")
|
||||
continue
|
||||
logger.info(f"Opening worksheet {ii=}: {worksheet.title=} header={self.header}")
|
||||
gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
|
||||
if len(missing_cols := self.missing_required_columns(gw)):
|
||||
logger.debug(
|
||||
f"Skipped worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}"
|
||||
)
|
||||
continue
|
||||
with logger.contextualize(worksheet=f"{sh.title}:{worksheet.title}"):
|
||||
spreadsheet = self.open_sheet()
|
||||
for worksheet in self.enumerate_sheets(spreadsheet):
|
||||
with logger.contextualize(worksheet=f"{spreadsheet.title}:{worksheet.title}"):
|
||||
if not self.should_process_sheet(worksheet.title):
|
||||
logger.debug("Skipped worksheet due to allow/block rules")
|
||||
continue
|
||||
logger.info(f"Opening worksheet header={self.header}")
|
||||
gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
|
||||
if len(missing_cols := self.missing_required_columns(gw)):
|
||||
logger.debug(f"Skipped worksheet due to missing required column(s) for {missing_cols}")
|
||||
continue
|
||||
|
||||
# process and yield metadata here:
|
||||
yield from self._process_rows(gw)
|
||||
logger.info(f"Finished worksheet {worksheet.title}")
|
||||
|
||||
@@ -25,6 +25,9 @@ class HashEnricher(Enricher):
|
||||
logger.debug(f"Calculating media hashes with algo={self.algorithm}")
|
||||
|
||||
for i, m in enumerate(to_enrich.media):
|
||||
if not m.filename:
|
||||
logger.warning(f"Skipping hash for media without filename: {m}")
|
||||
continue
|
||||
if len(hd := self.calculate_hash(m.filename)):
|
||||
to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
|
||||
|
||||
|
||||
@@ -99,7 +99,10 @@ class InstagramAPIExtractor(Extractor):
|
||||
result.set_title(user.get("full_name", username)).set("data", user)
|
||||
if pic_url := user.get("profile_pic_url_hd", user.get("profile_pic_url")):
|
||||
filename = self.download_from_url(pic_url)
|
||||
result.add_media(Media(filename=filename), id="profile_picture")
|
||||
if filename:
|
||||
result.add_media(Media(filename=filename), id="profile_picture")
|
||||
else:
|
||||
logger.warning(f"Failed to download profile picture from {pic_url}")
|
||||
|
||||
count_posts = 0
|
||||
if self.full_profile:
|
||||
@@ -202,7 +205,10 @@ class InstagramAPIExtractor(Extractor):
|
||||
|
||||
if cover_media := h_info.get("cover_media", {}).get("cropped_image_version", {}).get("url"):
|
||||
filename = self.download_from_url(cover_media)
|
||||
result.add_media(Media(filename=filename), id=f"cover_media highlight {id}")
|
||||
if filename:
|
||||
result.add_media(Media(filename=filename), id=f"cover_media highlight {id}")
|
||||
else:
|
||||
logger.warning(f"Failed to download cover media from {cover_media}")
|
||||
|
||||
items = h_info.get("items", [])[::-1] # newest to oldest
|
||||
items = items[: min(max_to_download, len(items))]
|
||||
@@ -345,7 +351,10 @@ class InstagramAPIExtractor(Extractor):
|
||||
image_media = None
|
||||
if image_url := item.get("thumbnail_url"):
|
||||
filename = self.download_from_url(image_url, verbose=False)
|
||||
image_media = Media(filename=filename)
|
||||
if filename:
|
||||
image_media = Media(filename=filename)
|
||||
else:
|
||||
logger.warning(f"Failed to download thumbnail from {image_url}")
|
||||
|
||||
# retrieve video info
|
||||
best_id = item.get("id", item.get("pk"))
|
||||
@@ -357,16 +366,19 @@ class InstagramAPIExtractor(Extractor):
|
||||
|
||||
if video_url := item.get("video_url"):
|
||||
filename = self.download_from_url(video_url, verbose=False)
|
||||
video_media = Media(filename=filename)
|
||||
if taken_at:
|
||||
video_media.set("date", taken_at)
|
||||
if code:
|
||||
video_media.set("url", f"https://www.instagram.com/p/{code}")
|
||||
if caption_text:
|
||||
video_media.set("text", caption_text)
|
||||
video_media.set("preview", [image_media])
|
||||
video_media.set("data", [item])
|
||||
return item, video_media, f"{context or 'video'} {best_id}"
|
||||
if filename:
|
||||
video_media = Media(filename=filename)
|
||||
if taken_at:
|
||||
video_media.set("date", taken_at)
|
||||
if code:
|
||||
video_media.set("url", f"https://www.instagram.com/p/{code}")
|
||||
if caption_text:
|
||||
video_media.set("text", caption_text)
|
||||
video_media.set("preview", [image_media])
|
||||
video_media.set("data", [item])
|
||||
return item, video_media, f"{context or 'video'} {best_id}"
|
||||
else:
|
||||
logger.warning(f"Failed to download video from {video_url}")
|
||||
elif image_media:
|
||||
if taken_at:
|
||||
image_media.set("date", taken_at)
|
||||
|
||||
@@ -25,6 +25,9 @@ class MetaEnricher(Enricher):
|
||||
logger.debug(f"Calculating archive file sizes for {len(to_enrich.media)} media files")
|
||||
total_size = 0
|
||||
for media in to_enrich.get_all_media():
|
||||
if not media.filename:
|
||||
logger.warning(f"Skipping file size for media without filename: {media}")
|
||||
continue
|
||||
file_stats = os.stat(media.filename)
|
||||
media.set("bytes", file_stats.st_size)
|
||||
media.set("size", self.human_readable_bytes(file_stats.st_size))
|
||||
|
||||
@@ -3,6 +3,13 @@
|
||||
"type": ["enricher"],
|
||||
"requires_setup": True,
|
||||
"dependencies": {"python": ["loguru"], "bin": ["exiftool"]},
|
||||
"configs": {
|
||||
"look_for_keys": {
|
||||
"default": [],
|
||||
"help": "list of lowercased metadata keys that will be included in the enriched metadata. Special keys: 'author', 'datetimes', 'location' to include related metadata fields. The default empty list `[]` means all metadata will be included.",
|
||||
"type": "list",
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
Extracts metadata information from files using ExifTool.
|
||||
|
||||
|
||||
@@ -16,6 +16,8 @@ class MetadataEnricher(Enricher):
|
||||
|
||||
for i, m in enumerate(to_enrich.media):
|
||||
if len(md := self.get_metadata(m.filename)):
|
||||
if self.look_for_keys != []:
|
||||
md = self.select_metadata(md, self.look_for_keys)
|
||||
to_enrich.media[i].set("metadata", md)
|
||||
|
||||
def get_metadata(self, filename: str) -> dict:
|
||||
@@ -23,7 +25,6 @@ class MetadataEnricher(Enricher):
|
||||
# Run ExifTool command to extract metadata from the file
|
||||
cmd = ["exiftool", filename]
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
|
||||
# Process the output to extract individual metadata fields
|
||||
metadata = {}
|
||||
for line in result.stdout.splitlines():
|
||||
@@ -35,3 +36,33 @@ class MetadataEnricher(Enricher):
|
||||
except Exception as e:
|
||||
logger.error(f"Error occurred: {e}: {traceback.format_exc()}")
|
||||
return {}
|
||||
|
||||
def select_metadata(self, all_md, requested_metadata_keys):
    """
    Select from the general exiftool output only the metadata the user asked for.

    A field is kept when its value is non-empty and either
    - its key (as-is or lowercased) is listed in ``requested_metadata_keys``, or
    - a special group key was requested and the lowercased field key contains one
      of that group's terms:
        'author'                -> author, producer, creator
        'datetime'/'datetimes'  -> date, time
        'location'              -> gps, latitude, longitude

    Args:
        all_md: dict of metadata key -> value (values presumably strings parsed
            from exiftool output - TODO confirm against get_metadata).
        requested_metadata_keys: iterable of lowercased keys and/or special group keys.

    Returns:
        dict with the selected subset of ``all_md``, original key casing preserved.
    """
    # defining the batches of metadata that get pulled for special terms
    special_groups = {
        "author": ("author", "producer", "creator"),
        "datetime": ("date", "time"),
        "location": ("gps", "latitude", "longitude"),
    }

    requested = set(requested_metadata_keys)
    # the config help text advertises 'datetimes' while the lookup key is 'datetime': accept both
    if "datetimes" in requested:
        requested.add("datetime")

    # flatten the terms of every special group that was actually requested
    active_terms = [term for group, terms in special_groups.items() if group in requested for term in terms]

    specified_md = {}
    for md_key, md_value in all_md.items():
        # empty values are never selected, whichever rule matched the key
        if not md_value:
            continue
        md_key_lower = md_key.lower()
        requested_directly = md_key in requested or md_key_lower in requested
        if requested_directly or any(term in md_key_lower for term in active_terms):
            specified_md[md_key] = md_value
    return specified_md
|
||||
|
||||
@@ -49,10 +49,18 @@ class TelegramExtractor(Extractor):
|
||||
if not len(image_urls):
|
||||
return False
|
||||
for img_url in image_urls:
|
||||
result.add_media(Media(self.download_from_url(img_url)))
|
||||
filename = self.download_from_url(img_url)
|
||||
if not filename:
|
||||
logger.warning(f"Failed to download image from {img_url}")
|
||||
continue
|
||||
result.add_media(Media(filename))
|
||||
else:
|
||||
video_url = video.get("src")
|
||||
m_video = Media(self.download_from_url(video_url))
|
||||
video_filename = self.download_from_url(video_url)
|
||||
if not video_filename:
|
||||
logger.warning(f"Failed to download video from {video_url}")
|
||||
return False
|
||||
m_video = Media(video_filename)
|
||||
# extract duration from HTML
|
||||
try:
|
||||
duration = s.find_all("time")[0].contents[0]
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import asyncio
|
||||
import os
|
||||
import shutil
|
||||
import re
|
||||
@@ -53,6 +54,16 @@ class TelethonExtractor(Extractor):
|
||||
logger.debug(f"Making a copy of the session file {base_session_filepath} to {self.session_file}.session")
|
||||
shutil.copy(base_session_filepath, f"{self.session_file}.session")
|
||||
|
||||
# ensure a running event loop exists (Needed when used by Celery workers which may close the default one)
|
||||
try:
|
||||
loop = asyncio.get_event_loop()
|
||||
if loop.is_closed():
|
||||
loop = asyncio.new_event_loop()
|
||||
asyncio.set_event_loop(loop)
|
||||
except RuntimeError:
|
||||
loop = asyncio.new_event_loop()
|
||||
asyncio.set_event_loop(loop)
|
||||
|
||||
# initiate the client
|
||||
self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
|
||||
|
||||
@@ -190,6 +201,9 @@ class TelethonExtractor(Extractor):
|
||||
)
|
||||
for i, om_url in enumerate(other_media_urls):
|
||||
filename = self.download_from_url(om_url, f"{chat}_{group_id}_{i}")
|
||||
if not filename:
|
||||
logger.warning(f"Failed to download media from {om_url}")
|
||||
continue
|
||||
result.add_media(Media(filename=filename), id=f"{group_id}_{i}")
|
||||
|
||||
filename_dest = os.path.join(self.tmp_dir, f"{chat}_{group_id}", str(mp.id))
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
# "http://tsa.sinpe.fi.cr/tsaHttp/", # self-signed
|
||||
# "http://tsa.cra.ge/signserver/tsa?workerName=qtsa", # self-signed
|
||||
"http://tss.cnbs.gob.hn/TSS/HttpTspServer",
|
||||
"http://dss.nowina.lu/pki-factory/tsa/good-tsa",
|
||||
# "http://dss.nowina.lu/pki-factory/tsa/good-tsa",
|
||||
# "https://freetsa.org/tsr", # self-signed
|
||||
],
|
||||
"help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.",
|
||||
|
||||
@@ -4,12 +4,12 @@ from importlib.metadata import version
|
||||
import hashlib
|
||||
|
||||
from slugify import slugify
|
||||
from retrying import retry
|
||||
import requests
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
|
||||
from rfc3161_client import (decode_timestamp_response,TimestampRequestBuilder,TimeStampResponse, VerifierBuilder)
|
||||
from rfc3161_client import (decode_timestamp_response, TimestampRequestBuilder, TimeStampResponse, VerifierBuilder)
|
||||
from rfc3161_client import VerificationError as Rfc3161VerificationError
|
||||
from rfc3161_client.base import HashAlgorithm
|
||||
from rfc3161_client.tsp import SignedData
|
||||
from cryptography import x509
|
||||
from cryptography.hazmat.primitives import serialization
|
||||
@@ -60,7 +60,6 @@ class TimestampingEnricher(Enricher):
|
||||
logger.debug(f"No hashes found")
|
||||
return
|
||||
|
||||
|
||||
hashes_fn = os.path.join(self.tmp_dir, "hashes.txt")
|
||||
|
||||
data_to_sign = "\n".join(hashes)
|
||||
@@ -75,7 +74,7 @@ class TimestampingEnricher(Enricher):
|
||||
|
||||
logger.debug(f"Timestamping with {tsa_url=}")
|
||||
signed: TimeStampResponse = self.sign_data(tsa_url, message)
|
||||
|
||||
|
||||
# fail if there's any issue with the certificates, uses certifi list of trusted CAs or the user-defined `cert_authorities`
|
||||
root_cert = self.verify_signed(signed, message)
|
||||
|
||||
@@ -113,7 +112,7 @@ class TimestampingEnricher(Enricher):
|
||||
f.write(timestamp_token)
|
||||
return tst_path
|
||||
|
||||
def verify_signed(self, timestamp_response: TimeStampResponse, message: bytes) -> x509.Certificate:
|
||||
def verify_signed(self, timestamp_response: TimeStampResponse, message: bytes) -> x509.Certificate:
|
||||
"""
|
||||
Verify a Signed Timestamp Response is trusted by a known Certificate Authority.
|
||||
|
||||
@@ -136,7 +135,7 @@ class TimestampingEnricher(Enricher):
|
||||
|
||||
if not cert_authorities:
|
||||
raise ValueError(f"No trusted roots found in {trusted_root_path}.")
|
||||
|
||||
|
||||
timestamp_certs = self.tst_certs(timestamp_response)
|
||||
intermediate_certs = timestamp_certs[1:-1]
|
||||
|
||||
@@ -148,7 +147,7 @@ class TimestampingEnricher(Enricher):
|
||||
message_hash = hashlib.sha256(message).digest()
|
||||
else:
|
||||
raise ValueError(f"Unsupported hash algorithm: {hash_algorithm}")
|
||||
|
||||
|
||||
for certificate in cert_authorities:
|
||||
builder = VerifierBuilder()
|
||||
builder.add_root_certificate(certificate)
|
||||
@@ -158,7 +157,6 @@ class TimestampingEnricher(Enricher):
|
||||
|
||||
verifier = builder.build()
|
||||
|
||||
|
||||
try:
|
||||
verifier.verify(timestamp_response, message_hash)
|
||||
return certificate
|
||||
@@ -171,23 +169,38 @@ class TimestampingEnricher(Enricher):
|
||||
# see https://github.com/sigstore/sigstore-python/blob/99948d5b80525a5a104e904ffea58169dc6e0629/sigstore/_internal/timestamp.py#L84-L121
|
||||
|
||||
timestamp_request = (
|
||||
TimestampRequestBuilder().data(bytes_data).nonce(nonce=True).build()
|
||||
)
|
||||
try:
|
||||
TimestampRequestBuilder().data(bytes_data).nonce(nonce=True).build()
|
||||
)
|
||||
|
||||
@retry(
|
||||
wait_exponential_multiplier=1,
|
||||
stop_max_attempt_number=2,
|
||||
)
|
||||
def sign_with_retry():
|
||||
response = self.session.post(tsa_url, data=timestamp_request.as_bytes(), timeout=10)
|
||||
response.raise_for_status()
|
||||
return response
|
||||
|
||||
try:
|
||||
response = sign_with_retry()
|
||||
except requests.RequestException as e:
|
||||
logger.error(f"Error while sending request to {tsa_url=}: {e}")
|
||||
raise
|
||||
|
||||
@retry(
|
||||
wait_exponential_multiplier=1,
|
||||
stop_max_attempt_number=2,
|
||||
)
|
||||
def decode_with_retry(response):
|
||||
return decode_timestamp_response(response.content)
|
||||
# Check that we can parse the response but do not *verify* it
|
||||
try:
|
||||
timestamp_response = decode_timestamp_response(response.content)
|
||||
timestamp_response = decode_with_retry(response)
|
||||
except ValueError as e:
|
||||
logger.error(f"Invalid timestamp response from server {tsa_url}: {e}")
|
||||
raise
|
||||
return timestamp_response
|
||||
|
||||
|
||||
def tst_certs(self, tsp_response: TimeStampResponse):
|
||||
signed_data: SignedData = tsp_response.signed_data
|
||||
certs = [x509.load_der_x509_certificate(c) for c in signed_data.certificates]
|
||||
@@ -196,7 +209,7 @@ class TimestampingEnricher(Enricher):
|
||||
if len(certs) == 1:
|
||||
return certs
|
||||
|
||||
while(len(ordered_certs) < len(certs)):
|
||||
while (len(ordered_certs) < len(certs)):
|
||||
if len(ordered_certs) == 0:
|
||||
for cert in certs:
|
||||
if not [c for c in certs if cert.subject == c.issuer]:
|
||||
@@ -220,7 +233,7 @@ class TimestampingEnricher(Enricher):
|
||||
|
||||
cert_chain = []
|
||||
for i, cert in enumerate(certificates):
|
||||
cert_fn = os.path.join(self.tmp_dir, f"{i+1} – {str(cert.serial_number)[:20]}.crt")
|
||||
cert_fn = os.path.join(self.tmp_dir, f"{i + 1} – {str(cert.serial_number)[:20]}.crt")
|
||||
with open(cert_fn, "wb") as f:
|
||||
f.write(cert.public_bytes(encoding=serialization.Encoding.PEM))
|
||||
cert_chain.append(Media(filename=cert_fn).set("subject", cert.subject.get_attributes_for_oid(x509.NameOID.COMMON_NAME)[0].value))
|
||||
|
||||
@@ -114,6 +114,9 @@ class TwitterApiExtractor(Extractor):
|
||||
logger.info(f"Found media {media}")
|
||||
ext = mimetypes.guess_extension(mimetype)
|
||||
media.filename = self.download_from_url(media.get("src"), f"{slugify(url)}_{i}{ext}")
|
||||
if not media.filename:
|
||||
logger.warning(f"Failed to download media from {media.get('src')}")
|
||||
continue
|
||||
result.add_media(media)
|
||||
|
||||
result.set_content(
|
||||
|
||||
@@ -24,8 +24,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
||||
self.use_docker = os.environ.get("WACZ_ENABLE_DOCKER") or not os.environ.get("RUNNING_IN_DOCKER")
|
||||
self.docker_in_docker = os.environ.get("WACZ_ENABLE_DOCKER") and os.environ.get("RUNNING_IN_DOCKER")
|
||||
|
||||
self.crawl_id = random_str(8)
|
||||
self.cwd_dind = f"/crawls/crawls{self.crawl_id}"
|
||||
self.cwd_dind = f"/crawls/crawls{random_str(8)}"
|
||||
self.browsertrix_home_host = os.environ.get("BROWSERTRIX_HOME_HOST")
|
||||
self.browsertrix_home_container = os.environ.get("BROWSERTRIX_HOME_CONTAINER") or self.browsertrix_home_host
|
||||
# create crawls folder if not exists, so it can be safely removed in cleanup
|
||||
@@ -51,7 +50,8 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
||||
|
||||
url = to_enrich.get_url()
|
||||
|
||||
collection = self.crawl_id
|
||||
crawl_id = random_str(8)
|
||||
collection = crawl_id
|
||||
browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir)
|
||||
browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host
|
||||
|
||||
@@ -83,8 +83,10 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
||||
# "--blockAds" # note: this has been known to cause issues on cloudflare protected sites
|
||||
]
|
||||
|
||||
crawl_cwd_dind = os.path.join(self.cwd_dind, crawl_id)
|
||||
if self.docker_in_docker:
|
||||
cmd.extend(["--cwd", self.cwd_dind])
|
||||
os.makedirs(crawl_cwd_dind, exist_ok=True)
|
||||
cmd.extend(["--cwd", crawl_cwd_dind])
|
||||
|
||||
if self.auth_for_site(url):
|
||||
# there's an auth for this site, but browsertrix only supports username/password auth
|
||||
@@ -109,7 +111,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
||||
] + cmd
|
||||
|
||||
if self.profile:
|
||||
profile_file = f"profile-{self.crawl_id}.tar.gz"
|
||||
profile_file = f"profile-{crawl_id}.tar.gz"
|
||||
profile_fn = os.path.join(browsertrix_home_container, profile_file)
|
||||
logger.debug(f"Copying {self.profile} to {profile_fn}")
|
||||
shutil.copyfile(self.profile, profile_fn)
|
||||
@@ -137,7 +139,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
||||
return False
|
||||
|
||||
if self.docker_in_docker:
|
||||
wacz_fn = os.path.join(self.cwd_dind, "collections", collection, f"{collection}.wacz")
|
||||
wacz_fn = os.path.join(crawl_cwd_dind, "collections", collection, f"{collection}.wacz")
|
||||
elif self.use_docker:
|
||||
wacz_fn = os.path.join(browsertrix_home_container, "collections", collection, f"{collection}.wacz")
|
||||
else:
|
||||
@@ -152,7 +154,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
||||
self.extract_media_from_wacz(to_enrich, wacz_fn)
|
||||
|
||||
if self.docker_in_docker:
|
||||
jsonl_fn = os.path.join(self.cwd_dind, "collections", collection, "pages", "pages.jsonl")
|
||||
jsonl_fn = os.path.join(crawl_cwd_dind, "collections", collection, "pages", "pages.jsonl")
|
||||
elif self.use_docker:
|
||||
jsonl_fn = os.path.join(browsertrix_home_container, "collections", collection, "pages", "pages.jsonl")
|
||||
else:
|
||||
|
||||
@@ -2,7 +2,7 @@ import json
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
import time
|
||||
import requests
|
||||
|
||||
from urllib3.exceptions import MaxRetryError
|
||||
from auto_archiver.core import Extractor, Enricher
|
||||
from auto_archiver.utils import url as UrlUtil
|
||||
from auto_archiver.core import Metadata
|
||||
@@ -45,7 +45,14 @@ class WaybackExtractorEnricher(Enricher, Extractor):
|
||||
if self.if_not_archived_within:
|
||||
post_data["if_not_archived_within"] = self.if_not_archived_within
|
||||
# see https://docs.google.com/document/d/1Nsv52MvSjbLb2PCpHlat0gkzw0EvtSgpKHu4mk0MnrA for more options
|
||||
r = requests.post("https://web.archive.org/save/", headers=ia_headers, data=post_data, proxies=proxies)
|
||||
try:
|
||||
r = requests.post("https://web.archive.org/save/", headers=ia_headers, data=post_data, proxies=proxies)
|
||||
except MaxRetryError as e:
|
||||
logger.warning(
|
||||
f"MaxRetryError during Wayback POST call to /save, this may be do to a high number of calls leading to rate limiting: {e}"
|
||||
)
|
||||
to_enrich.set("wayback", "failed: possible rate limit")
|
||||
return False
|
||||
|
||||
if r.status_code != 200:
|
||||
logger.error(em := f"Internet archive failed with status of {r.status_code}: {r.json()}")
|
||||
@@ -76,6 +83,9 @@ class WaybackExtractorEnricher(Enricher, Extractor):
|
||||
if r_status.status_code == 200 and r_json["status"] == "success":
|
||||
wayback_url = f"https://web.archive.org/web/{r_json['timestamp']}/{r_json['original_url']}"
|
||||
elif r_status.status_code != 200 or r_json["status"] != "pending":
|
||||
if r_json.get("status_ext") in ["error:blocked-url", "error:unauthorized"]:
|
||||
logger.warning("Wayback cannot currently archive the URL, skipping.")
|
||||
to_enrich.set("wayback", r_json.get("status_ext"))
|
||||
logger.error(f"Wayback failed with {r_json}")
|
||||
return False
|
||||
except requests.exceptions.RequestException as e:
|
||||
|
||||
@@ -2,6 +2,13 @@ from loguru import logger
|
||||
import json
|
||||
|
||||
|
||||
def type_serializer(obj):
    """JSON `default=` hook: classes become their name, anything else its str()."""
    return obj.__name__ if isinstance(obj, type) else str(obj)
|
||||
|
||||
|
||||
def extract_location(record, short=False):
|
||||
"""Extracts the file name, function name, and line number from the log record."""
|
||||
if short:
|
||||
@@ -35,11 +42,11 @@ def serialize_for_console(record):
|
||||
subset.pop("time", None)
|
||||
if not subset:
|
||||
return ""
|
||||
return json.dumps(subset, ensure_ascii=False)
|
||||
return json.dumps(subset, ensure_ascii=False, default=type_serializer)
|
||||
|
||||
|
||||
def serialize(record):
|
||||
return json.dumps(extract_log_data(record), ensure_ascii=False)
|
||||
return json.dumps(extract_log_data(record), ensure_ascii=False, default=type_serializer)
|
||||
|
||||
|
||||
def patching(record):
|
||||
|
||||
273
src/auto_archiver/utils/deletion_detection.py
Normal file
273
src/auto_archiver/utils/deletion_detection.py
Normal file
@@ -0,0 +1,273 @@
|
||||
"""
|
||||
Deletion Detection Utilities
|
||||
|
||||
Provides a best-effort detection of deleted, missing, or unavailable content
|
||||
across various social media platforms based on presence of expected keywords.
|
||||
|
||||
Identifying removed content helps to:
|
||||
- Document content that existed but was deleted
|
||||
- Track patterns of content removal
|
||||
- Preserve metadata about missing content
|
||||
"""
|
||||
|
||||
from typing import Optional, Dict, List
|
||||
from auto_archiver.utils.custom_logger import logger
|
||||
from urllib.parse import urlparse
|
||||
|
||||
|
||||
class DeletionIndicators:
|
||||
"""
|
||||
Platform-specific indicators that content has been deleted or is unavailable, alongside generic indicators.
|
||||
"""
|
||||
|
||||
# Twitter/X deletion indicators
|
||||
TWITTER = [
|
||||
"Hmm...this page doesn't exist",
|
||||
"Try searching for something else",
|
||||
"This Tweet is unavailable",
|
||||
"This account doesn't exist",
|
||||
"This Tweet has been deleted",
|
||||
"This account has been suspended",
|
||||
"Sorry, that page doesn't exist",
|
||||
"The Tweet you're looking for isn't available",
|
||||
]
|
||||
|
||||
# Facebook deletion indicators
|
||||
FACEBOOK = [
|
||||
"This content isn't available",
|
||||
"Sorry, this content isn't available",
|
||||
"This content is no longer available",
|
||||
"The link you followed may be broken",
|
||||
"Page Not Found",
|
||||
"Content Not Found",
|
||||
"This content is no longer on Facebook",
|
||||
]
|
||||
|
||||
# Instagram deletion indicators
|
||||
INSTAGRAM = [
|
||||
"Sorry, this page isn't available",
|
||||
"The link you followed may be broken",
|
||||
"Media not found or unavailable",
|
||||
"This post is no longer available",
|
||||
"This account is private",
|
||||
]
|
||||
|
||||
# TikTok deletion indicators
|
||||
TIKTOK = [
|
||||
"Couldn't find this account",
|
||||
"This video is no longer available",
|
||||
"This video is currently unavailable",
|
||||
"Video not found",
|
||||
"This video may have been deleted",
|
||||
]
|
||||
|
||||
# YouTube deletion indicators
|
||||
YOUTUBE = [
|
||||
"This video isn't available anymore",
|
||||
"Video unavailable",
|
||||
"This video has been removed",
|
||||
"This video is no longer available",
|
||||
"This video is private",
|
||||
"This video has been removed by the uploader",
|
||||
"This video has been deleted",
|
||||
]
|
||||
|
||||
# Reddit deletion indicators
|
||||
REDDIT = [
|
||||
"this post has been removed",
|
||||
"this comment has been removed",
|
||||
"[removed]",
|
||||
"[deleted]",
|
||||
"page not found",
|
||||
"there doesn't seem to be anything here",
|
||||
]
|
||||
|
||||
# VK deletion indicators
|
||||
VK = [
|
||||
"Post deleted",
|
||||
"Page not found",
|
||||
"Content unavailable",
|
||||
"Access denied",
|
||||
]
|
||||
|
||||
# Telegram deletion indicators
|
||||
TELEGRAM = [
|
||||
"Message not found",
|
||||
"Deleted message",
|
||||
"Channel is private",
|
||||
]
|
||||
|
||||
# Generic indicators (work across platforms)
|
||||
GENERIC = [
|
||||
"has been removed",
|
||||
"no longer available",
|
||||
"content removed",
|
||||
"access denied",
|
||||
"page not found",
|
||||
]
|
||||
|
||||
@classmethod
def all_indicators(cls) -> List[str]:
    """Return one new list holding every platform's indicators followed by the generic ones."""
    # Order matters to callers that report the first match: platform lists first, GENERIC last.
    groups = (
        cls.TWITTER,
        cls.FACEBOOK,
        cls.INSTAGRAM,
        cls.TIKTOK,
        cls.YOUTUBE,
        cls.REDDIT,
        cls.VK,
        cls.TELEGRAM,
        cls.GENERIC,
    )
    return [phrase for group in groups for phrase in group]
|
||||
|
||||
@classmethod
def for_url(cls, url: str) -> List[str]:
    """Return the indicators relevant to ``url``.

    The platform is resolved with ``_extract_platform``. A recognised platform
    yields its own indicators followed by the generic ones; any other platform
    yields the generic indicators only.

    Args:
        url: The URL whose platform decides which indicators apply.

    Returns:
        A new list on every call, so callers may modify the result without
        touching the class-level constants.
    """
    platform_lists = {
        "twitter": cls.TWITTER,
        "facebook": cls.FACEBOOK,
        "instagram": cls.INSTAGRAM,
        "tiktok": cls.TIKTOK,
        "youtube": cls.YOUTUBE,
        "reddit": cls.REDDIT,
        "vk": cls.VK,
        "telegram": cls.TELEGRAM,
    }
    platform_specific = platform_lists.get(_extract_platform(url), [])
    # ``+`` always builds a fresh list, including for unknown platforms, where
    # the shared GENERIC list itself used to be handed out.
    return platform_specific + cls.GENERIC
|
||||
|
||||
|
||||
def _first_matching_indicator(text: object, indicators: List[str]) -> Optional[str]:
    """Return the first indicator contained in ``text`` (case-insensitive), or None."""
    # Lower-case the (possibly large) text once rather than once per indicator.
    haystack = str(text).lower()
    for indicator in indicators:
        if indicator.lower() in haystack:
            return indicator
    return None


def detect_deletion(
    html_content: Optional[str] = None,
    page_title: Optional[str] = None,
    error_message: Optional[str] = None,
    url: Optional[str] = None,
    video_data: Optional[dict] = None,
) -> Optional[Dict[str, object]]:
    """
    Best-effort deletion detection across multiple signals.

    Signals are checked in a fixed order and the first hit wins: HTML content,
    page title, error message, the ``availability`` field of the video
    metadata, then the ``title``/``description``/``fulltitle`` metadata fields.
    All text comparisons are case-insensitive substring matches.

    Args:
        html_content: Raw HTML source of the page
        page_title: Browser page title
        error_message: Any error message from the extractor
        url: The URL being archived. When given, only that platform's
            indicators plus the generic ones are used; otherwise every
            known indicator is used and the platform is "unknown".
        video_data: Video metadata from yt-dlp or other extractors

    Returns:
        Dictionary with deletion details if detected, None otherwise.
        Format: {
            "is_deleted": True,
            "indicator": "specific text that was found",
            "source": "html_content|page_title|error_message|video_metadata|video_metadata_<key>",
            "platform": "twitter|facebook|etc"
        }
    """
    # Determine indicators to check based on URL
    if url:
        indicators = DeletionIndicators.for_url(url)
        platform = _extract_platform(url)
    else:
        indicators = DeletionIndicators.all_indicators()
        platform = "unknown"

    # (value stored under "source", label used in the log line, text to scan)
    text_signals = (
        ("html_content", "HTML", html_content),
        ("page_title", "page title", page_title),
        ("error_message", "error", error_message),
    )
    for source, label, text in text_signals:
        if not text:
            continue
        indicator = _first_matching_indicator(text, indicators)
        if indicator is not None:
            logger.info(f"Deletion detected in {label}: '{indicator}' found for {url}")
            return {"is_deleted": True, "indicator": indicator, "source": source, "platform": platform}

    # Check video metadata (from yt-dlp)
    if video_data:
        # Check if the extractor flagged it as unavailable
        availability = video_data.get("availability")
        if availability in ["unavailable", "private", "deleted"]:
            logger.info(f"Deletion detected in metadata: availability={availability}")
            return {
                "is_deleted": True,
                "indicator": f"availability: {availability}",
                "source": "video_metadata",
                "platform": platform,
            }

        # Check description/title for deletion indicators
        for key in ["title", "description", "fulltitle"]:
            value = video_data.get(key)
            if value is None:
                # Missing or explicitly null field: nothing to scan.
                continue
            indicator = _first_matching_indicator(value, indicators)
            if indicator is not None:
                logger.info(f"Deletion detected in {key}: '{indicator}'")
                return {
                    "is_deleted": True,
                    "indicator": indicator,
                    "source": f"video_metadata_{key}",
                    "platform": platform,
                }

    return None
|
||||
|
||||
|
||||
def _extract_platform(url: str) -> str:
|
||||
"""Extracts platform name from URL."""
|
||||
parsed = urlparse(url)
|
||||
domain = parsed.netloc
|
||||
|
||||
if "twitter.com" in domain or "x.com" in domain:
|
||||
return "twitter"
|
||||
elif "facebook.com" in domain or "fb.com" in domain:
|
||||
return "facebook"
|
||||
elif "instagram.com" in domain:
|
||||
return "instagram"
|
||||
elif "tiktok.com" in domain:
|
||||
return "tiktok"
|
||||
elif "youtube.com" in domain or "youtu.be" in domain:
|
||||
return "youtube"
|
||||
elif "reddit.com" in domain:
|
||||
return "reddit"
|
||||
elif "vk.com" in domain:
|
||||
return "vk"
|
||||
elif "t.me" in domain:
|
||||
return "telegram"
|
||||
return "unknown"
|
||||
|
||||
|
||||
def flag_as_deleted(metadata, deletion_info: Dict[str, object]) -> None:
    """
    Flags metadata object as deleted/unavailable.

    Copies the tentative deletion details onto the metadata object under the
    keys ``deletion_detected``, ``deletion_indicator``, ``deletion_source``
    and ``deletion_platform``, and sets its status to
    ``"deleted_or_unavailable"``.

    Args:
        metadata: Metadata object to update; it must offer ``set(key, value)``
            and a writable ``status`` attribute
        deletion_info: Dictionary from detect_deletion(); missing keys are
            stored as None
    """
    # Read each field once so the stored values and the log line cannot diverge.
    indicator = deletion_info.get("indicator")
    source = deletion_info.get("source")
    platform = deletion_info.get("platform")

    metadata.set("deletion_detected", True)
    metadata.set("deletion_indicator", indicator)
    metadata.set("deletion_source", source)
    metadata.set("deletion_platform", platform)
    metadata.status = "deleted_or_unavailable"

    logger.debug(
        f"Content marked as deleted/unavailable: "
        f"platform={platform}, "
        f"indicator='{indicator}', "
        f"source={source}"
    )
|
||||
1
tests/core/__init__.py
Normal file
1
tests/core/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Core module tests
|
||||
198
tests/core/test_media.py
Normal file
198
tests/core/test_media.py
Normal file
@@ -0,0 +1,198 @@
|
||||
"""
|
||||
Tests for the Media class from auto_archiver.core.media
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import Mock, patch
|
||||
from auto_archiver.core.media import Media
|
||||
|
||||
|
||||
class TestMediaBasics:
    """Construction of Media objects and their simple accessors."""

    def test_media_creation_with_filename(self):
        """A Media built from only a filename starts without URLs or properties."""
        created = Media(filename="test.mp4")
        assert created.filename == "test.mp4"
        assert created.urls == []
        assert created.properties == {}

    def test_media_key_property(self):
        """The ``key`` property exposes the ``_key`` given at construction."""
        keyed = Media(filename="test.mp4", _key="my_key")
        assert keyed.key == "my_key"

    def test_media_set_get_properties(self):
        """``set`` returns the same object; ``get`` honours an optional default."""
        item = Media(filename="test.mp4")
        returned = item.set("author", "John Doe")
        # set() hands back the media itself so calls can be chained
        assert returned is item
        assert item.get("author") == "John Doe"
        assert item.get("nonexistent") is None
        assert item.get("nonexistent", "default") == "default"

    def test_media_add_url(self):
        """Each ``add_url`` call appends to ``urls``."""
        first_url = "https://example.com/test.mp4"
        second_url = "https://cdn.example.com/test.mp4"
        item = Media(filename="test.mp4")
        item.add_url(first_url)
        assert first_url in item.urls
        item.add_url(second_url)
        assert len(item.urls) == 2
|
||||
|
||||
|
||||
class TestMediaMimetype:
    """Mimetype lookup from the filename and explicit overrides."""

    @pytest.mark.parametrize(
        ("filename", "expected_mimetype"),
        [
            ("video.mp4", "video/mp4"),
            ("image.jpg", "image/jpeg"),
            ("image.png", "image/png"),
            ("audio.mp3", "audio/mpeg"),
            ("document.pdf", "application/pdf"),
            ("text.txt", "text/plain"),
        ],
    )
    def test_mimetype_detection(self, filename, expected_mimetype):
        """The mimetype is derived from the file extension."""
        detected = Media(filename=filename).mimetype
        assert detected == expected_mimetype

    def test_mimetype_setter(self):
        """An explicitly assigned mimetype is what gets read back."""
        item = Media(filename="file.unknown")
        item.mimetype = "custom/type"
        assert item.mimetype == "custom/type"

    def test_mimetype_empty_filename(self):
        """An empty filename yields an empty mimetype string."""
        nameless = Media(filename="")
        assert nameless.mimetype == ""
|
||||
|
||||
|
||||
class TestMediaTypeChecks:
    """The is_video / is_audio / is_image predicates."""

    @pytest.mark.parametrize(
        ("filename", "is_video", "is_audio", "is_image"),
        [
            ("video.mp4", True, False, False),
            ("video.avi", True, False, False),
            ("audio.mp3", False, True, False),
            ("audio.wav", False, True, False),
            ("image.jpg", False, False, True),
            ("image.png", False, False, True),
            ("document.pdf", False, False, False),
        ],
    )
    def test_type_checks(self, filename, is_video, is_audio, is_image):
        """Each predicate agrees with the expectation for the given filename."""
        subject = Media(filename=filename)
        video_flag = subject.is_video()
        audio_flag = subject.is_audio()
        image_flag = subject.is_image()
        assert video_flag == is_video
        assert audio_flag == is_audio
        assert image_flag == is_image
|
||||
|
||||
|
||||
class TestMediaStore:
    """Behaviour of Media.store with and without storages."""

    def test_store_with_no_storages(self, caplog):
        """Storing with an empty storage list logs a 'No storages found' message."""
        item = Media(filename="test.mp4")
        item.store(Mock(), storages=[])
        assert "No storages found" in caplog.text

    def test_store_with_storage(self):
        """A supplied storage has its ``store`` called exactly once."""
        backend = Mock()
        item = Media(filename="test.mp4")
        item.store(Mock(), url="https://example.com", storages=[backend])
        backend.store.assert_called_once()
|
||||
|
||||
|
||||
class TestMediaInnerMedia:
    """Traversal of Media objects nested inside other Media properties."""

    def test_all_inner_media_no_nested(self):
        """Without nested media only the object itself can be yielded."""
        lone = Media(filename="test.mp4")

        without_self = list(lone.all_inner_media(include_self=False))
        assert len(without_self) == 0

        with_self = list(lone.all_inner_media(include_self=True))
        assert len(with_self) == 1
        assert with_self[0] is lone

    def test_all_inner_media_with_nested(self):
        """Media stored as property values are found at every depth."""
        root = Media(filename="parent.mp4")
        middle = Media(filename="child.jpg")
        leaf = Media(filename="grandchild.png")

        # build the chain root -> middle -> leaf
        middle.set("thumbnail", leaf)
        root.set("preview", middle)

        found = list(root.all_inner_media(include_self=False))
        assert len(found) == 2
        assert middle in found
        assert leaf in found

    def test_all_inner_media_with_list_property(self):
        """Media held inside a list-valued property are found as well."""
        root = Media(filename="parent.mp4")
        frames = [Media(filename="frame1.jpg"), Media(filename="frame2.jpg")]

        root.set("frames", frames)

        found = list(root.all_inner_media(include_self=False))
        assert len(found) == 2
        assert frames[0] in found
        assert frames[1] in found
|
||||
|
||||
|
||||
class TestMediaIsStored:
    """Media.is_stored against a storage configured with two storages."""

    def test_is_stored_no_urls(self):
        """No URLs at all means the media is not stored."""
        backend = Mock()
        backend.config = {"steps": {"storages": ["s3", "local"]}}
        item = Media(filename="test.mp4")
        assert item.is_stored(backend) is False

    def test_is_stored_partial_urls(self):
        """One URL for two configured storages is not enough."""
        backend = Mock()
        backend.config = {"steps": {"storages": ["s3", "local"]}}
        item = Media(filename="test.mp4")
        item.add_url("https://s3.example.com/test.mp4")
        assert item.is_stored(backend) is False

    def test_is_stored_full_urls(self):
        """Two URLs for two configured storages counts as stored."""
        backend = Mock()
        backend.config = {"steps": {"storages": ["s3", "local"]}}
        item = Media(filename="test.mp4")
        for location in ("https://s3.example.com/test.mp4", "file:///local/test.mp4"):
            item.add_url(location)
        assert item.is_stored(backend) is True
|
||||
|
||||
|
||||
class TestMediaValidVideo:
    """Media.is_valid_video with ``ffmpeg.probe`` patched out."""

    def test_is_valid_video_with_valid_probe(self):
        """A probed stream reporting a non-zero ``duration_ts`` is accepted."""
        probe_result = {"streams": [{"duration_ts": 1000}]}
        clip = Media(filename="test.mp4")

        with patch("ffmpeg.probe", return_value=probe_result):
            verdict = clip.is_valid_video()

        assert verdict is True

    def test_is_valid_video_with_no_duration(self):
        """A probed stream reporting a zero ``duration_ts`` is rejected."""
        probe_result = {"streams": [{"duration_ts": 0}]}
        clip = Media(filename="test.mp4")

        with patch("ffmpeg.probe", return_value=probe_result):
            verdict = clip.is_valid_video()

        assert verdict is False

    def test_is_valid_video_with_ffmpeg_error(self):
        """When probing raises, the verdict follows the reported file size."""
        clip = Media(filename="test.mp4")

        with patch("ffmpeg.probe", side_effect=Exception("ffmpeg error")):
            # small file first (rejected), then a larger one (accepted)
            for reported_size, expected in ((100, False), (30000, True)):
                with patch("os.path.getsize", return_value=reported_size):
                    assert clip.is_valid_video() is expected
|
||||
98
tests/core/test_validators.py
Normal file
98
tests/core/test_validators.py
Normal file
@@ -0,0 +1,98 @@
|
||||
"""
|
||||
Tests for validators module from auto_archiver.core.validators
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import pytest
|
||||
|
||||
from auto_archiver.core.validators import positive_number, valid_file, json_loader
|
||||
|
||||
|
||||
class TestPositiveNumber:
    """Checks for the ``positive_number`` validator."""

    @pytest.mark.parametrize(
        ("value", "expected"),
        [
            (0, 0),
            (1, 1),
            (100, 100),
            (0.5, 0.5),
            (999999, 999999),
        ],
    )
    def test_positive_values(self, value, expected):
        """Zero and positive numbers are returned as given."""
        validated = positive_number(value)
        assert validated == expected

    @pytest.mark.parametrize("value", [-1, -100, -0.5, -999999])
    def test_negative_values_raise_error(self, value):
        """Negative numbers are rejected with an ArgumentTypeError."""
        with pytest.raises(argparse.ArgumentTypeError) as caught:
            positive_number(value)
        message = str(caught.value)
        assert "not a positive number" in message
|
||||
|
||||
|
||||
class TestValidFile:
    """Checks for the ``valid_file`` validator."""

    def test_valid_file_exists(self, tmp_path):
        """An existing file path is returned unchanged."""
        existing = tmp_path / "test.txt"
        existing.write_text("test content")
        path_text = str(existing)
        assert valid_file(path_text) == path_text

    def test_valid_file_not_exists(self):
        """A missing path raises ArgumentTypeError mentioning 'does not exist'."""
        with pytest.raises(argparse.ArgumentTypeError) as caught:
            valid_file("/nonexistent/path/to/file.txt")
        message = str(caught.value)
        assert "does not exist" in message

    def test_valid_file_directory_not_file(self, tmp_path):
        """A directory path is rejected with the same 'does not exist' error."""
        directory = str(tmp_path)
        with pytest.raises(argparse.ArgumentTypeError) as caught:
            valid_file(directory)
        message = str(caught.value)
        assert "does not exist" in message
|
||||
|
||||
|
||||
class TestJsonLoader:
    """Checks for the ``json_loader`` validator."""

    @pytest.mark.parametrize(
        ("json_str", "expected"),
        [
            ('{"key": "value"}', {"key": "value"}),
            ('{"number": 123}', {"number": 123}),
            ('{"list": [1, 2, 3]}', {"list": [1, 2, 3]}),
            ('{"nested": {"inner": "value"}}', {"nested": {"inner": "value"}}),
            ("[]", []),
            ("[1, 2, 3]", [1, 2, 3]),
            ('"string"', "string"),
            ("123", 123),
            ("true", True),
            ("false", False),
            ("null", None),
        ],
    )
    def test_valid_json(self, json_str, expected):
        """Well-formed JSON text is decoded to the matching Python value."""
        decoded = json_loader(json_str)
        assert decoded == expected

    @pytest.mark.parametrize(
        "invalid_json",
        [
            "{invalid}",
            "{'single': 'quotes'}",
            "{missing: quotes}",
            '{"unclosed": "brace"',
            "",
        ],
    )
    def test_invalid_json_raises_error(self, invalid_json):
        """Malformed JSON text raises ``json.JSONDecodeError``."""
        expected_error = json.JSONDecodeError
        with pytest.raises(expected_error):
            json_loader(invalid_json)
|
||||
62
tests/databases/test_console_db.py
Normal file
62
tests/databases/test_console_db.py
Normal file
@@ -0,0 +1,62 @@
|
||||
"""
|
||||
Tests for the ConsoleDb module
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture
def console_db(setup_module):
    """Provide the module that ``setup_module`` builds for the name "console_db"."""
    module_name = "console_db"
    return setup_module(module_name)
|
||||
|
||||
|
||||
class TestConsoleDb:
    """Log output produced by each ConsoleDb lifecycle call."""

    def test_started_logs_info(self, console_db, make_item, caplog):
        """started() logs a line containing STARTED and the item's host."""
        entry = make_item("https://example.com/test")

        with caplog.at_level("INFO"):
            console_db.started(entry)

        captured = caplog.text
        assert "STARTED" in captured
        assert "example.com" in captured

    def test_failed_logs_error(self, console_db, make_item, caplog):
        """failed() logs a line containing FAILED and the supplied reason."""
        entry = make_item("https://example.com/test")
        why = "Connection timeout"

        with caplog.at_level("ERROR"):
            console_db.failed(entry, why)

        captured = caplog.text
        assert "FAILED" in captured
        assert "Connection timeout" in captured

    def test_aborted_logs_warning(self, console_db, make_item, caplog):
        """aborted() logs a line containing ABORTED."""
        entry = make_item("https://example.com/test")

        with caplog.at_level("WARNING"):
            console_db.aborted(entry)

        assert "ABORTED" in caplog.text

    def test_done_logs_success(self, console_db, make_item, caplog):
        """done() logs a line containing DONE."""
        entry = make_item("https://example.com/test")

        with caplog.at_level("INFO"):
            console_db.done(entry)

        assert "DONE" in caplog.text

    def test_done_cached(self, console_db, make_item, caplog):
        """done() with cached=True is expected to log DONE as well."""
        entry = make_item("https://example.com/test")

        with caplog.at_level("INFO"):
            console_db.done(entry, cached=True)

        assert "DONE" in caplog.text
|
||||
72
tests/enrichers/test_json_enricher.py
Normal file
72
tests/enrichers/test_json_enricher.py
Normal file
@@ -0,0 +1,72 @@
|
||||
"""
|
||||
Tests for the JsonEnricher module
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture
def json_enricher(setup_module):
    """Provide the module that ``setup_module`` builds for the name "json_enricher"."""
    module_name = "json_enricher"
    return setup_module(module_name)
|
||||
|
||||
|
||||
class TestJsonEnricher:
    """File creation and content written by JsonEnricher.enrich."""

    def test_enrich_creates_json_file(self, json_enricher, make_item):
        """enrich() attaches a 'metadata_json' media backed by a file on disk."""
        entry = make_item("https://example.com/test")
        entry.set("title", "Test Title")
        entry.set("description", "Test description")

        json_enricher.enrich(entry)

        # the enricher is expected to register its output under this media id
        produced = entry.get_media_by_id("metadata_json")
        assert produced is not None
        assert produced.filename.endswith("metadata.json")
        assert os.path.exists(produced.filename)

    def test_enrich_json_content(self, json_enricher, make_item):
        """Values set on the item appear under the 'metadata' key of the JSON."""
        entry = make_item("https://example.com/test")
        entry.set("title", "Test Title")
        entry.set("custom_field", "custom_value")

        json_enricher.enrich(entry)

        produced = entry.get_media_by_id("metadata_json")
        with open(produced.filename, "r", encoding="utf-8") as handle:
            payload = json.load(handle)

        # the assertions below rely on a nested layout with a top-level "metadata" object
        stored = payload["metadata"]
        assert stored["title"] == "Test Title"
        assert stored["custom_field"] == "custom_value"
        assert stored["url"] == "https://example.com/test"

    def test_enrich_handles_special_characters(self, json_enricher, make_item):
        """Non-ASCII text survives the write/read round trip."""
        entry = make_item("https://example.com/test")
        entry.set("title", "Test with émojis 🎉 and üñíçödé")

        json_enricher.enrich(entry)

        produced = entry.get_media_by_id("metadata_json")
        with open(produced.filename, "r", encoding="utf-8") as handle:
            payload = json.load(handle)

        stored_title = payload["metadata"]["title"]
        assert "émojis 🎉" in stored_title
        assert "üñíçödé" in stored_title

    def test_enrich_empty_metadata(self, json_enricher, make_item):
        """An item with nothing set beyond its URL still produces a JSON file."""
        entry = make_item("https://example.com/minimal")

        json_enricher.enrich(entry)

        produced = entry.get_media_by_id("metadata_json")
        assert produced is not None
        assert os.path.exists(produced.filename)
|
||||
@@ -56,6 +56,19 @@ def test_enrich_sets_metadata(enricher, mocker):
|
||||
assert metadata.media == [media1, media2]
|
||||
|
||||
|
||||
def test_enrich_no_metadata_selection(enricher, mocker):
    """With no matching key, media that has metadata gets ``{}`` and media without is left untouched."""
    with_exif = mocker.Mock(filename="img1.jpg")
    without_exif = mocker.Mock(filename="img2.jpg")
    item = mocker.Mock()
    item.media = [with_exif, without_exif]

    def fake_get_metadata(f):
        # only the first image reports any metadata
        if f == "img1.jpg":
            return {"key": "value"}
        return {}

    enricher.get_metadata = fake_get_metadata
    enricher.look_for_keys = ["no-key"]

    enricher.enrich(item)

    with_exif.set.assert_called_once_with("metadata", {})
    without_exif.set.assert_not_called()
    assert item.media == [with_exif, without_exif]
|
||||
|
||||
|
||||
def test_enrich_empty_media(enricher, mocker):
|
||||
metadata = mocker.Mock()
|
||||
metadata.media = []
|
||||
@@ -71,7 +84,9 @@ def test_get_metadata_error_handling(enricher, mocker):
|
||||
assert "Error occurred: " in mock_log.call_args[0][0]
|
||||
|
||||
|
||||
def test_metadata_pickle(enricher, unpickle, mocker):
|
||||
# TODO depends on the expected functionality
|
||||
"""
|
||||
def test_default_metadata_pickle(enricher, unpickle, mocker):
|
||||
mock_run = mocker.patch("subprocess.run")
|
||||
# Uses pickled values
|
||||
mock_run.return_value = unpickle("metadata_enricher_exif.pickle")
|
||||
@@ -79,6 +94,39 @@ def test_metadata_pickle(enricher, unpickle, mocker):
|
||||
expected = unpickle("metadata_enricher_ytshort_expected.pickle")
|
||||
enricher.enrich(metadata)
|
||||
expected_media = expected.media
|
||||
print(expected_media)
|
||||
actual_media = metadata.media
|
||||
|
||||
assert len(expected_media) == len(actual_media)
|
||||
assert actual_media[0].properties.get("metadata") == expected_media[0].properties.get("metadata")
|
||||
"""
|
||||
|
||||
|
||||
def test_metadata_pickle_megapixel(enricher, unpickle, mocker):
    """Restricting ``look_for_keys`` to 'megapixels' keeps only the Megapixels entry."""
    # subprocess.run is replaced by a pickled result instead of invoking a real process
    fake_run = mocker.patch("subprocess.run")
    fake_run.return_value = unpickle("metadata_enricher_exif.pickle")
    item = unpickle("metadata_enricher_ytshort_input.pickle")

    enricher.look_for_keys = ["megapixels"]
    enricher.enrich(item)

    first_media = item.media[0]
    assert first_media.properties.get("metadata") == {"Megapixels": "0.922"}
|
||||
|
||||
|
||||
def test_metadata_specify_datetime_and_metapixels(enricher, unpickle, mocker):
    """Several lookup keys select every matching metadata entry."""
    # subprocess.run is replaced by a pickled result instead of invoking a real process
    fake_run = mocker.patch("subprocess.run")
    fake_run.return_value = unpickle("metadata_enricher_exif.pickle")
    item = unpickle("metadata_enricher_ytshort_input.pickle")

    enricher.look_for_keys = ["datetime", "megapixels", "image height"]
    enricher.enrich(item)

    expected_selection = {
        "File Modification Date/Time": "2025:02:18 19:42:50+00:00",
        "File Access Date/Time": "2025:02:18 19:42:50+00:00",
        "File Inode Change Date/Time": "2025:02:18 19:42:50+00:00",
        "Megapixels": "0.922",
        "Image Height": "720",
    }
    first_media = item.media[0]
    assert first_media.properties.get("metadata") == expected_selection
|
||||
|
||||
@@ -5,6 +5,9 @@ from auto_archiver.modules.antibot_extractor_enricher.antibot_extractor_enricher
|
||||
from .test_extractor_base import TestExtractorBase
|
||||
|
||||
|
||||
CI = os.getenv("GITHUB_ACTIONS", "") == "true"
|
||||
|
||||
|
||||
class DummySB:
|
||||
def __init__(self, url="", title="", visible_texts=None, visible_elements=None):
|
||||
self._url = url
|
||||
@@ -50,15 +53,17 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
||||
}
|
||||
|
||||
@pytest.mark.download
|
||||
@pytest.mark.flaky(reruns=2, reruns_delay=5)
|
||||
@pytest.mark.parametrize(
|
||||
"url,in_title,in_text,image_count,video_count",
|
||||
"url,in_title,in_text,image_count,video_count,skip_ci",
|
||||
[
|
||||
(
|
||||
"https://en.wikipedia.org/wiki/Western_barn_owl",
|
||||
"western barn owl",
|
||||
"Tyto alba",
|
||||
5,
|
||||
3, # Reduced due to Wikipedia rate limiting (429 errors)
|
||||
0,
|
||||
False,
|
||||
),
|
||||
(
|
||||
"https://www.bellingcat.com/news/2025/04/29/open-sources-show-myanmar-junta-airstrike-damages-despite-post-earthquake-ceasefire/",
|
||||
@@ -66,6 +71,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
||||
"Bellingcat has geolocated",
|
||||
5,
|
||||
0,
|
||||
False,
|
||||
),
|
||||
(
|
||||
"https://www.bellingcat.com/news/2025/03/27/gaza-israel-palestine-shot-killed-injured-destroyed-dangerous-drone-journalists-in-gaza/",
|
||||
@@ -73,6 +79,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
||||
"continued the work of Gazan journalists",
|
||||
5,
|
||||
1,
|
||||
False,
|
||||
),
|
||||
(
|
||||
"https://www.bellingcat.com/about/general-information",
|
||||
@@ -80,6 +87,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
||||
"Stichting Bellingcat",
|
||||
0, # SVGs are ignored
|
||||
0,
|
||||
False,
|
||||
),
|
||||
(
|
||||
"https://vk.com/wikipedia?from=search&w=wall-36156673_20451",
|
||||
@@ -87,13 +95,27 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
||||
"16 сентября 1985 года лейблом EMI Records.",
|
||||
5,
|
||||
0,
|
||||
False,
|
||||
),
|
||||
(
|
||||
"https://www.tiktok.com/@tracy_2424/photo/7418200173953830162",
|
||||
"TikTok",
|
||||
"Dito ko lang",
|
||||
1,
|
||||
0,
|
||||
True,
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_download_pages_with_media(self, setup_module, make_item, url, in_title, in_text, image_count, video_count):
|
||||
def test_download_pages_with_media(
|
||||
self, setup_module, make_item, url, in_title, in_text, image_count, video_count, skip_ci
|
||||
):
|
||||
"""
|
||||
Test downloading pages with media.
|
||||
"""
|
||||
if CI and skip_ci:
|
||||
pytest.skip("Skipping test in CI environment")
|
||||
|
||||
self.extractor = setup_module(
|
||||
self.extractor_module,
|
||||
self.config
|
||||
@@ -107,6 +129,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
||||
item = make_item(url)
|
||||
result = self.extractor.download(item)
|
||||
|
||||
assert result, f"download() returned {result!r} — Selenium may have failed (e.g., window close timeout)"
|
||||
assert result.status == "antibot", "Expected status to be 'antibot'"
|
||||
|
||||
# Check title contains all required words (case-insensitive)
|
||||
@@ -121,9 +144,9 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
||||
)
|
||||
|
||||
image_media = [m for m in result.media if m.is_image() and not m.get("id") == "screenshot"]
|
||||
assert len(image_media) == image_count, f"Expected {image_count} image items, got {len(image_media)}"
|
||||
assert len(image_media) >= image_count, f"Expected at least {image_count} image items, got {len(image_media)}"
|
||||
video_media = [m for m in result.media if m.is_video()]
|
||||
assert len(video_media) == video_count, f"Expected {video_count} video items, got {len(video_media)}"
|
||||
assert len(video_media) >= video_count, f"Expected at least {video_count} video items, got {len(video_media)}"
|
||||
|
||||
for expected_id in ["screenshot", "pdf", "html_source_code"]:
|
||||
assert any(m.get("id") == expected_id for m in result.media), (
|
||||
|
||||
@@ -48,8 +48,6 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
("https://www.youtube.com/watch?v=5qap5aO4i9A", ["youtube"]),
|
||||
("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", ["tiktok"]),
|
||||
("https://www.instagram.com/p/CU1J9JYJ9Zz/", ["instagram"]),
|
||||
("https://www.facebook.com/nytimes/videos/10160796550110716", ["facebook"]),
|
||||
("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/", ["facebook"]),
|
||||
],
|
||||
)
|
||||
def test_suitable_extractors(self, url, suitable_extractors):
|
||||
@@ -148,6 +146,7 @@ class TestGenericExtractor(TestExtractorBase):
|
||||
def test_bluesky_download_video(self, make_item):
|
||||
item = make_item("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")
|
||||
result = self.extractor.download(item)
|
||||
assert result.get_url() == "https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i"
|
||||
assert result is not False
|
||||
|
||||
@pytest.mark.skipif(not TEST_TRUTH_SOCIAL, reason="Truth social download tests disabled in environment variables.")
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import asyncio
|
||||
import os
|
||||
from datetime import date
|
||||
|
||||
@@ -60,3 +61,53 @@ def test_valid_url_regex(url, expected, get_lazy_module):
|
||||
def test_invite_pattern_regex(invite, expected, get_lazy_module):
|
||||
match = TelethonExtractor.invite_pattern.search(invite)
|
||||
assert bool(match) == expected
|
||||
|
||||
|
||||
def test_setup_with_closed_event_loop(get_lazy_module, tmp_path, mocker):
    """
    Load the extractor while the current asyncio event loop is already closed.

    This mimics the Celery worker situation: setup() is expected to install a
    fresh loop so that TelegramClient.start() does not fail with
    'Event loop is closed'.
    """
    # setup needs an existing session file, so create an empty one
    session_path = tmp_path / "test.session"
    session_path.touch()

    # install an already-closed loop as the current one
    dead_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(dead_loop)
    dead_loop.close()

    extractor_settings = {"session_file": str(session_path), "api_id": 123, "api_hash": "ABC"}
    loaded = get_lazy_module("telethon_extractor").load({"telethon_extractor": extractor_settings})

    # after loading, the current loop must be open and a client must exist
    current_loop = asyncio.get_event_loop()
    assert not current_loop.is_closed()
    assert loaded.client is not None
|
||||
|
||||
|
||||
def test_setup_with_no_event_loop(get_lazy_module, tmp_path, mocker):
    """
    Load the extractor when asking for the current event loop raises.

    This mimics having no current loop at all (e.g. a non-main thread):
    setup is expected to create a new loop and register it.
    """
    session_path = tmp_path / "test.session"
    session_path.touch()

    # get_event_loop() raises RuntimeError, as it does in a non-main thread on Python 3.12+
    mocker.patch("asyncio.get_event_loop", side_effect=RuntimeError("no current event loop"))
    replacement_loop = mocker.MagicMock()
    replacement_loop.is_closed.return_value = False
    new_loop_patch = mocker.patch("asyncio.new_event_loop", return_value=replacement_loop)
    set_loop_patch = mocker.patch("asyncio.set_event_loop")

    extractor_settings = {"session_file": str(session_path), "api_id": 123, "api_hash": "ABC"}
    get_lazy_module("telethon_extractor").load({"telethon_extractor": extractor_settings})

    # exactly one new loop was created and then made current
    new_loop_patch.assert_called_once()
    set_loop_patch.assert_called_once_with(replacement_loop)
|
||||
|
||||
@@ -55,6 +55,7 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
|
||||
("https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375", True),
|
||||
("https://www.tiktok.com/t/ZP8YQ8e5j/", True),
|
||||
("https://vt.tiktok.com/ZSMTJeqRP/", True),
|
||||
("https://tiktok.com/@user/photo/123?lang=en", True),
|
||||
],
|
||||
)
|
||||
def test_is_suitable(self, url, is_suitable, tiktok_dropin):
|
||||
@@ -68,10 +69,7 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
|
||||
mock_get.assert_called_once()
|
||||
mock_get.return_value.json.assert_called_once()
|
||||
# first message is just the 'Skipping using ytdlp to download files for TikTok' message
|
||||
assert (
|
||||
"failed to parse JSON response from tikwm.com for url='https://www.tiktok.com/@example/video/1234'"
|
||||
in caplog.text
|
||||
)
|
||||
assert "Failed to parse JSON response from tikwm.com" in caplog.text
|
||||
|
||||
mock_get.return_value.json.side_effect = Exception
|
||||
with caplog.at_level("ERROR"):
|
||||
@@ -79,10 +77,7 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
|
||||
mock_get.assert_called()
|
||||
assert mock_get.call_count == 2
|
||||
assert mock_get.return_value.json.call_count == 2
|
||||
assert (
|
||||
"failed to parse JSON response from tikwm.com for url='https://www.tiktok.com/@example/video/1234'"
|
||||
in caplog.text
|
||||
)
|
||||
assert "Failed to parse JSON response from tikwm.com" in caplog.text
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"response",
|
||||
@@ -98,27 +93,30 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
|
||||
assert self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) is False
|
||||
mock_get.assert_called_once()
|
||||
mock_get.return_value.json.assert_called_once()
|
||||
assert "failed to get a valid response from tikwm.com" in caplog.text
|
||||
assert "Unable to download with tikwm.com: " in caplog.text
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"response,has_vid",
|
||||
"response,is_success",
|
||||
[
|
||||
({"data": {"id": 123}}, False),
|
||||
({"data": {"wmplay": "url"}}, True),
|
||||
({"data": {"play": "url"}}, True),
|
||||
({"data": {"id": 123, "images": []}}, False),
|
||||
({"data": {"wmplay": "url", "images": ["img1.jpg"]}}, True),
|
||||
({"data": {"play": "url", "images": ["img1.jpg"]}}, True),
|
||||
({"data": {"images": ["img1.jpg"]}}, True),
|
||||
],
|
||||
)
|
||||
def test_correct_extraction(self, mock_get, make_item, response, has_vid, mocker):
|
||||
def test_correct_extraction(self, mock_get, make_item, response, is_success, mocker):
|
||||
data = {k: v for k, v in response.get("data", {}).items()}
|
||||
mock_get.return_value.status_code = 200
|
||||
mock_get.return_value.json.return_value = {"msg": "success", **response}
|
||||
result = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
|
||||
if not has_vid:
|
||||
assert result is False
|
||||
else:
|
||||
total_media = len(data.get("images", [])) + (1 if data.get("wmplay", data.get("play")) else 0)
|
||||
if is_success:
|
||||
assert result.is_success()
|
||||
assert len(result.media) == 1
|
||||
assert len(result.media) == total_media
|
||||
else:
|
||||
assert result is False
|
||||
mock_get.assert_called()
|
||||
assert mock_get.call_count == 1 + int(has_vid)
|
||||
assert mock_get.call_count == 1 + total_media
|
||||
mock_get.return_value.json.assert_called_once()
|
||||
|
||||
def test_correct_data_extracted(self, mock_get, make_item):
|
||||
@@ -142,7 +140,9 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
|
||||
assert len(result.media) == 2
|
||||
assert result.get_title() == "Title"
|
||||
assert result.get("author") == "Author"
|
||||
assert result.get("api_data") == {"other": "data", "id": 123}
|
||||
assert result.get("other") == "data"
|
||||
assert result.get("comments") is None
|
||||
assert result.get("api_data") == {"id": 123, "other": "data"}
|
||||
assert result.media[1].get("duration") == 60
|
||||
assert result.get("timestamp") == datetime.fromtimestamp(1736301699, tz=timezone.utc)
|
||||
|
||||
|
||||
238
tests/extractors/test_twitter_dropin.py
Normal file
238
tests/extractors/test_twitter_dropin.py
Normal file
@@ -0,0 +1,238 @@
|
||||
"""
|
||||
Tests for the Twitter dropin extractor with fxtwitter fallback
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
from auto_archiver.modules.generic_extractor.twitter import Twitter
|
||||
|
||||
|
||||
@pytest.fixture
def twitter_dropin():
    """Provide a newly constructed ``Twitter`` dropin to each requesting test."""
    dropin = Twitter()
    return dropin
|
||||
|
||||
|
||||
class TestTwitterFxTwitterFallback:
    """Exercise ``Twitter._fetch_fxtwitter`` against stubbed fxtwitter API replies."""

    @staticmethod
    def _fetch_with_stubbed_get(dropin, tweet_id, payload):
        """Run ``_fetch_fxtwitter`` while ``requests.get`` yields a 200 reply whose JSON is *payload*."""
        stub_reply = Mock(status_code=200)
        stub_reply.json.return_value = payload
        with patch("requests.get", return_value=stub_reply):
            return dropin._fetch_fxtwitter(tweet_id)

    @pytest.fixture
    def mock_fxtwitter_video_response(self):
        """Return a canned fxtwitter payload for a tweet holding one video with three variants."""

        def video_entry(**leading_fields):
            # Called once per listing so "all" and "videos" get independent objects.
            return {
                **leading_fields,
                "url": "https://video.twimg.com/test.mp4",
                "variants": [
                    {"url": "https://video.twimg.com/test.m3u8", "content_type": "application/x-mpegURL"},
                    {
                        "url": "https://video.twimg.com/test_480.mp4",
                        "content_type": "video/mp4",
                        "bitrate": 632000,
                    },
                    {
                        "url": "https://video.twimg.com/test_720.mp4",
                        "content_type": "video/mp4",
                        "bitrate": 2176000,
                    },
                ],
            }

        tweet = {
            "url": "https://x.com/user/status/123456789",
            "id": "123456789",
            "text": "Test tweet with video",
            "author": {"id": "111", "name": "Test User", "screen_name": "testuser"},
            "created_at": "Sun Feb 08 18:45:00 +0000 2026",
            "media": {
                "all": [video_entry(type="video")],
                "videos": [video_entry()],
            },
        }
        return {"code": 200, "message": "OK", "tweet": tweet}

    @pytest.fixture
    def mock_fxtwitter_photo_response(self):
        """Return a canned fxtwitter payload for a tweet holding a single photo."""

        def photo_entry():
            # Fresh dict per listing, mirroring two separate objects in the payload.
            return {"type": "photo", "url": "https://pbs.twimg.com/media/test.jpg?name=orig"}

        tweet = {
            "url": "https://x.com/user/status/123456790",
            "id": "123456790",
            "text": "Test tweet with photo",
            "author": {"id": "111", "name": "Test User", "screen_name": "testuser"},
            "created_at": "Mon Feb 09 10:30:00 +0000 2026",
            "media": {
                "all": [photo_entry()],
                "photos": [photo_entry()],
            },
        }
        return {"code": 200, "message": "OK", "tweet": tweet}

    def test_fetch_fxtwitter_video(self, twitter_dropin, mock_fxtwitter_video_response):
        """A video payload yields author, date, text and one video entity with all three variants."""
        tweet = self._fetch_with_stubbed_get(twitter_dropin, "123456789", mock_fxtwitter_video_response)

        assert tweet["user"]["name"] == "Test User"
        assert tweet["created_at"] == "Sun Feb 08 18:45:00 +0000 2026"
        assert tweet["full_text"] == "Test tweet with video"

        entities = tweet["entities"]["media"]
        assert len(entities) == 1
        video = entities[0]
        assert video["type"] == "video"
        assert "video_info" in video
        assert len(video["video_info"]["variants"]) == 3

    def test_fetch_fxtwitter_photo(self, twitter_dropin, mock_fxtwitter_photo_response):
        """A photo payload yields author, date, text and one photo entity with its HTTPS URL."""
        tweet = self._fetch_with_stubbed_get(twitter_dropin, "123456790", mock_fxtwitter_photo_response)

        assert tweet["user"]["name"] == "Test User"
        assert tweet["created_at"] == "Mon Feb 09 10:30:00 +0000 2026"
        assert tweet["full_text"] == "Test tweet with photo"

        entities = tweet["entities"]["media"]
        assert len(entities) == 1
        photo = entities[0]
        assert photo["type"] == "photo"
        assert photo["media_url_https"] == "https://pbs.twimg.com/media/test.jpg?name=orig"

    def test_fetch_fxtwitter_no_media(self, twitter_dropin):
        """A payload with an empty ``media`` mapping yields an empty media entity list."""
        text_only_payload = {
            "code": 200,
            "message": "OK",
            "tweet": {
                "id": "123456791",
                "text": "Just text, no media",
                "author": {"name": "Text Only User"},
                "created_at": "Tue Feb 10 12:00:00 +0000 2026",
                "media": {},
            },
        }

        tweet = self._fetch_with_stubbed_get(twitter_dropin, "123456791", text_only_payload)

        assert tweet["user"]["name"] == "Text Only User"
        assert tweet["full_text"] == "Just text, no media"
        assert tweet["entities"]["media"] == []

    def test_fetch_fxtwitter_api_error(self, twitter_dropin):
        """A 404 reply from the stubbed ``requests.get`` makes ``_fetch_fxtwitter`` raise."""
        not_found_reply = Mock(status_code=404)
        with patch("requests.get", return_value=not_found_reply), pytest.raises(Exception):
            twitter_dropin._fetch_fxtwitter("nonexistent")
|
||||
|
||||
|
||||
class TestTwitterChooseVariant:
    """Checks for ``Twitter.choose_variant`` when given lists of video variants."""

    @staticmethod
    def _variant(url, content_type):
        """Build one variant dict in the shape the tests pass to ``choose_variant``."""
        return {"url": url, "content_type": content_type}

    def test_choose_highest_quality_video(self, twitter_dropin):
        """Among three mp4 variants, the one whose URL carries 1280x720 is returned."""
        mp4_urls = (
            "https://video.twimg.com/vid/320x240/test.mp4",
            "https://video.twimg.com/vid/1280x720/test.mp4",
            "https://video.twimg.com/vid/640x480/test.mp4",
        )
        candidates = [self._variant(address, "video/mp4") for address in mp4_urls]

        chosen = twitter_dropin.choose_variant(candidates)

        assert chosen["url"] == "https://video.twimg.com/vid/1280x720/test.mp4"

    def test_choose_variant_fallback_for_non_mp4(self, twitter_dropin):
        """With only an m3u8 variant on offer, that variant is returned."""
        only_stream = self._variant("https://video.twimg.com/test.m3u8", "application/x-mpegURL")

        chosen = twitter_dropin.choose_variant([only_stream])

        assert chosen["url"] == "https://video.twimg.com/test.m3u8"

    def test_choose_variant_prefers_mp4(self, twitter_dropin):
        """Given one m3u8 and one mp4 variant, the mp4 one is returned."""
        candidates = [
            self._variant("https://video.twimg.com/test.m3u8", "application/x-mpegURL"),
            self._variant("https://video.twimg.com/vid/1280x720/test.mp4", "video/mp4"),
        ]

        chosen = twitter_dropin.choose_variant(candidates)

        assert chosen["content_type"] == "video/mp4"
|
||||
|
||||
|
||||
@pytest.mark.download
class TestTwitterFxTwitterLive:
    """Integration checks that call ``_fetch_fxtwitter`` without mocking (network needed)."""

    @pytest.mark.parametrize(
        "tweet_id,expected_media_type",
        [
            # two video tweets followed by one photo tweet
            ("2020569571682312581", "video"),
            ("2020410438198890618", "video"),
            ("2020341585502957801", "photo"),
        ],
    )
    def test_fetch_real_tweets(self, twitter_dropin, tweet_id, expected_media_type):
        """Each listed tweet has an author, date, text and a first media entity of the expected type."""
        tweet = twitter_dropin._fetch_fxtwitter(tweet_id)

        # Only truthiness is checked; the live values are not pinned.
        assert tweet["user"]["name"]
        assert tweet["created_at"]
        assert tweet["full_text"]

        attachments = tweet["entities"]["media"]
        assert len(attachments) >= 1
        assert attachments[0]["type"] == expected_media_type
|
||||
70
tests/feeders/test_cli_feeder.py
Normal file
70
tests/feeders/test_cli_feeder.py
Normal file
@@ -0,0 +1,70 @@
|
||||
"""
|
||||
Tests for the CLIFeeder module
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from auto_archiver.modules.cli_feeder.cli_feeder import CLIFeeder
|
||||
from auto_archiver.core.consts import SetupError
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
|
||||
|
||||
@pytest.fixture
def cli_feeder_instance(tmp_path):
    """Return a factory that builds a ``CLIFeeder`` with a hand-assigned config.

    The factory takes the ``urls`` value to place in the feeder's config and
    returns the feeder without calling ``setup()``; tests call that themselves.
    The feeder's ``tmp_dir`` points at pytest's per-test ``tmp_path`` rather than
    a hard-coded shared ``/tmp``, so the fixture is portable and isolated.
    """

    def _create(urls):
        feeder = CLIFeeder()
        # Mock the config structure that cli_feeder expects
        feeder.config = {"urls": urls}
        feeder.name = "cli_feeder"
        # Kept as a string, matching the type of the previous "/tmp" value.
        feeder.tmp_dir = str(tmp_path)
        return feeder

    return _create
|
||||
|
||||
|
||||
class TestCLIFeeder:
    """Checks for ``CLIFeeder`` iteration and its ``setup()`` validation of URLs."""

    def test_iter_yields_metadata_for_urls(self, cli_feeder_instance):
        """Three configured URLs produce three ``Metadata`` items in the same order."""
        feeder = cli_feeder_instance(
            ["https://example.com/1", "https://example.com/2", "https://example.com/3"]
        )
        feeder.setup()

        produced = list(feeder)

        assert len(produced) == 3
        assert all(isinstance(entry, Metadata) for entry in produced)
        assert [entry.get_url() for entry in produced] == [
            "https://example.com/1",
            "https://example.com/2",
            "https://example.com/3",
        ]

    def test_iter_single_url(self, cli_feeder_instance):
        """One configured URL produces exactly one item carrying that URL."""
        feeder = cli_feeder_instance(["https://example.com/single"])
        feeder.setup()

        produced = list(feeder)

        assert len(produced) == 1
        assert produced[0].get_url() == "https://example.com/single"

    def test_setup_raises_without_urls(self, cli_feeder_instance):
        """``setup()`` raises ``SetupError`` mentioning missing URLs for an empty list."""
        feeder = cli_feeder_instance([])

        with pytest.raises(SetupError, match="No URLs provided"):
            feeder.setup()

    def test_setup_raises_with_none_urls(self, cli_feeder_instance):
        """``setup()`` raises ``SetupError`` mentioning missing URLs when ``urls`` is ``None``."""
        feeder = cli_feeder_instance(None)

        with pytest.raises(SetupError, match="No URLs provided"):
            feeder.setup()
|
||||
43
tests/formatters/test_mute_formatter.py
Normal file
43
tests/formatters/test_mute_formatter.py
Normal file
@@ -0,0 +1,43 @@
|
||||
"""
|
||||
Tests for the MuteFormatter module
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
|
||||
|
||||
@pytest.fixture
def mute_formatter(setup_module):
    """Provide the module that ``setup_module`` builds for the name ``mute_formatter``."""
    formatter = setup_module("mute_formatter")
    return formatter
|
||||
|
||||
|
||||
class TestMuteFormatter:
    """Checks that ``format`` on the mute formatter returns ``None`` for various items."""

    def test_format_returns_none(self, mute_formatter, make_item):
        """An item with a title set still formats to ``None``."""
        titled_item = make_item("https://example.com/test")
        titled_item.set("title", "Test Title")

        assert mute_formatter.format(titled_item) is None

    def test_format_with_empty_metadata(self, mute_formatter):
        """A ``Metadata`` carrying nothing but a URL formats to ``None``."""
        bare_item = Metadata().set_url("https://example.com/empty")

        assert mute_formatter.format(bare_item) is None

    def test_format_with_media(self, mute_formatter, make_item):
        """An item with one attached ``Media`` still formats to ``None``."""
        from auto_archiver.core.media import Media

        item_with_media = make_item("https://example.com/with-media")
        attachment = Media(filename="test.mp4")
        item_with_media.add_media(attachment)

        assert mute_formatter.format(item_with_media) is None
|
||||
259
tests/test_none_filename_handling.py
Normal file
259
tests/test_none_filename_handling.py
Normal file
@@ -0,0 +1,259 @@
|
||||
"""
|
||||
Tests for handling Media objects with None filename.
|
||||
|
||||
When download_from_url fails, it returns None. Various enrichers and
|
||||
the metadata deduplication logic must gracefully handle Media objects
|
||||
where filename is None, rather than crashing with TypeError.
|
||||
"""
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from auto_archiver.core.metadata import Metadata, Media
|
||||
from auto_archiver.modules.hash_enricher import HashEnricher
|
||||
from auto_archiver.modules.meta_enricher import MetaEnricher
|
||||
|
||||
|
||||
# ── HashEnricher ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestHashEnricherNoneFilename:
    """``HashEnricher.enrich`` must leave media whose filename is ``None`` unhashed and not raise."""

    @pytest.fixture(autouse=True)
    def setup(self, setup_module):
        """Build a SHA-256 ``HashEnricher`` before every test in this class."""
        enricher_config = {"algorithm": "SHA-256", "chunksize": 100}
        self.enricher = setup_module(HashEnricher, enricher_config)

    def test_skips_none_filename(self):
        """A single media without a filename gets no ``hash`` value."""
        item = Metadata().set_url("https://example.com")
        missing_file = Media(filename=None)
        missing_file.set("src", "https://example.com/video.mp4")
        item.add_media(missing_file)

        # The call itself must complete without an exception.
        self.enricher.enrich(item)

        assert item.media[0].get("hash") is None

    def test_hashes_valid_skips_none(self, tmp_path):
        """With one real file and one ``None`` filename, only the real file is hashed."""
        real_file = tmp_path / "test.txt"
        real_file.write_text("hello world")

        item = Metadata().set_url("https://example.com")
        for filename in (str(real_file), None):
            item.add_media(Media(filename=filename))

        self.enricher.enrich(item)

        assert item.media[0].get("hash") is not None
        assert item.media[1].get("hash") is None

    def test_all_none_filenames(self):
        """Two media without filenames both stay attached and both stay unhashed."""
        item = Metadata().set_url("https://example.com")
        for _ in range(2):
            item.add_media(Media(filename=None))

        self.enricher.enrich(item)

        assert len(item.media) == 2
        assert all(entry.get("hash") is None for entry in item.media)
|
||||
|
||||
|
||||
# ── MetaEnricher ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestMetaEnricherNoneFilename:
    """``MetaEnricher.enrich`` must ignore media whose filename is ``None`` and not raise."""

    @pytest.fixture(autouse=True)
    def setup(self, setup_module):
        """Build a ``MetaEnricher`` with an empty config before every test in this class."""
        self.enricher = setup_module(MetaEnricher, {})

    @staticmethod
    def _stamped_item():
        """Create a ``Metadata`` with a URL and a timezone-aware ``_processed_at`` value."""
        item = Metadata().set_url("https://example.com")
        item.set("_processed_at", datetime.now(tz=timezone.utc))
        return item

    def test_skips_none_filename(self):
        """A lone media without a filename leaves ``total_bytes`` at zero."""
        item = self._stamped_item()
        missing_file = Media(filename=None)
        missing_file.set("src", "https://example.com/video.mp4")
        item.add_media(missing_file)

        # The call itself must complete without an exception.
        self.enricher.enrich(item)

        assert item.get("total_bytes") == 0

    def test_sizes_valid_skips_none(self, tmp_path):
        """Only the real 500-character file is sized; the ``None`` entry gets no ``bytes``."""
        real_file = tmp_path / "test.txt"
        real_file.write_text("A" * 500)

        item = self._stamped_item()
        for filename in (str(real_file), None):
            item.add_media(Media(filename=filename))

        self.enricher.enrich(item)

        sized, unsized = item.media[0], item.media[1]
        assert sized.get("bytes") == 500
        assert unsized.get("bytes") is None
        assert item.get("total_bytes") == 500
|
||||
|
||||
|
||||
# ── Metadata.remove_duplicate_media_by_hash ───────────────────────────
|
||||
|
||||
|
||||
class TestRemoveDuplicateMediaNoneFilename:
    """``Metadata.remove_duplicate_media_by_hash`` must retain media whose filename is ``None``."""

    @staticmethod
    def _media_without_file():
        """Create a ``Media`` with no filename but with a ``src`` URL recorded."""
        placeholder = Media(filename=None)
        placeholder.set("src", "https://example.com/video.mp4")
        return placeholder

    def test_none_filename_kept(self):
        """A single filename-less media survives deduplication unchanged."""
        item = Metadata().set_url("https://example.com")
        item.add_media(self._media_without_file())

        item.remove_duplicate_media_by_hash()

        assert len(item.media) == 1
        assert item.media[0].filename is None

    def test_none_and_valid_mixed(self, tmp_path):
        """A real file and a filename-less media are both still present afterwards."""
        real_file = tmp_path / "test.txt"
        real_file.write_text("content")

        item = Metadata().set_url("https://example.com")
        item.add_media(Media(filename=str(real_file)))
        item.add_media(self._media_without_file())

        item.remove_duplicate_media_by_hash()

        assert len(item.media) == 2

    def test_multiple_none_filename_all_kept(self):
        """Two filename-less media are both retained (there is no file to compare)."""
        item = Metadata().set_url("https://example.com")
        for _ in range(2):
            item.add_media(Media(filename=None))

        item.remove_duplicate_media_by_hash()

        assert len(item.media) == 2
|
||||
|
||||
|
||||
# ── Twitter dropin create_metadata ────────────────────────────────────
|
||||
|
||||
|
||||
class TestTwitterDropinNoneFilename:
    """``Twitter.create_metadata`` must drop media whose ``download_from_url`` result is ``None``."""

    # URL handed to create_metadata as its last argument in every test.
    _TWEET_URL = "https://x.com/test/status/123"

    @pytest.fixture
    def twitter_dropin(self):
        """Provide a new ``Twitter`` dropin, importing the class only when requested."""
        from auto_archiver.modules.generic_extractor.twitter import Twitter

        return Twitter()

    @staticmethod
    def _tweet(full_text, media_entities):
        """Assemble the tweet dict passed to ``create_metadata`` with the given text and media."""
        return {
            "user": {"name": "Test User"},
            "created_at": "Sun Feb 08 18:45:00 +0000 2026",
            "full_text": full_text,
            "entities": {"media": media_entities},
        }

    @staticmethod
    def _photo(url):
        """Build one photo media entity pointing at *url*."""
        return {"type": "photo", "media_url_https": url}

    def _build(self, dropin, tweet, archiver):
        """Invoke ``create_metadata`` with ``None`` as its second argument and the shared URL."""
        return dropin.create_metadata(tweet, None, archiver, self._TWEET_URL)

    def test_create_metadata_skips_failed_photo_download(self, twitter_dropin):
        """A photo whose download yields ``None`` is absent from the resulting media."""
        tweet = self._tweet("Test tweet with photo", [self._photo("https://pbs.twimg.com/media/test.jpg")])
        failing_archiver = MagicMock(**{"download_from_url.return_value": None})

        outcome = self._build(twitter_dropin, tweet, failing_archiver)

        assert len(outcome.media) == 0

    def test_create_metadata_skips_failed_video_download(self, twitter_dropin):
        """A video whose download yields ``None`` is absent from the resulting media."""
        video_entity = {
            "type": "video",
            "video_info": {
                "variants": [
                    {
                        "url": "https://video.twimg.com/vid/1280x720/test.mp4",
                        "content_type": "video/mp4",
                    },
                ]
            },
        }
        tweet = self._tweet("Test tweet with video", [video_entity])
        failing_archiver = MagicMock(**{"download_from_url.return_value": None})

        outcome = self._build(twitter_dropin, tweet, failing_archiver)

        assert len(outcome.media) == 0

    def test_create_metadata_keeps_successful_download(self, twitter_dropin, tmp_path):
        """A photo whose download yields a path ends up as the single media entry."""
        tweet = self._tweet("Test tweet with photo", [self._photo("https://pbs.twimg.com/media/test.jpg")])

        downloaded = tmp_path / "test.jpg"
        downloaded.write_text("fake image data")
        working_archiver = MagicMock(**{"download_from_url.return_value": str(downloaded)})

        outcome = self._build(twitter_dropin, tweet, working_archiver)

        assert len(outcome.media) == 1
        assert outcome.media[0].filename == str(downloaded)

    def test_create_metadata_mixed_downloads(self, twitter_dropin, tmp_path):
        """Of two photos, only the one whose download returned a path is kept."""
        tweet = self._tweet(
            "Test tweet with two photos",
            [
                self._photo("https://pbs.twimg.com/media/test1.jpg"),
                self._photo("https://pbs.twimg.com/media/test2.jpg"),
            ],
        )

        downloaded = tmp_path / "test1.jpg"
        downloaded.write_text("fake image data")

        # Successive download_from_url calls return a path first, then None.
        flaky_archiver = MagicMock(**{"download_from_url.side_effect": [str(downloaded), None]})

        outcome = self._build(twitter_dropin, tweet, flaky_archiver)

        assert len(outcome.media) == 1
        assert outcome.media[0].filename == str(downloaded)
|
||||
147
tests/utils/test_deletion_detection.py
Normal file
147
tests/utils/test_deletion_detection.py
Normal file
@@ -0,0 +1,147 @@
|
||||
"""
|
||||
Tests for deletion detection utilities.
|
||||
|
||||
These tests verify the auto-archiver's current best-effort attempts
|
||||
to detect when content has been deleted or is unavailable across
|
||||
various platforms.
|
||||
"""
|
||||
|
||||
from auto_archiver.utils.deletion_detection import detect_deletion, flag_as_deleted, DeletionIndicators
|
||||
from auto_archiver.core.metadata import Metadata
|
||||
|
||||
|
||||
class TestDeletionIndicators:
    """Checks on the phrase lists exposed by ``DeletionIndicators``."""

    def test_twitter_indicators(self):
        """``DeletionIndicators.TWITTER`` contains the three expected phrases."""
        expected_phrases = (
            "Hmm...this page doesn't exist",
            "Try searching for something else",
            "This Tweet is unavailable",
        )
        for phrase in expected_phrases:
            assert phrase in DeletionIndicators.TWITTER

    def test_platform_specific_indicators(self):
        """``for_url`` returns phrases matching the platform of the given URL."""

        def mentions(indicators, fragment):
            # Case-insensitive on the indicator side, as in the plain assertions.
            return any(fragment in indicator.lower() for indicator in indicators)

        for_twitter = DeletionIndicators.for_url("https://twitter.com/user/status/123")
        assert mentions(for_twitter, "page doesn't exist")

        for_instagram = DeletionIndicators.for_url("https://instagram.com/p/ABC123")
        assert mentions(for_instagram, "page isn't available")
|
||||
|
||||
|
||||
class TestDetectDeletion:
    """Checks for ``detect_deletion`` across its different input sources."""

    def test_detect_deletion_in_html_twitter(self):
        """Twitter's "page doesn't exist" HTML is reported as deleted via ``html_content``."""
        page = "<html><body>Hmm...this page doesn't exist. Try searching for something else.</body></html>"

        verdict = detect_deletion(html_content=page, url="https://twitter.com/user/status/123")

        assert verdict is not None
        assert verdict["is_deleted"] is True
        assert verdict["platform"] == "twitter"
        assert verdict["source"] == "html_content"
        assert "page doesn't exist" in verdict["indicator"].lower()

    def test_detect_deletion_in_page_title(self):
        """A "Page Not Found" title is reported as deleted via ``page_title``."""
        verdict = detect_deletion(page_title="Page Not Found", url="https://facebook.com/post/123")

        assert verdict is not None
        assert verdict["is_deleted"] is True
        assert verdict["source"] == "page_title"

    def test_detect_deletion_in_error_message(self):
        """A "no longer available" download error is reported as deleted via ``error_message``."""
        failure_text = "yt_dlp.utils.DownloadError: This video is no longer available"

        verdict = detect_deletion(error_message=failure_text, url="https://youtube.com/watch?v=abc123")

        assert verdict is not None
        assert verdict["is_deleted"] is True
        assert verdict["platform"] == "youtube"
        assert verdict["source"] == "error_message"

    def test_detect_deletion_in_video_metadata(self):
        """Video data marked unavailable is reported as deleted via ``video_metadata``."""
        info = {"availability": "unavailable", "title": "Private video"}

        verdict = detect_deletion(video_data=info, url="https://youtube.com/watch?v=test123")

        assert verdict is not None
        assert verdict["is_deleted"] is True
        assert verdict["source"] == "video_metadata"
        assert "availability" in verdict["indicator"]

    def test_no_deletion_detected(self):
        """Ordinary HTML with an ordinary title produces no detection at all."""
        page = "<html><body><h1>Welcome to my page</h1><p>This is normal content.</p></body></html>"

        verdict = detect_deletion(html_content=page, page_title="My Normal Page", url="https://example.com/page")

        assert verdict is None

    def test_instagram_media_not_found(self):
        """A "Media not found" error on an Instagram URL is attributed to instagram."""
        verdict = detect_deletion(
            error_message="Media not found or unavailable",
            url="https://instagram.com/p/ABC123",
        )

        assert verdict is not None
        assert verdict["platform"] == "instagram"
        assert "not found" in verdict["indicator"].lower()

    def test_reddit_removed_content(self):
        """A ``[removed]`` marker in Reddit HTML is attributed to reddit."""
        verdict = detect_deletion(
            html_content="<div class='comment'>[removed]</div>",
            url="https://reddit.com/r/test/comments/abc123",
        )

        assert verdict is not None
        assert verdict["platform"] == "reddit"
|
||||
|
||||
|
||||
class TestFlagAsDeleted:
    """Checks for what ``flag_as_deleted`` records on a ``Metadata`` object."""

    def test_flag_metadata_as_deleted(self):
        """Each deletion detail is stored under a ``deletion_*`` key and the status is updated."""
        item = Metadata()
        details = {
            "is_deleted": True,
            "indicator": "This Tweet is unavailable",
            "source": "html_content",
            "platform": "twitter",
        }

        flag_as_deleted(item, details)

        assert item.get("deletion_detected") is True
        assert item.get("deletion_indicator") == "This Tweet is unavailable"
        assert item.get("deletion_source") == "html_content"
        assert item.get("deletion_platform") == "twitter"
        assert item.status == "deleted_or_unavailable"

    def test_metadata_contains_deletion_context(self):
        """The indicator text is stored in the metadata mapping after flagging."""
        item = Metadata()
        details = {
            "is_deleted": True,
            "indicator": "Video has been removed by the uploader",
            "source": "error_message",
            "platform": "youtube",
        }

        flag_as_deleted(item, details)

        assert "deletion_indicator" in item.metadata
        stored_indicator = item.get("deletion_indicator")
        assert "uploader" in stored_indicator
|
||||
Reference in New Issue
Block a user