Merge branch 'main' into remove_dependencies

This commit is contained in:
Patrick Robertson
2025-01-14 17:39:20 +01:00
20 changed files with 528 additions and 163 deletions

38
.github/workflows/tests-core.yaml vendored Normal file
View File

@@ -0,0 +1,38 @@
name: Core Tests
on:
push:
branches: [ main ]
paths:
- src/**
pull_request:
paths:
- src/**
jobs:
tests:
runs-on: ubuntu-22.04
strategy:
matrix:
python-version: ["3.10", "3.11", "3.12"]
defaults:
run:
working-directory: ./
steps:
- uses: actions/checkout@v4
- name: Install Poetry
run: pipx install poetry
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
cache: 'poetry'
- name: Install dependencies
run: poetry install --no-interaction --with dev
- name: Run Core Tests
run: poetry run pytest -ra -v -m "not download"

38
.github/workflows/tests-download.yaml vendored Normal file
View File

@@ -0,0 +1,38 @@
name: Download Tests
on:
schedule:
- cron: '35 14 * * 1'
pull_request:
branches: [ main ]
paths:
- src/**
jobs:
tests:
runs-on: ubuntu-22.04
strategy:
fail-fast: false
matrix:
python-version: ["3.10"] # only run expensive downloads on one (lowest) python version
defaults:
run:
working-directory: ./
steps:
- uses: actions/checkout@v4
- name: Install poetry
run: pipx install poetry
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
cache: 'poetry'
- name: Install dependencies
run: poetry install --no-interaction --with dev
- name: Run Download Tests
run: poetry run pytest -ra -v -m "download"

View File

@@ -2,6 +2,8 @@
[![PyPI version](https://badge.fury.io/py/auto-archiver.svg)](https://badge.fury.io/py/auto-archiver)
[![Docker Image Version (latest by date)](https://img.shields.io/docker/v/bellingcat/auto-archiver?label=version&logo=docker)](https://hub.docker.com/r/bellingcat/auto-archiver)
[![Core Test Status](https://github.com/bellingcat/auto-archiver/workflows/Core%20Tests/badge.svg)](https://github.com/bellingcat/auto-archiver/actions/workflows/tests-core.yaml)
[![Download Test Status](https://github.com/bellingcat/auto-archiver/workflows/Download%20Tests/badge.svg)](https://github.com/bellingcat/auto-archiver/actions/workflows/tests-download.yaml)
<!-- ![Docker Pulls](https://img.shields.io/docker/pulls/bellingcat/auto-archiver) -->
<!-- [![PyPI download month](https://img.shields.io/pypi/dm/auto-archiver.svg)](https://pypi.python.org/pypi/auto-archiver/) -->
<!-- [![Documentation Status](https://readthedocs.org/projects/vk-url-scraper/badge/?version=latest)](https://vk-url-scraper.readthedocs.io/en/latest/?badge=latest) -->
@@ -259,6 +261,20 @@ The "archive location" link contains the path of the archived file, in local sto
## Development
Use `python -m src.auto_archiver --config secrets/orchestration.yaml` to run from the local development environment.
### Testing
Tests are split using `pytest.mark` into 'core' and 'download' tests. Download tests will hit the network and make API calls (e.g. Twitter, Bluesky etc.) and should be run regularly to make sure that APIs have not changed.
Tests can be run as follows:
```
# run core tests
pytest -ra -v -m "not download" # or poetry run pytest -ra -v -m "not download"
# run download tests
pytest -ra -v -m "download" # or poetry run pytest -ra -v -m "download"
# run all tests
pytest -ra -v # or poetry run pytest -ra -v
```
#### Docker development
working with docker locally:
* `docker build . -t auto-archiver` to build a local image

152
poetry.lock generated
View File

@@ -209,6 +209,22 @@ files = [
[package.dependencies]
cryptography = "*"
[[package]]
name = "autopep8"
version = "2.3.2"
description = "A tool that automatically formats Python code to conform to the PEP 8 style guide"
optional = false
python-versions = ">=3.9"
groups = ["dev"]
files = [
{file = "autopep8-2.3.2-py2.py3-none-any.whl", hash = "sha256:ce8ad498672c845a0c3de2629c15b635ec2b05ef8177a6e7c91c74f3e9b51128"},
{file = "autopep8-2.3.2.tar.gz", hash = "sha256:89440a4f969197b69a995e4ce0661b031f455a9f776d2c5ba3dbd83466931758"},
]
[package.dependencies]
pycodestyle = ">=2.12.0"
tomli = {version = "*", markers = "python_version < \"3.11\""}
[[package]]
name = "beautifulsoup4"
version = "4.12.3"
@@ -245,18 +261,18 @@ files = [
[[package]]
name = "boto3"
version = "1.35.97"
version = "1.35.98"
description = "The AWS SDK for Python"
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "boto3-1.35.97-py3-none-any.whl", hash = "sha256:8e49416216a6e3a62c2a0c44fba4dd2852c85472e7b702516605b1363867d220"},
{file = "boto3-1.35.97.tar.gz", hash = "sha256:7d398f66a11e67777c189d1f58c0a75d9d60f98d0ee51b8817e828930bf19e4e"},
{file = "boto3-1.35.98-py3-none-any.whl", hash = "sha256:d0224e1499d7189b47aa7f469d96522d98df6f5702fccb20a95a436582ebcd9d"},
{file = "boto3-1.35.98.tar.gz", hash = "sha256:4b6274b4fe9d7113f978abea66a1f20c8a397c268c9d1b2a6c96b14a256da4a5"},
]
[package.dependencies]
botocore = ">=1.35.97,<1.36.0"
botocore = ">=1.35.98,<1.36.0"
jmespath = ">=0.7.1,<2.0.0"
s3transfer = ">=0.10.0,<0.11.0"
@@ -265,14 +281,14 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
[[package]]
name = "botocore"
version = "1.35.97"
version = "1.35.98"
description = "Low-level, data-driven core of boto 3."
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "botocore-1.35.97-py3-none-any.whl", hash = "sha256:fed4f156b1a9b8ece53738f702ba5851b8c6216b4952de326547f349cc494f14"},
{file = "botocore-1.35.97.tar.gz", hash = "sha256:88f2fab29192ffe2f2115d5bafbbd823ff4b6eb2774296e03ec8b5b0fe074f61"},
{file = "botocore-1.35.98-py3-none-any.whl", hash = "sha256:4f1c0b687488663a774ad3a5e81a5f94fae1bcada2364cfdc48482c4dbf794d5"},
{file = "botocore-1.35.98.tar.gz", hash = "sha256:d11742b3824bdeac3c89eeeaf5132351af41823bbcef8fc15e95c8250b1de09c"},
]
[package.dependencies]
@@ -735,12 +751,12 @@ version = "0.4.6"
description = "Cross-platform colored terminal text."
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
groups = ["main"]
markers = "platform_system == \"Windows\" or sys_platform == \"win32\""
groups = ["main", "dev"]
files = [
{file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
]
markers = {main = "platform_system == \"Windows\" or sys_platform == \"win32\"", dev = "sys_platform == \"win32\""}
[[package]]
name = "cryptography"
@@ -833,7 +849,7 @@ version = "1.2.2"
description = "Backport of PEP 654 (exception groups)"
optional = false
python-versions = ">=3.7"
groups = ["main"]
groups = ["main", "dev"]
markers = "python_version < \"3.11\""
files = [
{file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"},
@@ -1025,14 +1041,14 @@ grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"]
[[package]]
name = "google-api-python-client"
version = "2.158.0"
version = "2.159.0"
description = "Google API Client Library for Python"
optional = false
python-versions = ">=3.7"
groups = ["main"]
files = [
{file = "google_api_python_client-2.158.0-py2.py3-none-any.whl", hash = "sha256:36f8c8d2e79e50f76790ca5946d2f3f8333e210dc8539a6c88e0742416474ad2"},
{file = "google_api_python_client-2.158.0.tar.gz", hash = "sha256:b6664597a9955e04977a62752e33fe44cb35c580e190c1cb08a041893172bd67"},
{file = "google_api_python_client-2.159.0-py2.py3-none-any.whl", hash = "sha256:baef0bb631a60a0bd7c0bf12a5499e3a40cd4388484de7ee55c1950bf820a0cf"},
{file = "google_api_python_client-2.159.0.tar.gz", hash = "sha256:55197f430f25c907394b44fa078545ffef89d33fd4dca501b7db9f0d8e224bd6"},
]
[package.dependencies]
@@ -1225,6 +1241,18 @@ files = [
[package.extras]
all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"]
[[package]]
name = "iniconfig"
version = "2.0.0"
description = "brain-dead simple config-ini parsing"
optional = false
python-versions = ">=3.7"
groups = ["dev"]
files = [
{file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"},
{file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"},
]
[[package]]
name = "instaloader"
version = "4.14"
@@ -1712,7 +1740,7 @@ version = "24.2"
description = "Core utilities for Python packages"
optional = false
python-versions = ">=3.8"
groups = ["main"]
groups = ["main", "dev"]
files = [
{file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"},
{file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"},
@@ -1828,6 +1856,22 @@ tests = ["check-manifest", "coverage (>=7.4.2)", "defusedxml", "markdown2", "ole
typing = ["typing-extensions"]
xmp = ["defusedxml"]
[[package]]
name = "pluggy"
version = "1.5.0"
description = "plugin and hook calling mechanisms for python"
optional = false
python-versions = ">=3.8"
groups = ["dev"]
files = [
{file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"},
{file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"},
]
[package.extras]
dev = ["pre-commit", "tox"]
testing = ["pytest", "pytest-benchmark"]
[[package]]
name = "propcache"
version = "0.2.1"
@@ -1997,6 +2041,18 @@ files = [
[package.dependencies]
pyasn1 = ">=0.4.6,<0.7.0"
[[package]]
name = "pycodestyle"
version = "2.12.1"
description = "Python style guide checker"
optional = false
python-versions = ">=3.8"
groups = ["dev"]
files = [
{file = "pycodestyle-2.12.1-py2.py3-none-any.whl", hash = "sha256:46f0fb92069a7c28ab7bb558f05bfc0110dac69a0cd23c61ea0040283a9d78b3"},
{file = "pycodestyle-2.12.1.tar.gz", hash = "sha256:6838eae08bbce4f6accd5d5572075c63626a15ee3e6f842df996bf62f6d73521"},
]
[[package]]
name = "pycparser"
version = "2.22"
@@ -2125,6 +2181,29 @@ files = [
{file = "pysubs2-1.8.0.tar.gz", hash = "sha256:3397bb58a4a15b1325ba2ae3fd4d7c214e2c0ddb9f33190d6280d783bb433b20"},
]
[[package]]
name = "pytest"
version = "8.3.4"
description = "pytest: simple powerful testing with Python"
optional = false
python-versions = ">=3.8"
groups = ["dev"]
files = [
{file = "pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6"},
{file = "pytest-8.3.4.tar.gz", hash = "sha256:965370d062bce11e73868e0335abac31b4d3de0e82f4007408d242b4f8610761"},
]
[package.dependencies]
colorama = {version = "*", markers = "sys_platform == \"win32\""}
exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""}
iniconfig = "*"
packaging = "*"
pluggy = ">=1.5,<2"
tomli = {version = ">=1", markers = "python_version < \"3.11\""}
[package.extras]
dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
[[package]]
name = "python-dateutil"
version = "2.9.0.post0"
@@ -2611,6 +2690,49 @@ files = [
{file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"},
]
[[package]]
name = "tomli"
version = "2.2.1"
description = "A lil' TOML parser"
optional = false
python-versions = ">=3.8"
groups = ["dev"]
markers = "python_version < \"3.11\""
files = [
{file = "tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249"},
{file = "tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6"},
{file = "tomli-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ece47d672db52ac607a3d9599a9d48dcb2f2f735c6c2d1f34130085bb12b112a"},
{file = "tomli-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6972ca9c9cc9f0acaa56a8ca1ff51e7af152a9f87fb64623e31d5c83700080ee"},
{file = "tomli-2.2.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c954d2250168d28797dd4e3ac5cf812a406cd5a92674ee4c8f123c889786aa8e"},
{file = "tomli-2.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8dd28b3e155b80f4d54beb40a441d366adcfe740969820caf156c019fb5c7ec4"},
{file = "tomli-2.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e59e304978767a54663af13c07b3d1af22ddee3bb2fb0618ca1593e4f593a106"},
{file = "tomli-2.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:33580bccab0338d00994d7f16f4c4ec25b776af3ffaac1ed74e0b3fc95e885a8"},
{file = "tomli-2.2.1-cp311-cp311-win32.whl", hash = "sha256:465af0e0875402f1d226519c9904f37254b3045fc5084697cefb9bdde1ff99ff"},
{file = "tomli-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:2d0f2fdd22b02c6d81637a3c95f8cd77f995846af7414c5c4b8d0545afa1bc4b"},
{file = "tomli-2.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4a8f6e44de52d5e6c657c9fe83b562f5f4256d8ebbfe4ff922c495620a7f6cea"},
{file = "tomli-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8d57ca8095a641b8237d5b079147646153d22552f1c637fd3ba7f4b0b29167a8"},
{file = "tomli-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e340144ad7ae1533cb897d406382b4b6fede8890a03738ff1683af800d54192"},
{file = "tomli-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db2b95f9de79181805df90bedc5a5ab4c165e6ec3fe99f970d0e302f384ad222"},
{file = "tomli-2.2.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40741994320b232529c802f8bc86da4e1aa9f413db394617b9a256ae0f9a7f77"},
{file = "tomli-2.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:400e720fe168c0f8521520190686ef8ef033fb19fc493da09779e592861b78c6"},
{file = "tomli-2.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:02abe224de6ae62c19f090f68da4e27b10af2b93213d36cf44e6e1c5abd19fdd"},
{file = "tomli-2.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b82ebccc8c8a36f2094e969560a1b836758481f3dc360ce9a3277c65f374285e"},
{file = "tomli-2.2.1-cp312-cp312-win32.whl", hash = "sha256:889f80ef92701b9dbb224e49ec87c645ce5df3fa2cc548664eb8a25e03127a98"},
{file = "tomli-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:7fc04e92e1d624a4a63c76474610238576942d6b8950a2d7f908a340494e67e4"},
{file = "tomli-2.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f4039b9cbc3048b2416cc57ab3bda989a6fcf9b36cf8937f01a6e731b64f80d7"},
{file = "tomli-2.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:286f0ca2ffeeb5b9bd4fcc8d6c330534323ec51b2f52da063b11c502da16f30c"},
{file = "tomli-2.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a92ef1a44547e894e2a17d24e7557a5e85a9e1d0048b0b5e7541f76c5032cb13"},
{file = "tomli-2.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9316dc65bed1684c9a98ee68759ceaed29d229e985297003e494aa825ebb0281"},
{file = "tomli-2.2.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e85e99945e688e32d5a35c1ff38ed0b3f41f43fad8df0bdf79f72b2ba7bc5272"},
{file = "tomli-2.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ac065718db92ca818f8d6141b5f66369833d4a80a9d74435a268c52bdfa73140"},
{file = "tomli-2.2.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:d920f33822747519673ee656a4b6ac33e382eca9d331c87770faa3eef562aeb2"},
{file = "tomli-2.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a198f10c4d1b1375d7687bc25294306e551bf1abfa4eace6650070a5c1ae2744"},
{file = "tomli-2.2.1-cp313-cp313-win32.whl", hash = "sha256:d3f5614314d758649ab2ab3a62d4f2004c825922f9e370b29416484086b264ec"},
{file = "tomli-2.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:a38aa0308e754b0e3c67e344754dff64999ff9b513e691d0e786265c93583c69"},
{file = "tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc"},
{file = "tomli-2.2.1.tar.gz", hash = "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff"},
]
[[package]]
name = "tqdm"
version = "4.67.1"
@@ -3126,4 +3248,4 @@ test = ["pytest (>=8.1,<9.0)"]
[metadata]
lock-version = "2.1"
python-versions = ">=3.10,<3.13"
content-hash = "08a86c82340c012e316422c69d8d68f894f2e4e04fb6740750494ee4339d5968"
content-hash = "bbf89f011c332d7fb493d14f8644834548baa72661ddaa5ac4e0032da1ebcc50"

View File

@@ -60,10 +60,9 @@ dependencies = [
"toml (>=0.10.2,<0.11.0)"
]
[poetry.group.dev.dependencies]
autopep8 = "*"
[tool.poetry.group.dev.dependencies]
pytest = "^8.3.4"
autopep8 = "^2.3.1"
[project.scripts]
auto-archiver = "auto_archiver.__main__:main"
@@ -71,4 +70,10 @@ auto-archiver = "auto_archiver.__main__:main"
[project.urls]
homepage = "https://github.com/bellingcat/auto-archiver"
repository = "https://github.com/bellingcat/auto-archiver"
documentation = "https://github.com/bellingcat/auto-archiver"
documentation = "https://github.com/bellingcat/auto-archiver"
[tool.pytest.ini_options]
markers = [
"download: marks tests that download content from the network",
]

View File

@@ -114,6 +114,10 @@ class TwitterArchiver(Archiver):
result = Metadata()
tweet = r.json()
if tweet.get('__typename') == 'TweetTombstone':
logger.error(f"Failed to get tweet {tweet_id}: {tweet['tombstone']['text']['text']}")
return False
urls = []
for p in tweet.get("photos", []):
urls.append(p["url"])
@@ -135,7 +139,7 @@ class TwitterArchiver(Archiver):
media.filename = self.download_from_url(u, f'{slugify(url)}_{i}{ext}')
result.add_media(media)
result.set_title(tweet.get("text")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
return result.success("twitter-syndication")
@@ -158,7 +162,8 @@ class TwitterArchiver(Archiver):
.set_timestamp(timestamp)
if not tweet.get("entities", {}).get("media"):
logger.debug('No media found, archiving tweet text only')
return result.success("twitter-ytdl")
result.status = "twitter-ytdl"
return result
for i, tw_media in enumerate(tweet["entities"]["media"]):
media = Media(filename="")
mimetype = ""

View File

@@ -14,9 +14,26 @@ class HashEnricher(Enricher):
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
algo_choices = self.configs()["algorithm"]["choices"]
algos = self.configs()["algorithm"]
algo_choices = algos["choices"]
if not getattr(self, 'algorithm', None):
if not config.get('algorithm'):
logger.warning(f"No hash algorithm selected, defaulting to {algos['default']}")
self.algorithm = algos["default"]
else:
self.algorithm = config["algorithm"]
assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."
if not getattr(self, 'chunksize', None):
if config.get('chunksize'):
self.chunksize = config["chunksize"]
else:
self.chunksize = self.configs()["chunksize"]["default"]
self.chunksize = int(self.chunksize)
assert self.chunksize >= -1, "read length must be non-negative or -1"
ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True)
@staticmethod

View File

@@ -1,4 +1,6 @@
import unittest
import tempfile
if __name__ == '__main__':
unittest.main()
from auto_archiver.core.context import ArchivingContext
ArchivingContext.reset(full_reset=True)
ArchivingContext.set_tmp_dir(tempfile.gettempdir())

View File

@@ -1,7 +0,0 @@
import tempfile
from auto_archiver.core.context import ArchivingContext
ArchivingContext.reset(full_reset=True)
ArchivingContext.set_tmp_dir(tempfile.gettempdir())

View File

@@ -1,22 +1,27 @@
import pytest
from auto_archiver.core import Metadata
from auto_archiver.core import Step
from auto_archiver.core.metadata import Metadata
class TestArchiverBase(object):
archiver_class = None
config = None
def setUp(self):
@pytest.fixture(autouse=True)
def setup_archiver(self):
assert self.archiver_class is not None, "self.archiver_class must be set on the subclass"
assert self.config is not None, "self.config must be a dict set on the subclass"
self.archiver = self.archiver_class(self.config)
def create_item(self, url, **kwargs):
item = Metadata().set_url(url)
for key, value in kwargs.items():
item.set(key, value)
return item
def assertValidResponseMetadata(self, test_response, title, timestamp):
self.assertTrue(test_response.is_success())
self.assertEqual(title, test_response.get_title())
self.assertTrue(timestamp, test_response.get("timestamp"))
def assertValidResponseMetadata(self, test_response: Metadata, title: str, timestamp: str, status: str = ""):
assert test_response is not False
if not status:
assert test_response.is_success()
else:
assert status == test_response.status
assert title == test_response.get_title()
assert timestamp, test_response.get("timestamp")

View File

@@ -1,8 +1,9 @@
import pytest
from auto_archiver.archivers.bluesky_archiver import BlueskyArchiver
from .test_archiver_base import TestArchiverBase
import unittest
class TestBlueskyArchiver(TestArchiverBase, unittest.TestCase):
class TestBlueskyArchiver(TestArchiverBase):
"""Tests Bluesky Archiver
Note that these tests will download API responses from the bluesky API, so they may be slow.
@@ -13,57 +14,60 @@ class TestBlueskyArchiver(TestArchiverBase, unittest.TestCase):
archiver_class = BlueskyArchiver
config = {}
@pytest.mark.download
def test_download_media_with_images(self):
# url https://bsky.app/profile/colborne.bsky.social/post/3lec2bqjc5s2y
post = self.archiver._get_post_from_uri("https://bsky.app/profile/colborne.bsky.social/post/3lec2bqjc5s2y")
# just make sure bsky haven't changed their format, images should be under "record/embed/media/images"
# there should be 2 images
self.assertTrue("record" in post)
self.assertTrue("embed" in post["record"])
self.assertTrue("media" in post["record"]["embed"])
self.assertTrue("images" in post["record"]["embed"]["media"])
self.assertEqual(len(post["record"]["embed"]["media"]["images"]), 2)
assert "record" in post
assert "embed" in post["record"]
assert "media" in post["record"]["embed"]
assert "images" in post["record"]["embed"]["media"]
assert len(post["record"]["embed"]["media"]["images"]) == 2
# try downloading the media files
media = self.archiver._download_bsky_embeds(post)
self.assertEqual(len(media), 2)
assert len(media) == 2
# check the IDs
self.assertTrue("bafkreiflrkfihcvwlhka5tb2opw2qog6gfvywsdzdlibveys2acozh75tq" in media[0].get('src'))
self.assertTrue("bafkreibsprmwchf7r6xcstqkdvvuj3ijw7efciw7l3y4crxr4cmynseo7u" in media[1].get('src'))
assert "bafkreiflrkfihcvwlhka5tb2opw2qog6gfvywsdzdlibveys2acozh75tq" in media[0].get('src')
assert "bafkreibsprmwchf7r6xcstqkdvvuj3ijw7efciw7l3y4crxr4cmynseo7u" in media[1].get('src')
@pytest.mark.download
def test_download_post_with_single_image(self):
# url https://bsky.app/profile/bellingcat.com/post/3lcxcpgt6j42l
post = self.archiver._get_post_from_uri("https://bsky.app/profile/bellingcat.com/post/3lcxcpgt6j42l")
# just make sure bsky haven't changed their format, images should be under "record/embed/images"
# there should be 1 image
self.assertTrue("record" in post)
self.assertTrue("embed" in post["record"])
self.assertTrue("images" in post["record"]["embed"])
self.assertEqual(len(post["record"]["embed"]["images"]), 1)
assert "record" in post
assert "embed" in post["record"]
assert "images" in post["record"]["embed"]
assert len(post["record"]["embed"]["images"]) == 1
media = self.archiver._download_bsky_embeds(post)
self.assertEqual(len(media), 1)
assert len(media) == 1
# check the ID
self.assertTrue("bafkreihljdtomy4yulx4nfxuqdatlgvdg45vxdmjzzhclsd4ludk7zfma4" in media[0].get('src'))
assert "bafkreihljdtomy4yulx4nfxuqdatlgvdg45vxdmjzzhclsd4ludk7zfma4" in media[0].get('src')
@pytest.mark.download
def test_download_post_with_video(self):
# url https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i
post = self.archiver._get_post_from_uri("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")
# just make sure bsky haven't changed their format, video should be under "record/embed/video"
self.assertTrue("record" in post)
self.assertTrue("embed" in post["record"])
self.assertTrue("video" in post["record"]["embed"])
assert "record" in post
assert "embed" in post["record"]
assert "video" in post["record"]["embed"]
media = self.archiver._download_bsky_embeds(post)
self.assertEqual(len(media), 1)
assert len(media) == 1
# check the ID
self.assertTrue("bafkreiaiskn2nt5cxjnxbgcqqcrnurvkr2ni3unekn6zvhvgr5nrqg6u2q" in media[0].get('src'))
assert "bafkreiaiskn2nt5cxjnxbgcqqcrnurvkr2ni3unekn6zvhvgr5nrqg6u2q" in media[0].get('src')

View File

@@ -1,128 +1,140 @@
import unittest
import datetime
import pytest
from auto_archiver.archivers.twitter_archiver import TwitterArchiver
from .test_archiver_base import TestArchiverBase
class TestTwitterArchiver(TestArchiverBase, unittest.TestCase):
class TestTwitterArchiver(TestArchiverBase):
archiver_class = TwitterArchiver
config = {}
@pytest.mark.parametrize("url, expected", [
("https://t.co/yl3oOJatFp", "https://www.bellingcat.com/category/resources/"), # t.co URL
("https://x.com/bellingcat/status/1874097816571961839", "https://x.com/bellingcat/status/1874097816571961839"), # x.com urls unchanged
("https://twitter.com/bellingcat/status/1874097816571961839", "https://twitter.com/bellingcat/status/1874097816571961839"), # twitter urls unchanged
("https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w", "https://twitter.com/bellingcat/status/1874097816571961839"), # strip tracking params
("https://www.bellingcat.com/category/resources/", "https://www.bellingcat.com/category/resources/"), # non-twitter/x urls unchanged
("https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w", "https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w"), # shouldn't strip params from non-twitter/x URLs
])
def test_sanitize_url(self, url, expected):
assert expected == self.archiver.sanitize_url(url)
def test_sanitize_url(self):
@pytest.mark.parametrize("url, exptected_username, exptected_tweetid", [
("https://twitter.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
("https://x.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
("https://www.bellingcat.com/category/resources/", False, False)
])
# should expand t.co URLs
t_co_url = "https://t.co/yl3oOJatFp"
t_co_resolved_url = "https://www.bellingcat.com/category/resources/"
self.assertEqual(t_co_resolved_url, self.archiver.sanitize_url(t_co_url))
# shouldn't alter valid x URLs
x_url = "https://x.com/bellingcat/status/1874097816571961839"
self.assertEqual(x_url, self.archiver.sanitize_url(x_url))
# shouldn't alter valid twitter.com URLs
twitter_url = "https://twitter.com/bellingcat/status/1874097816571961839"
self.assertEqual(twitter_url, self.archiver.sanitize_url(twitter_url))
# should strip tracking params
tracking_url = "https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w"
self.assertEqual("https://twitter.com/bellingcat/status/1874097816571961839", self.archiver.sanitize_url(tracking_url))
# shouldn't alter non-twitter/x URLs
test_url = "https://www.bellingcat.com/category/resources/"
self.assertEqual(test_url, self.archiver.sanitize_url(test_url))
# shouldn't strip params from non-twitter/x URLs
test_url = "https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w"
self.assertEqual(test_url, self.archiver.sanitize_url(test_url))
def test_get_username_tweet_id_from_url(self):
# test valid twitter URL
url = "https://twitter.com/bellingcat/status/1874097816571961839"
username, tweet_id = self.archiver.get_username_tweet_id(url)
self.assertEqual("bellingcat", username)
self.assertEqual("1874097816571961839", tweet_id)
# test valid x URL
url = "https://x.com/bellingcat/status/1874097816571961839"
username, tweet_id = self.archiver.get_username_tweet_id(url)
self.assertEqual("bellingcat", username)
self.assertEqual("1874097816571961839", tweet_id)
# test invalid URL
# TODO: should this return None, False or raise an exception? Right now it returns False
url = "https://www.bellingcat.com/category/resources/"
username, tweet_id = self.archiver.get_username_tweet_id(url)
self.assertFalse(username)
self.assertFalse(tweet_id)
def test_youtube_dlp_archiver(self):
url = "https://x.com/bellingcat/status/1874097816571961839"
post = self.archiver.download_yt_dlp(self.create_item(url), url, "1874097816571961839")
self.assertTrue(post)
self.assertValidResponseMetadata(
post,
"As 2024 comes to a close, heres some examples of what Bellingcat investigated per month in our 10th year! 🧵",
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)
)
def test_get_username_tweet_id_from_url(self, url, exptected_username, exptected_tweetid):
def test_reverse_engineer_token(self):
username, tweet_id = self.archiver.get_username_tweet_id(url)
assert exptected_username == username
assert exptected_tweetid == tweet_id
def test_choose_variants(self):
# taken from the response for url https://x.com/bellingcat/status/1871552600346415571
variant_list = [{'content_type': 'application/x-mpegURL', 'url': 'https://video.twimg.com/ext_tw_video/1871551993677852672/pu/pl/ovWo7ux-bKROwYIC.m3u8?tag=12&v=e1b'},
{'bitrate': 256000, 'content_type': 'video/mp4', 'url': 'https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/480x270/OqZIrKV0LFswMvxS.mp4?tag=12'},
{'bitrate': 832000, 'content_type': 'video/mp4', 'url': 'https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/640x360/uiDZDSmZ8MZn9hsi.mp4?tag=12'},
{'bitrate': 2176000, 'content_type': 'video/mp4', 'url': 'https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/1280x720/6Y340Esh568WZnRZ.mp4?tag=12'}
]
chosen_variant = self.archiver.choose_variant(variant_list)
assert chosen_variant == variant_list[3]
@pytest.mark.parametrize("tweet_id, expected_token", [
("1874097816571961839", "4jjngwkifa"),
("1674700676612386816", "42586mwa3uv"),
("1877747914073620506", "4jv4aahw36n"),
("1876710769913450647", "4jruzjz5lux"),
("1346554693649113090", "39ibqxei7mo")
])
def test_reverse_engineer_token(self, tweet_id, expected_token):
# see Vercel's implementation here: https://github.com/vercel/react-tweet/blob/main/packages/react-tweet/src/api/fetch-tweet.ts#L27C1-L31C2
# and the discussion here: https://github.com/JustAnotherArchivist/snscrape/issues/996#issuecomment-2211358215
for tweet_id, real_token in [
("1874097816571961839", "4jjngwkifa"),
("1674700676612386816", "42586mwa3uv"),
("1877747914073620506", "4jv4aahw36n"),
("1876710769913450647", "4jruzjz5lux"),
("1346554693649113090", "39ibqxei7mo"),]:
generated_token = self.archiver.generate_token(tweet_id)
self.assertEqual(real_token, generated_token)
def test_syndication_archiver(self):
generated_token = self.archiver.generate_token(tweet_id)
assert expected_token == generated_token
@pytest.mark.download
def test_youtube_dlp_archiver(self, make_item):
url = "https://x.com/bellingcat/status/1874097816571961839"
post = self.archiver.download_syndication(self.create_item(url), url, "1874097816571961839")
self.assertTrue(post)
post = self.archiver.download_yt_dlp(make_item(url), url, "1874097816571961839")
assert post
self.assertValidResponseMetadata(
post,
"As 2024 comes to a close, heres some examples of what Bellingcat investigated per month in our 10th year! 🧵",
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
"twitter-ytdl"
)
@pytest.mark.download
def test_syndication_archiver(self, make_item):
url = "https://x.com/bellingcat/status/1874097816571961839"
post = self.archiver.download_syndication(make_item(url), url, "1874097816571961839")
assert post
self.assertValidResponseMetadata(
post,
"As 2024 comes to a close, heres some examples of what Bellingcat investigated per month in our 10th year! 🧵",
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)
)
def test_download_nonexistend_tweet(self):
@pytest.mark.download
def test_download_nonexistend_tweet(self, make_item):
# this tweet does not exist
url = "https://x.com/Bellingcat/status/17197025860711058"
response = self.archiver.download(self.create_item(url))
self.assertFalse(response)
def test_download_malformed_tweetid(self):
response = self.archiver.download(make_item(url))
assert not response
@pytest.mark.download
def test_download_malformed_tweetid(self, make_item):
# this tweet does not exist
url = "https://x.com/Bellingcat/status/1719702586071100058"
response = self.archiver.download(self.create_item(url))
self.assertFalse(response)
response = self.archiver.download(make_item(url))
assert not response
def test_download_media_with_images(self):
# url https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w
@pytest.mark.download
def test_download_tweet_no_media(self, make_item):
post = self.archiver.download()
item = make_item("https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w")
post = self.archiver.download(item)
# just make sure twitter haven't changed their format, images should be under "record/embed/media/images"
# there should be 2 images
self.assertTrue("record" in post)
self.assertTrue("embed" in post["record"])
self.assertTrue("media" in post["record"]["embed"])
self.assertTrue("images" in post["record"]["embed"]["media"])
self.assertEqual(len(post["record"]["embed"]["media"]["images"]), 2)
self.assertValidResponseMetadata(
post,
"Onion rings are just vegetable donuts.",
datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
"twitter-ytdl"
)
@pytest.mark.download
def test_download_video(self, make_item):
url = "https://x.com/bellingcat/status/1871552600346415571"
post = self.archiver.download(make_item(url))
self.assertValidResponseMetadata(
post,
"This month's Bellingchat Premium is with @KolinaKoltai. She reveals how she investigated a platform allowing users to create AI-generated child sexual abuse material and explains why it's crucial to investigate the people behind these services https://t.co/SfBUq0hSD0 https://t.co/rIHx0WlKp8",
datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc)
)
# try downloading the media files
media = self.archiver.download(post)
self.assertEqual(len(media), 2)
@pytest.mark.xfail(reason="Currently failing, sensitive content requires logged in users/cookies - not yet implemented")
@pytest.mark.download
@pytest.mark.parametrize("url, title, timestamp, image_hash", [
("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
])
def test_download_sensitive_media(self, url, title, timestamp, image_hash, make_item):
# check the IDs
self.assertTrue("bafkreiflrkfihcvwlhka5tb2opw2qog6gfvywsdzdlibveys2acozh75tq" in media[0].get('src'))
self.assertTrue("bafkreibsprmwchf7r6xcstqkdvvuj3ijw7efciw7l3y4crxr4cmynseo7u" in media[1].get('src'))
"""Download tweets with sensitive media"""
post = self.archiver.download(make_item(url))
self.assertValidResponseMetadata(
post,
title,
timestamp
)
assert len(post.media) == 1
assert post.media[0].hash == image_hash

12
tests/conftest.py Normal file
View File

@@ -0,0 +1,12 @@
import pytest
from auto_archiver.core.metadata import Metadata
@pytest.fixture
def make_item():
def _make_item(url: str, **kwargs) -> Metadata:
item = Metadata().set_url(url)
for key, value in kwargs.items():
item.set(key, value)
return item
return _make_item

View File

@@ -0,0 +1 @@
test1

View File

@@ -0,0 +1 @@
test2

View File

View File

@@ -0,0 +1,22 @@
from auto_archiver.databases.csv_db import CSVDb
from auto_archiver.core import Metadata
def test_store_item(tmp_path):
"""Tests storing an item in the CSV database"""
temp_db = tmp_path / "temp_db.csv"
db = CSVDb({
"csv_db": {"csv_file": temp_db.as_posix()}
})
item = Metadata().set_url("http://example.com").set_title("Example").set_content("Example content").success("my-archiver")
db.done(item)
with open(temp_db, "r", encoding="utf-8") as f:
assert f.read().strip() == f"status,metadata,media\nmy-archiver: success,\"{{'_processed_at': {repr(item.get('_processed_at'))}, 'url': 'http://example.com', 'title': 'Example', 'content': 'Example content'}}\",[]"
# TODO: csv db doesn't have a fetch method - need to add it (?)
# assert db.fetch(item) == item

View File

@@ -0,0 +1,55 @@
import pytest
from auto_archiver.enrichers.hash_enricher import HashEnricher
from auto_archiver.core import Metadata, Media
@pytest.mark.parametrize("algorithm, filename, expected_hash", [
("SHA-256", "tests/data/testfile_1.txt", "1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"),
("SHA-256", "tests/data/testfile_2.txt", "60303ae22b998861bce3b28f33eec1be758a213c86c93c076dbe9f558c11c752"),
("SHA3-512", "tests/data/testfile_1.txt", "d2d8cc4f369b340130bd2b29b8b54e918b7c260c3279176da9ccaa37c96eb71735fc97568e892dc6220bf4ae0d748edb46bd75622751556393be3f482e6f794e"),
("SHA3-512", "tests/data/testfile_2.txt", "e35970edaa1e0d8af7d948491b2da0450a49fd9cc1e83c5db4c6f175f9550cf341f642f6be8cfb0bfa476e4258e5088c5ad549087bf02811132ac2fa22b734c6")
])
def test_calculate_hash(algorithm, filename, expected_hash):
# test SHA-256
he = HashEnricher({"algorithm": algorithm, "chunksize": 1})
assert he.calculate_hash(filename) == expected_hash
def test_default_config_values():
he = HashEnricher(config={})
assert he.algorithm == "SHA-256"
assert he.chunksize == 16000000
def test_invalid_chunksize():
with pytest.raises(AssertionError):
he = HashEnricher({"chunksize": "-100"})
def test_invalid_algorithm():
with pytest.raises(AssertionError):
HashEnricher({"algorithm": "SHA-123"})
def test_config():
# test default config
c = HashEnricher.configs()
assert c["algorithm"]["default"] == "SHA-256"
assert c["chunksize"]["default"] == 16000000
assert c["algorithm"]["choices"] == ["SHA-256", "SHA3-512"]
assert c["algorithm"]["help"] == "hash algorithm to use"
assert c["chunksize"]["help"] == "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"
def test_hash_media():
he = HashEnricher({"algorithm": "SHA-256", "chunksize": 1})
# generate metadata with two test files
m = Metadata().set_url("https://example.com")
# noop - the metadata has no media. Shouldn't fail
he.enrich(m)
m.add_media(Media("tests/data/testfile_1.txt"))
m.add_media(Media("tests/data/testfile_2.txt"))
he.enrich(m)
assert m.media[0].get("hash") == "SHA-256:1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"
assert m.media[1].get("hash") == "SHA-256:60303ae22b998861bce3b28f33eec1be758a213c86c93c076dbe9f558c11c752"

View File

View File

@@ -0,0 +1,17 @@
from auto_archiver.core.context import ArchivingContext
from auto_archiver.formatters.html_formatter import HtmlFormatter
from auto_archiver.core import Metadata, Media
def test_format():
formatter = HtmlFormatter({})
metadata = Metadata().set("content", "Hello, world!").set_url('https://example.com')
final_media = formatter.format(metadata)
assert isinstance(final_media, Media)
assert ".html" in final_media.filename
with open (final_media.filename, "r", encoding="utf-8") as f:
content = f.read()
assert "Hello, world!" in content
assert final_media.mimetype == "text/html"
assert "SHA-256:" in final_media.get('hash')