Compare commits

...

23 Commits

Author SHA1 Message Date
Patrick Robertson
5211c5de18 Merge pull request #210 from bellingcat/logger_fix
Fix issue #200 + Refactor _LAZY_LOADED_MODULES
2025-02-19 15:11:42 +00:00
Erin Clark
6cdefaa751 Merge pull request #194 from bellingcat/tests/add_module_tests
Add unit tests for individual modules.
Includes a couple of small bug fixes and light refactoring.
2025-02-19 13:51:43 +00:00
Patrick Robertson
04507577b6 Version bump 2025-02-19 13:36:50 +00:00
erinhmclark
47a634fc63 Add WACZ, Wayback and local storage tests. 2025-02-19 13:14:08 +00:00
Patrick Robertson
a9802dd004 Remove the global _LAZY_LOADED_MODULES and allow each instance of ArchivingOrchestrator to load its own modules 2025-02-19 12:25:35 +00:00
erinhmclark
a8ffb19325 Fix auth key name for cookies_from_browser. 2025-02-19 10:40:54 +00:00
Patrick Robertson
eb60b271b9 Fix issue #200 2025-02-19 10:35:14 +00:00
erinhmclark
ddf2e76624 Include Atlos Storage __init__.py for module recognition. 2025-02-19 09:24:34 +00:00
erinhmclark
10a5ad62b8 Include Atlos tests, metadata fixture. 2025-02-19 09:18:41 +00:00
erinhmclark
f0fd9bf445 Updates tests to use pytest-mock. 2025-02-18 23:32:03 +00:00
erinhmclark
657fbd357d Merge branch 'main' into tests/add_module_tests 2025-02-18 19:47:47 +00:00
erinhmclark
7b88df72cb Update test_metadata_enricher.py 2025-02-18 19:46:57 +00:00
Patrick Robertson
3c543a3a6a Various fixes for issues with new architecture (#208)
* Add formatters to the TOC - fixes #204

* Add 'steps' settings to the example YAML in the docs. Fixes #206

* Improved docs on authentication architecture

* Fix setting modules on the command line - they now override any module settings in the orchestration as opposed to appending

* Fix tests for gsheet-feeder: add a test service_account.json (note: not real keys in there)

* Rename the command line entrypoint to _command_line_run

Also: make it clear that code implementation should not call this
Make sure the command line entry returns (we don't want a generator)

* Fix unit tests to use now code-entry points

* Version bump

* Move iterating of generator up to __main__

* Breakpoint

* two minor fixes

* Fix unit tests + add new '__main__' entry point implementation test

* Skip youtube tests if running on CI. Should still run them locally

* Fix full implementation run on GH actions

* Fix skipif test for GH Actions CI

* Add skipifs for truth - it blocks GH:

---------

Co-authored-by: msramalho <19508417+msramalho@users.noreply.github.com>
2025-02-18 19:10:09 +00:00
erinhmclark
ce5a200d1f Added tests, updated instagram_tbot_extractor.py raise failure. 2025-02-18 12:59:10 +00:00
erinhmclark
f4c623b11b Merge branch 'main' into tests/add_module_tests 2025-02-17 09:03:04 +00:00
Patrick Robertson
6d43bc7d4d Fix generator programmatic setup (#197)
* Fix returning a generator of a generator

* Move download test test to pytest.mark.download
2025-02-15 17:36:44 +00:00
erinhmclark
8ed3ef2f33 Merge branch 'main' into tests/add_module_tests 2025-02-14 12:47:40 +00:00
erinhmclark
71b41dd901 Remove accidental path, yet again. 2025-02-14 10:05:32 +00:00
erinhmclark
b0756a6a34 Remove accidental full path. 2025-02-14 09:57:44 +00:00
erinhmclark
319c1e8f92 Add more tests. 2025-02-14 09:48:37 +00:00
erinhmclark
3fce593aad Merge branch 'main' into tests/add_module_tests 2025-02-12 19:33:29 +00:00
erinhmclark
cbe98c729d Enricher tests 2025-02-12 19:32:40 +00:00
erinhmclark
d9d936c2ca Thumbnail enricher fix seconds to minutes. 2025-02-12 12:22:27 +00:00
62 changed files with 2696 additions and 747 deletions

View File

@@ -1,6 +1,6 @@
# iterate through all the modules in auto_archiver.modules and turn the __manifest__.py file into a markdown table
from pathlib import Path
from auto_archiver.core.module import available_modules
from auto_archiver.core.module import ModuleFactory
from auto_archiver.core.base_module import BaseModule
from ruamel.yaml import YAML
import io
@@ -19,6 +19,19 @@ type_color = {
TABLE_HEADER = ("Option", "Description", "Default", "Type")
EXAMPLE_YAML = """
# steps configuration
steps:
...
{steps_str}
...
# module configuration
...
{config_string}
"""
def generate_module_docs():
yaml = YAML()
SAVE_FOLDER.mkdir(exist_ok=True)
@@ -28,7 +41,7 @@ def generate_module_docs():
configs_cheatsheet = "\n## Configuration Options\n"
configs_cheatsheet += header_row
for module in sorted(available_modules(with_manifest=True), key=lambda x: (x.requires_setup, x.name)):
for module in sorted(ModuleFactory().available_modules(), key=lambda x: (x.requires_setup, x.name)):
# generate the markdown file from the __manifest__.py file.
manifest = module.manifest
@@ -45,11 +58,14 @@ def generate_module_docs():
```
{description}
"""
steps_str = "\n".join(f" {t}s:\n - {module.name}" for t in manifest['type'])
if not manifest['configs']:
readme_str += "\n*This module has no configuration options.*\n"
config_string = f"# No configuration options for {module.name}.*\n"
else:
config_yaml = {}
config_table = header_row
config_yaml = {}
for key, value in manifest['configs'].items():
type = value.get('type', 'string')
if type == 'auto_archiver.utils.json_loader':
@@ -65,11 +81,14 @@ def generate_module_docs():
configs_cheatsheet += f"| `{module.name}.{key}` | {help} | {default} | {type} |\n"
readme_str += "\n## Configuration Options\n"
readme_str += "\n### YAML\n"
yaml_string = io.BytesIO()
yaml.dump({module.name: config_yaml}, yaml_string)
readme_str += f"```{{code}} yaml\n{yaml_string.getvalue().decode('utf-8')}\n```\n"
config_string = io.BytesIO()
yaml.dump({module.name: config_yaml}, config_string)
config_string = config_string.getvalue().decode('utf-8')
yaml_string = EXAMPLE_YAML.format(steps_str=steps_str, config_string=config_string)
readme_str += f"```{{code}} yaml\n{yaml_string}\n```\n"
if manifest['configs']:
readme_str += "\n### Command Line:\n"
readme_str += config_table
@@ -103,3 +122,7 @@ def generate_index(modules_by_type):
with open(SAVE_FOLDER / "module_list.md", "w") as f:
print("writing", SAVE_FOLDER / "module_list.md")
f.write(readme_str)
if __name__ == "__main__":
generate_module_docs()

View File

@@ -77,3 +77,6 @@ html_theme = 'sphinx_book_theme'
html_static_path = ["../_static"]
html_css_files = ["custom.css"]
copybutton_prompt_text = r">>> |\.\.\."
copybutton_prompt_is_regexp = True
copybutton_only_copy_prompt_lines = False

View File

@@ -24,4 +24,5 @@ modules/extractor
modules/enricher
modules/storage
modules/database
modules/formatter
```

View File

@@ -45,3 +45,10 @@ The "archive location" link contains the path of the archived file, in local sto
![The archive result for a link in the demo sheet.](../demo-archive.png)
---
```{toctree}
:maxdepth: 1
:glob:
how_to/*
```

View File

@@ -0,0 +1,57 @@
# Authentication
The Authentication framework for auto-archiver allows you to add login details for various websites in a flexible way, directly from the configuration file.
There are two main use cases for authentication:
* Some websites require some kind of authentication in order to view the content. Examples include Facebook, Telegram etc.
* Some websites use anti-bot systems to block bot-like tools from accessig the website. Adding real login information to auto-archiver can sometimes bypass this.
## The Authentication Config
You can save your authentication information directly inside your orchestration config file, or as a separate file (for security/multi-deploy purposes). Whether storing your settings inside the orchestration file, or as a separate file, the configuration format is the same.
```{code} yaml
authentication:
# optional file to load authentication information from, for security or multi-system deploy purposes
load_from_file: path/to/authentication/file.txt
# optional setting to load cookies from the named browser on the system.
cookies_from_browser: firefox
# optional setting to load cookies from a cookies.txt/cookies.jar file. See note below on extracting these
cookies_file: path/to/cookies.jar
twitter.com,x.com:
username: myusername
password: 123
facebook.com:
cookie: single_cookie
othersite.com:
api_key: 123
api_secret: 1234
# All available options:
# - username: str - the username to use for login
# - password: str - the password to use for login
# - api_key: str - the API key to use for login
# - api_secret: str - the API secret to use for login
# - cookie: str - a cookie string to use for login (specific to this site)
```
### Recommendations for authentication
1. **Store authentication information separately:**
The authentication part of your configuration contains sensitive information. You should make efforts not to share this with others. For extra security, use the `load_from_file` option to keep your authentication settings out of your configuration file, ideally in a different folder.
2. **Don't use your own personal credentials**
Depending on the website you are extracting information from, there may be rules (Terms of Service) that prohibit you from scraping or extracting information using a bot. If you use your own personal account, there's a possibility it might get blocked/disabled. It's recommended to set up a separate, 'throwaway' account. In that way, if it gets blocked you can easily create another one to continue your archiving.
### How to create a cookies.jar or pass cookies directly to auto-archiver
auto-archiver uses yt-dlp's powerful cookies features under the hood. For instructions on how to extract a cookies.jar (or cookies.txt) file directly from your browser, see the FAQ in the [yt-dlp documentation](https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp)
```{note} For developers:
For information on how to access and use authentication settings from within your module, see the `{generic_extractor}` for an example, or view the [`auth_for_site()` function in BaseModule](../autoapi/core/base_module/index.rst)
```

310
poetry.lock generated
View File

@@ -103,14 +103,14 @@ tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"]
[[package]]
name = "authlib"
version = "1.4.0"
version = "1.4.1"
description = "The ultimate Python library in building OAuth and OpenID Connect servers and clients."
optional = false
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "Authlib-1.4.0-py2.py3-none-any.whl", hash = "sha256:4bb20b978c8b636222b549317c1815e1fe62234fc1c5efe8855d84aebf3a74e3"},
{file = "authlib-1.4.0.tar.gz", hash = "sha256:1c1e6608b5ed3624aeeee136ca7f8c120d6f51f731aa152b153d54741840e1f2"},
{file = "Authlib-1.4.1-py2.py3-none-any.whl", hash = "sha256:edc29c3f6a3e72cd9e9f45fff67fc663a2c364022eb0371c003f22d5405915c1"},
{file = "authlib-1.4.1.tar.gz", hash = "sha256:30ead9ea4993cdbab821dc6e01e818362f92da290c04c7f6a1940f86507a790d"},
]
[package.dependencies]
@@ -134,33 +134,34 @@ tomli = {version = "*", markers = "python_version < \"3.11\""}
[[package]]
name = "babel"
version = "2.16.0"
version = "2.17.0"
description = "Internationalization utilities"
optional = false
python-versions = ">=3.8"
groups = ["docs"]
files = [
{file = "babel-2.16.0-py3-none-any.whl", hash = "sha256:368b5b98b37c06b7daf6696391c3240c938b37767d4584413e8438c5c435fa8b"},
{file = "babel-2.16.0.tar.gz", hash = "sha256:d1f3554ca26605fe173f3de0c65f750f5a42f924499bf134de6423582298e316"},
{file = "babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2"},
{file = "babel-2.17.0.tar.gz", hash = "sha256:0c54cffb19f690cdcc52a3b50bcbf71e07a808d1c80d549f2459b9d2cf0afb9d"},
]
[package.extras]
dev = ["freezegun (>=1.0,<2.0)", "pytest (>=6.0)", "pytest-cov"]
dev = ["backports.zoneinfo", "freezegun (>=1.0,<2.0)", "jinja2 (>=3.0)", "pytest (>=6.0)", "pytest-cov", "pytz", "setuptools", "tzdata"]
[[package]]
name = "beautifulsoup4"
version = "4.12.3"
version = "4.13.3"
description = "Screen-scraping library"
optional = false
python-versions = ">=3.6.0"
python-versions = ">=3.7.0"
groups = ["main", "docs"]
files = [
{file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"},
{file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"},
{file = "beautifulsoup4-4.13.3-py3-none-any.whl", hash = "sha256:99045d7d3f08f91f0d656bc9b7efbae189426cd913d830294a15eefa0ea4df16"},
{file = "beautifulsoup4-4.13.3.tar.gz", hash = "sha256:1bd32405dacc920b42b83ba01644747ed77456a65760e285fbc47633ceddaf8b"},
]
[package.dependencies]
soupsieve = ">1.2"
typing-extensions = ">=4.0.0"
[package.extras]
cchardet = ["cchardet"]
@@ -171,18 +172,18 @@ lxml = ["lxml"]
[[package]]
name = "boto3"
version = "1.36.6"
version = "1.36.22"
description = "The AWS SDK for Python"
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "boto3-1.36.6-py3-none-any.whl", hash = "sha256:6d473f0f340d02b4e9ad5b8e68786a09728101a8b950231b89ebdaf72b6dca21"},
{file = "boto3-1.36.6.tar.gz", hash = "sha256:b36feae061dc0793cf311468956a0a9e99215ce38bc99a1a4e55a5b105f16297"},
{file = "boto3-1.36.22-py3-none-any.whl", hash = "sha256:39957eabdce009353d72d131046489fbbfa15891865d5f069f1e8bfa414e6b81"},
{file = "boto3-1.36.22.tar.gz", hash = "sha256:768c8a4d4a6227fe2258105efa086f1424cba5ca915a5eb2305b2cd979306ad1"},
]
[package.dependencies]
botocore = ">=1.36.6,<1.37.0"
botocore = ">=1.36.22,<1.37.0"
jmespath = ">=0.7.1,<2.0.0"
s3transfer = ">=0.11.0,<0.12.0"
@@ -191,14 +192,14 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
[[package]]
name = "botocore"
version = "1.36.6"
version = "1.36.22"
description = "Low-level, data-driven core of boto 3."
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "botocore-1.36.6-py3-none-any.whl", hash = "sha256:f77bbbb03fb420e260174650fb5c0cc142ec20a96967734eed2b0ef24334ef34"},
{file = "botocore-1.36.6.tar.gz", hash = "sha256:4864c53d638da191a34daf3ede3ff1371a3719d952cc0c6bd24ce2836a38dd77"},
{file = "botocore-1.36.22-py3-none-any.whl", hash = "sha256:75d6b34acb0686ee4d54ff6eb285e78ccfe318407428769d1e3e13351714d890"},
{file = "botocore-1.36.22.tar.gz", hash = "sha256:59520247d5a479731724f97c995d5a1c2aae3b303b324f39d99efcfad1d3019e"},
]
[package.dependencies]
@@ -207,7 +208,7 @@ python-dateutil = ">=2.1,<3.0.0"
urllib3 = {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >= \"3.10\""}
[package.extras]
crt = ["awscrt (==0.23.4)"]
crt = ["awscrt (==0.23.8)"]
[[package]]
name = "brotli"
@@ -674,26 +675,26 @@ typing-inspect = ">=0.4.0,<1"
[[package]]
name = "dateparser"
version = "1.2.0"
version = "1.2.1"
description = "Date parsing library designed to parse dates from HTML pages"
optional = false
python-versions = ">=3.7"
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "dateparser-1.2.0-py2.py3-none-any.whl", hash = "sha256:0b21ad96534e562920a0083e97fd45fa959882d4162acc358705144520a35830"},
{file = "dateparser-1.2.0.tar.gz", hash = "sha256:7975b43a4222283e0ae15be7b4999d08c9a70e2d378ac87385b1ccf2cffbbb30"},
{file = "dateparser-1.2.1-py3-none-any.whl", hash = "sha256:bdcac262a467e6260030040748ad7c10d6bacd4f3b9cdb4cfd2251939174508c"},
{file = "dateparser-1.2.1.tar.gz", hash = "sha256:7e4919aeb48481dbfc01ac9683c8e20bfe95bb715a38c1e9f6af889f4f30ccc3"},
]
[package.dependencies]
python-dateutil = "*"
pytz = "*"
regex = "<2019.02.19 || >2019.02.19,<2021.8.27 || >2021.8.27"
tzlocal = "*"
python-dateutil = ">=2.7.0"
pytz = ">=2024.2"
regex = ">=2015.06.24,<2019.02.19 || >2019.02.19,<2021.8.27 || >2021.8.27"
tzlocal = ">=0.2"
[package.extras]
calendars = ["convertdate", "hijri-converter"]
fasttext = ["fasttext"]
langdetect = ["langdetect"]
calendars = ["convertdate (>=2.2.1)", "hijridate"]
fasttext = ["fasttext (>=0.9.1)", "numpy (>=1.19.3,<2)"]
langdetect = ["langdetect (>=1.0.0)"]
[[package]]
name = "docutils"
@@ -755,14 +756,14 @@ files = [
[[package]]
name = "google-api-core"
version = "2.24.0"
version = "2.24.1"
description = "Google API client core library"
optional = false
python-versions = ">=3.7"
groups = ["main"]
files = [
{file = "google_api_core-2.24.0-py3-none-any.whl", hash = "sha256:10d82ac0fca69c82a25b3efdeefccf6f28e02ebb97925a8cce8edbfe379929d9"},
{file = "google_api_core-2.24.0.tar.gz", hash = "sha256:e255640547a597a4da010876d333208ddac417d60add22b6851a0c66a831fcaf"},
{file = "google_api_core-2.24.1-py3-none-any.whl", hash = "sha256:bc78d608f5a5bf853b80bd70a795f703294de656c096c0968320830a4bc280f1"},
{file = "google_api_core-2.24.1.tar.gz", hash = "sha256:f8b36f5456ab0dd99a1b693a40a31d1e7757beea380ad1b38faaf8941eae9d8a"},
]
[package.dependencies]
@@ -780,14 +781,14 @@ grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"]
[[package]]
name = "google-api-python-client"
version = "2.159.0"
version = "2.161.0"
description = "Google API Client Library for Python"
optional = false
python-versions = ">=3.7"
groups = ["main"]
files = [
{file = "google_api_python_client-2.159.0-py2.py3-none-any.whl", hash = "sha256:baef0bb631a60a0bd7c0bf12a5499e3a40cd4388484de7ee55c1950bf820a0cf"},
{file = "google_api_python_client-2.159.0.tar.gz", hash = "sha256:55197f430f25c907394b44fa078545ffef89d33fd4dca501b7db9f0d8e224bd6"},
{file = "google_api_python_client-2.161.0-py2.py3-none-any.whl", hash = "sha256:9476a5a4f200bae368140453df40f9cda36be53fa7d0e9a9aac4cdb859a26448"},
{file = "google_api_python_client-2.161.0.tar.gz", hash = "sha256:324c0cce73e9ea0a0d2afd5937e01b7c2d6a4d7e2579cdb6c384f9699d6c9f37"},
]
[package.dependencies]
@@ -859,14 +860,14 @@ tool = ["click (>=6.0.0)"]
[[package]]
name = "googleapis-common-protos"
version = "1.66.0"
version = "1.67.0"
description = "Common protobufs used in Google APIs"
optional = false
python-versions = ">=3.7"
groups = ["main"]
files = [
{file = "googleapis_common_protos-1.66.0-py2.py3-none-any.whl", hash = "sha256:d7abcd75fabb2e0ec9f74466401f6c119a0b498e27370e9be4c94cb7e382b8ed"},
{file = "googleapis_common_protos-1.66.0.tar.gz", hash = "sha256:c3e7b33d15fdca5374cc0a7346dd92ffa847425cc4ea941d970f13680052ec8c"},
{file = "googleapis_common_protos-1.67.0-py2.py3-none-any.whl", hash = "sha256:579de760800d13616f51cf8be00c876f00a9f146d3e6510e19d1f4111758b741"},
{file = "googleapis_common_protos-1.67.0.tar.gz", hash = "sha256:21398025365f138be356d5923e9168737d94d46a72aefee4a6110a1f23463c86"},
]
[package.dependencies]
@@ -1158,14 +1159,14 @@ files = [
[[package]]
name = "marshmallow"
version = "3.26.0"
version = "3.26.1"
description = "A lightweight library for converting complex datatypes to and from native Python datatypes."
optional = false
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "marshmallow-3.26.0-py3-none-any.whl", hash = "sha256:1287bca04e6a5f4094822ac153c03da5e214a0a60bcd557b140f3e66991b8ca1"},
{file = "marshmallow-3.26.0.tar.gz", hash = "sha256:eb36762a1cc76d7abf831e18a3a1b26d3d481bbc74581b8e532a3d3a8115e1cb"},
{file = "marshmallow-3.26.1-py3-none-any.whl", hash = "sha256:3350409f20a70a7e4e11a27661187b77cdcaeb20abca41c1454fe33636bea09c"},
{file = "marshmallow-3.26.1.tar.gz", hash = "sha256:e6d8affb6cb61d39d26402096dc0aee12d5a26d490a121f118d2e81dc0719dc6"},
]
[package.dependencies]
@@ -1234,14 +1235,14 @@ files = [
[[package]]
name = "myst-parser"
version = "4.0.0"
version = "4.0.1"
description = "An extended [CommonMark](https://spec.commonmark.org/) compliant parser,"
optional = false
python-versions = ">=3.10"
groups = ["docs"]
files = [
{file = "myst_parser-4.0.0-py3-none-any.whl", hash = "sha256:b9317997552424448c6096c2558872fdb6f81d3ecb3a40ce84a7518798f3f28d"},
{file = "myst_parser-4.0.0.tar.gz", hash = "sha256:851c9dfb44e36e56d15d05e72f02b80da21a9e0d07cba96baf5e2d476bb91531"},
{file = "myst_parser-4.0.1-py3-none-any.whl", hash = "sha256:9134e88959ec3b5780aedf8a99680ea242869d012e8821db3126d427edc9c95d"},
{file = "myst_parser-4.0.1.tar.gz", hash = "sha256:5cfea715e4f3574138aecbf7d54132296bfd72bb614d31168f48c477a830a7c4"},
]
[package.dependencies]
@@ -1253,10 +1254,10 @@ pyyaml = "*"
sphinx = ">=7,<9"
[package.extras]
code-style = ["pre-commit (>=3.0,<4.0)"]
code-style = ["pre-commit (>=4.0,<5.0)"]
linkify = ["linkify-it-py (>=2.0,<3.0)"]
rtd = ["ipython", "sphinx (>=7)", "sphinx-autodoc2 (>=0.5.0,<0.6.0)", "sphinx-book-theme (>=1.1,<2.0)", "sphinx-copybutton", "sphinx-design", "sphinx-pyscript", "sphinx-tippy (>=0.4.3)", "sphinx-togglebutton", "sphinxext-opengraph (>=0.9.0,<0.10.0)", "sphinxext-rediraffe (>=0.2.7,<0.3.0)"]
testing = ["beautifulsoup4", "coverage[toml]", "defusedxml", "pytest (>=8,<9)", "pytest-cov", "pytest-param-files (>=0.6.0,<0.7.0)", "pytest-regressions", "sphinx-pytest"]
testing = ["beautifulsoup4", "coverage[toml]", "defusedxml", "pygments (<2.19)", "pytest (>=8,<9)", "pytest-cov", "pytest-param-files (>=0.6.0,<0.7.0)", "pytest-regressions", "sphinx-pytest"]
testing-docutils = ["pygments", "pytest (>=8,<9)", "pytest-param-files (>=0.6.0,<0.7.0)"]
[[package]]
@@ -1530,14 +1531,14 @@ testing = ["pytest", "pytest-benchmark"]
[[package]]
name = "proto-plus"
version = "1.25.0"
description = "Beautiful, Pythonic protocol buffers."
version = "1.26.0"
description = "Beautiful, Pythonic protocol buffers"
optional = false
python-versions = ">=3.7"
groups = ["main"]
files = [
{file = "proto_plus-1.25.0-py3-none-any.whl", hash = "sha256:c91fc4a65074ade8e458e95ef8bac34d4008daa7cce4a12d6707066fca648961"},
{file = "proto_plus-1.25.0.tar.gz", hash = "sha256:fbb17f57f7bd05a68b7707e745e26528b0b3c34e378db91eef93912c54982d91"},
{file = "proto_plus-1.26.0-py3-none-any.whl", hash = "sha256:bf2dfaa3da281fc3187d12d224c707cb57214fb2c22ba854eb0c105a3fb2d4d7"},
{file = "proto_plus-1.26.0.tar.gz", hash = "sha256:6e93d5f5ca267b54300880fff156b6a3386b3fa3f43b1da62e680fc0c586ef22"},
]
[package.dependencies]
@@ -1814,6 +1815,24 @@ loguru = "*"
[package.extras]
test = ["pytest", "pytest-cov"]
[[package]]
name = "pytest-mock"
version = "3.14.0"
description = "Thin-wrapper around the mock package for easier use with pytest"
optional = false
python-versions = ">=3.8"
groups = ["dev"]
files = [
{file = "pytest-mock-3.14.0.tar.gz", hash = "sha256:2719255a1efeceadbc056d6bf3df3d1c5015530fb40cf347c0f9afac88410bd0"},
{file = "pytest_mock-3.14.0-py3-none-any.whl", hash = "sha256:0b72c38033392a5f4621342fe11e9219ac11ec9d375f8e2a0c164539e0d70f6f"},
]
[package.dependencies]
pytest = ">=6.2.5"
[package.extras]
dev = ["pre-commit", "pytest-asyncio", "tox"]
[[package]]
name = "python-dateutil"
version = "2.9.0.post0"
@@ -1866,14 +1885,14 @@ requests = ">=2.28"
[[package]]
name = "pytz"
version = "2024.2"
version = "2025.1"
description = "World timezone definitions, modern and historical"
optional = false
python-versions = "*"
groups = ["main"]
files = [
{file = "pytz-2024.2-py2.py3-none-any.whl", hash = "sha256:31c7c1817eb7fae7ca4b8c7ee50c72f93aa2dd863de768e1ef4245d426aa0725"},
{file = "pytz-2024.2.tar.gz", hash = "sha256:2aa355083c50a0f93fa581709deac0c9ad65cca8a9e9beac660adcbd493c798a"},
{file = "pytz-2025.1-py2.py3-none-any.whl", hash = "sha256:89dd22dca55b46eac6eda23b2d72721bf1bdfef212645d81513ef5d03038de57"},
{file = "pytz-2025.1.tar.gz", hash = "sha256:c2db42be2a2518b28e65f9207c4d05e6ff547d1efa4086469ef855e4ab70178e"},
]
[[package]]
@@ -2122,14 +2141,14 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"]
[[package]]
name = "rich-argparse"
version = "1.6.0"
version = "1.7.0"
description = "Rich help formatters for argparse and optparse"
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "rich_argparse-1.6.0-py3-none-any.whl", hash = "sha256:fbe70a1d821b3f2fa8958cddf0cae131870a6e9faa04ab52b409cb1eda809bd7"},
{file = "rich_argparse-1.6.0.tar.gz", hash = "sha256:092083c30da186f25bcdff8b1d47fdfb571288510fb051e0488a72cc3128de13"},
{file = "rich_argparse-1.7.0-py3-none-any.whl", hash = "sha256:b8ec8943588e9731967f4f97b735b03dc127c416f480a083060433a97baf2fd3"},
{file = "rich_argparse-1.7.0.tar.gz", hash = "sha256:f31d809c465ee43f367d599ccaf88b73bc2c4d75d74ed43f2d538838c53544ba"},
]
[package.dependencies]
@@ -2362,24 +2381,24 @@ test = ["cython (>=3.0)", "defusedxml (>=0.7.1)", "pytest (>=8.0)", "setuptools
[[package]]
name = "sphinx-autoapi"
version = "3.4.0"
version = "3.6.0"
description = "Sphinx API documentation generator"
optional = false
python-versions = ">=3.8"
python-versions = ">=3.9"
groups = ["docs"]
files = [
{file = "sphinx_autoapi-3.4.0-py3-none-any.whl", hash = "sha256:4027fef2875a22c5f2a57107c71641d82f6166bf55beb407a47aaf3ef14e7b92"},
{file = "sphinx_autoapi-3.4.0.tar.gz", hash = "sha256:e6d5371f9411bbb9fca358c00a9e57aef3ac94cbfc5df4bab285946462f69e0c"},
{file = "sphinx_autoapi-3.6.0-py3-none-any.whl", hash = "sha256:f3b66714493cab140b0e896d33ce7137654a16ac1edb6563edcbd47bf975f711"},
{file = "sphinx_autoapi-3.6.0.tar.gz", hash = "sha256:c685f274e41d0842ae7e199460c322c4bd7fec816ccc2da8d806094b4f64af06"},
]
[package.dependencies]
astroid = [
{version = ">=2.7", markers = "python_version < \"3.12\""},
{version = ">=3.0.0a1", markers = "python_version >= \"3.12\""},
{version = ">=3", markers = "python_version >= \"3.12\""},
]
Jinja2 = "*"
PyYAML = "*"
sphinx = ">=6.1.0"
sphinx = ">=7.4.0"
[[package]]
name = "sphinx-autobuild"
@@ -2679,14 +2698,14 @@ telegram = ["requests"]
[[package]]
name = "trio"
version = "0.28.0"
version = "0.29.0"
description = "A friendly Python library for async concurrency and I/O"
optional = false
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "trio-0.28.0-py3-none-any.whl", hash = "sha256:56d58977acc1635735a96581ec70513cc781b8b6decd299c487d3be2a721cd94"},
{file = "trio-0.28.0.tar.gz", hash = "sha256:4e547896fe9e8a5658e54e4c7c5fa1db748cbbbaa7c965e7d40505b928c73c05"},
{file = "trio-0.29.0-py3-none-any.whl", hash = "sha256:d8c463f1a9cc776ff63e331aba44c125f423a5a13c684307e828d930e625ba66"},
{file = "trio-0.29.0.tar.gz", hash = "sha256:ea0d3967159fc130acb6939a0be0e558e364fee26b5deeecc893a6b08c361bdf"},
]
[package.dependencies]
@@ -2700,18 +2719,19 @@ sortedcontainers = "*"
[[package]]
name = "trio-websocket"
version = "0.11.1"
version = "0.12.1"
description = "WebSocket library for Trio"
optional = false
python-versions = ">=3.7"
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "trio-websocket-0.11.1.tar.gz", hash = "sha256:18c11793647703c158b1f6e62de638acada927344d534e3c7628eedcb746839f"},
{file = "trio_websocket-0.11.1-py3-none-any.whl", hash = "sha256:520d046b0d030cf970b8b2b2e00c4c2245b3807853ecd44214acd33d74581638"},
{file = "trio_websocket-0.12.1-py3-none-any.whl", hash = "sha256:608ec746bb287e5d5a66baf483e41194193c5cf05ffaad6240e7d1fcd80d1e6f"},
{file = "trio_websocket-0.12.1.tar.gz", hash = "sha256:d55ccd4d3eae27c494f3fdae14823317839bdcb8214d1173eacc4d42c69fc91b"},
]
[package.dependencies]
exceptiongroup = {version = "*", markers = "python_version < \"3.11\""}
outcome = ">=1.2.0"
trio = ">=0.11"
wsproto = ">=0.14"
@@ -2778,14 +2798,14 @@ files = [
[[package]]
name = "tzlocal"
version = "5.2"
version = "5.3"
description = "tzinfo object for the local timezone"
optional = false
python-versions = ">=3.8"
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "tzlocal-5.2-py3-none-any.whl", hash = "sha256:49816ef2fe65ea8ac19d19aa7a1ae0551c834303d5014c6d5a62e4cbda8047b8"},
{file = "tzlocal-5.2.tar.gz", hash = "sha256:8d399205578f1a9342816409cc1e46a93ebd5755e39ea2d85334bea911bf0e6e"},
{file = "tzlocal-5.3-py3-none-any.whl", hash = "sha256:3814135a1bb29763c6e4f08fd6e41dbb435c7a60bfbb03270211bcc537187d8c"},
{file = "tzlocal-5.3.tar.gz", hash = "sha256:2fafbfc07e9d8b49ade18f898d6bcd37ae88ce3ad6486842a2e4f03af68323d2"},
]
[package.dependencies]
@@ -3031,81 +3051,81 @@ test = ["websockets"]
[[package]]
name = "websockets"
version = "14.2"
version = "15.0"
description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)"
optional = false
python-versions = ">=3.9"
groups = ["main", "docs"]
files = [
{file = "websockets-14.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e8179f95323b9ab1c11723e5d91a89403903f7b001828161b480a7810b334885"},
{file = "websockets-14.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0d8c3e2cdb38f31d8bd7d9d28908005f6fa9def3324edb9bf336d7e4266fd397"},
{file = "websockets-14.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:714a9b682deb4339d39ffa674f7b674230227d981a37d5d174a4a83e3978a610"},
{file = "websockets-14.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2e53c72052f2596fb792a7acd9704cbc549bf70fcde8a99e899311455974ca3"},
{file = "websockets-14.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e3fbd68850c837e57373d95c8fe352203a512b6e49eaae4c2f4088ef8cf21980"},
{file = "websockets-14.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b27ece32f63150c268593d5fdb82819584831a83a3f5809b7521df0685cd5d8"},
{file = "websockets-14.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4daa0faea5424d8713142b33825fff03c736f781690d90652d2c8b053345b0e7"},
{file = "websockets-14.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:bc63cee8596a6ec84d9753fd0fcfa0452ee12f317afe4beae6b157f0070c6c7f"},
{file = "websockets-14.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7a570862c325af2111343cc9b0257b7119b904823c675b22d4ac547163088d0d"},
{file = "websockets-14.2-cp310-cp310-win32.whl", hash = "sha256:75862126b3d2d505e895893e3deac0a9339ce750bd27b4ba515f008b5acf832d"},
{file = "websockets-14.2-cp310-cp310-win_amd64.whl", hash = "sha256:cc45afb9c9b2dc0852d5c8b5321759cf825f82a31bfaf506b65bf4668c96f8b2"},
{file = "websockets-14.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3bdc8c692c866ce5fefcaf07d2b55c91d6922ac397e031ef9b774e5b9ea42166"},
{file = "websockets-14.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c93215fac5dadc63e51bcc6dceca72e72267c11def401d6668622b47675b097f"},
{file = "websockets-14.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1c9b6535c0e2cf8a6bf938064fb754aaceb1e6a4a51a80d884cd5db569886910"},
{file = "websockets-14.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a52a6d7cf6938e04e9dceb949d35fbdf58ac14deea26e685ab6368e73744e4c"},
{file = "websockets-14.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9f05702e93203a6ff5226e21d9b40c037761b2cfb637187c9802c10f58e40473"},
{file = "websockets-14.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22441c81a6748a53bfcb98951d58d1af0661ab47a536af08920d129b4d1c3473"},
{file = "websockets-14.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:efd9b868d78b194790e6236d9cbc46d68aba4b75b22497eb4ab64fa640c3af56"},
{file = "websockets-14.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1a5a20d5843886d34ff8c57424cc65a1deda4375729cbca4cb6b3353f3ce4142"},
{file = "websockets-14.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:34277a29f5303d54ec6468fb525d99c99938607bc96b8d72d675dee2b9f5bf1d"},
{file = "websockets-14.2-cp311-cp311-win32.whl", hash = "sha256:02687db35dbc7d25fd541a602b5f8e451a238ffa033030b172ff86a93cb5dc2a"},
{file = "websockets-14.2-cp311-cp311-win_amd64.whl", hash = "sha256:862e9967b46c07d4dcd2532e9e8e3c2825e004ffbf91a5ef9dde519ee2effb0b"},
{file = "websockets-14.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:1f20522e624d7ffbdbe259c6b6a65d73c895045f76a93719aa10cd93b3de100c"},
{file = "websockets-14.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:647b573f7d3ada919fd60e64d533409a79dcf1ea21daeb4542d1d996519ca967"},
{file = "websockets-14.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6af99a38e49f66be5a64b1e890208ad026cda49355661549c507152113049990"},
{file = "websockets-14.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:091ab63dfc8cea748cc22c1db2814eadb77ccbf82829bac6b2fbe3401d548eda"},
{file = "websockets-14.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b374e8953ad477d17e4851cdc66d83fdc2db88d9e73abf755c94510ebddceb95"},
{file = "websockets-14.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a39d7eceeea35db85b85e1169011bb4321c32e673920ae9c1b6e0978590012a3"},
{file = "websockets-14.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0a6f3efd47ffd0d12080594f434faf1cd2549b31e54870b8470b28cc1d3817d9"},
{file = "websockets-14.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:065ce275e7c4ffb42cb738dd6b20726ac26ac9ad0a2a48e33ca632351a737267"},
{file = "websockets-14.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e9d0e53530ba7b8b5e389c02282f9d2aa47581514bd6049d3a7cffe1385cf5fe"},
{file = "websockets-14.2-cp312-cp312-win32.whl", hash = "sha256:20e6dd0984d7ca3037afcb4494e48c74ffb51e8013cac71cf607fffe11df7205"},
{file = "websockets-14.2-cp312-cp312-win_amd64.whl", hash = "sha256:44bba1a956c2c9d268bdcdf234d5e5ff4c9b6dc3e300545cbe99af59dda9dcce"},
{file = "websockets-14.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:6f1372e511c7409a542291bce92d6c83320e02c9cf392223272287ce55bc224e"},
{file = "websockets-14.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4da98b72009836179bb596a92297b1a61bb5a830c0e483a7d0766d45070a08ad"},
{file = "websockets-14.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f8a86a269759026d2bde227652b87be79f8a734e582debf64c9d302faa1e9f03"},
{file = "websockets-14.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86cf1aaeca909bf6815ea714d5c5736c8d6dd3a13770e885aafe062ecbd04f1f"},
{file = "websockets-14.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9b0f6c3ba3b1240f602ebb3971d45b02cc12bd1845466dd783496b3b05783a5"},
{file = "websockets-14.2-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:669c3e101c246aa85bc8534e495952e2ca208bd87994650b90a23d745902db9a"},
{file = "websockets-14.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:eabdb28b972f3729348e632ab08f2a7b616c7e53d5414c12108c29972e655b20"},
{file = "websockets-14.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2066dc4cbcc19f32c12a5a0e8cc1b7ac734e5b64ac0a325ff8353451c4b15ef2"},
{file = "websockets-14.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ab95d357cd471df61873dadf66dd05dd4709cae001dd6342edafc8dc6382f307"},
{file = "websockets-14.2-cp313-cp313-win32.whl", hash = "sha256:a9e72fb63e5f3feacdcf5b4ff53199ec8c18d66e325c34ee4c551ca748623bbc"},
{file = "websockets-14.2-cp313-cp313-win_amd64.whl", hash = "sha256:b439ea828c4ba99bb3176dc8d9b933392a2413c0f6b149fdcba48393f573377f"},
{file = "websockets-14.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7cd5706caec1686c5d233bc76243ff64b1c0dc445339bd538f30547e787c11fe"},
{file = "websockets-14.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ec607328ce95a2f12b595f7ae4c5d71bf502212bddcea528290b35c286932b12"},
{file = "websockets-14.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:da85651270c6bfb630136423037dd4975199e5d4114cae6d3066641adcc9d1c7"},
{file = "websockets-14.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c3ecadc7ce90accf39903815697917643f5b7cfb73c96702318a096c00aa71f5"},
{file = "websockets-14.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1979bee04af6a78608024bad6dfcc0cc930ce819f9e10342a29a05b5320355d0"},
{file = "websockets-14.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2dddacad58e2614a24938a50b85969d56f88e620e3f897b7d80ac0d8a5800258"},
{file = "websockets-14.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:89a71173caaf75fa71a09a5f614f450ba3ec84ad9fca47cb2422a860676716f0"},
{file = "websockets-14.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:6af6a4b26eea4fc06c6818a6b962a952441e0e39548b44773502761ded8cc1d4"},
{file = "websockets-14.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:80c8efa38957f20bba0117b48737993643204645e9ec45512579132508477cfc"},
{file = "websockets-14.2-cp39-cp39-win32.whl", hash = "sha256:2e20c5f517e2163d76e2729104abc42639c41cf91f7b1839295be43302713661"},
{file = "websockets-14.2-cp39-cp39-win_amd64.whl", hash = "sha256:b4c8cef610e8d7c70dea92e62b6814a8cd24fbd01d7103cc89308d2bfe1659ef"},
{file = "websockets-14.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:d7d9cafbccba46e768be8a8ad4635fa3eae1ffac4c6e7cb4eb276ba41297ed29"},
{file = "websockets-14.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:c76193c1c044bd1e9b3316dcc34b174bbf9664598791e6fb606d8d29000e070c"},
{file = "websockets-14.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd475a974d5352390baf865309fe37dec6831aafc3014ffac1eea99e84e83fc2"},
{file = "websockets-14.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2c6c0097a41968b2e2b54ed3424739aab0b762ca92af2379f152c1aef0187e1c"},
{file = "websockets-14.2-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d7ff794c8b36bc402f2e07c0b2ceb4a2424147ed4785ff03e2a7af03711d60a"},
{file = "websockets-14.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:dec254fcabc7bd488dab64846f588fc5b6fe0d78f641180030f8ea27b76d72c3"},
{file = "websockets-14.2-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:bbe03eb853e17fd5b15448328b4ec7fb2407d45fb0245036d06a3af251f8e48f"},
{file = "websockets-14.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:a3c4aa3428b904d5404a0ed85f3644d37e2cb25996b7f096d77caeb0e96a3b42"},
{file = "websockets-14.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:577a4cebf1ceaf0b65ffc42c54856214165fb8ceeba3935852fc33f6b0c55e7f"},
{file = "websockets-14.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ad1c1d02357b7665e700eca43a31d52814ad9ad9b89b58118bdabc365454b574"},
{file = "websockets-14.2-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f390024a47d904613577df83ba700bd189eedc09c57af0a904e5c39624621270"},
{file = "websockets-14.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:3c1426c021c38cf92b453cdf371228d3430acd775edee6bac5a4d577efc72365"},
{file = "websockets-14.2-py3-none-any.whl", hash = "sha256:7a6ceec4ea84469f15cf15807a747e9efe57e369c384fa86e022b3bea679b79b"},
{file = "websockets-14.2.tar.gz", hash = "sha256:5059ed9c54945efb321f097084b4c7e52c246f2c869815876a69d1efc4ad6eb5"},
{file = "websockets-15.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5e6ee18a53dd5743e6155b8ff7e8e477c25b29b440f87f65be8165275c87fef0"},
{file = "websockets-15.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ee06405ea2e67366a661ed313e14cf2a86e84142a3462852eb96348f7219cee3"},
{file = "websockets-15.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8711682a629bbcaf492f5e0af72d378e976ea1d127a2d47584fa1c2c080b436b"},
{file = "websockets-15.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94c4a9b01eede952442c088d415861b0cf2053cbd696b863f6d5022d4e4e2453"},
{file = "websockets-15.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:45535fead66e873f411c1d3cf0d3e175e66f4dd83c4f59d707d5b3e4c56541c4"},
{file = "websockets-15.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e389efe46ccb25a1f93d08c7a74e8123a2517f7b7458f043bd7529d1a63ffeb"},
{file = "websockets-15.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:67a04754d121ea5ca39ddedc3f77071651fb5b0bc6b973c71c515415b44ed9c5"},
{file = "websockets-15.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:bd66b4865c8b853b8cca7379afb692fc7f52cf898786537dfb5e5e2d64f0a47f"},
{file = "websockets-15.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a4cc73a6ae0a6751b76e69cece9d0311f054da9b22df6a12f2c53111735657c8"},
{file = "websockets-15.0-cp310-cp310-win32.whl", hash = "sha256:89da58e4005e153b03fe8b8794330e3f6a9774ee9e1c3bd5bc52eb098c3b0c4f"},
{file = "websockets-15.0-cp310-cp310-win_amd64.whl", hash = "sha256:4ff380aabd7a74a42a760ee76c68826a8f417ceb6ea415bd574a035a111fd133"},
{file = "websockets-15.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:dd24c4d256558429aeeb8d6c24ebad4e982ac52c50bc3670ae8646c181263965"},
{file = "websockets-15.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f83eca8cbfd168e424dfa3b3b5c955d6c281e8fc09feb9d870886ff8d03683c7"},
{file = "websockets-15.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4095a1f2093002c2208becf6f9a178b336b7572512ee0a1179731acb7788e8ad"},
{file = "websockets-15.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb915101dfbf318486364ce85662bb7b020840f68138014972c08331458d41f3"},
{file = "websockets-15.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:45d464622314973d78f364689d5dbb9144e559f93dca11b11af3f2480b5034e1"},
{file = "websockets-15.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ace960769d60037ca9625b4c578a6f28a14301bd2a1ff13bb00e824ac9f73e55"},
{file = "websockets-15.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c7cd4b1015d2f60dfe539ee6c95bc968d5d5fad92ab01bb5501a77393da4f596"},
{file = "websockets-15.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:4f7290295794b5dec470867c7baa4a14182b9732603fd0caf2a5bf1dc3ccabf3"},
{file = "websockets-15.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3abd670ca7ce230d5a624fd3d55e055215d8d9b723adee0a348352f5d8d12ff4"},
{file = "websockets-15.0-cp311-cp311-win32.whl", hash = "sha256:110a847085246ab8d4d119632145224d6b49e406c64f1bbeed45c6f05097b680"},
{file = "websockets-15.0-cp311-cp311-win_amd64.whl", hash = "sha256:8d7bbbe2cd6ed80aceef2a14e9f1c1b61683194c216472ed5ff33b700e784e37"},
{file = "websockets-15.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:cccc18077acd34c8072578394ec79563664b1c205f7a86a62e94fafc7b59001f"},
{file = "websockets-15.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d4c22992e24f12de340ca5f824121a5b3e1a37ad4360b4e1aaf15e9d1c42582d"},
{file = "websockets-15.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1206432cc6c644f6fc03374b264c5ff805d980311563202ed7fef91a38906276"},
{file = "websockets-15.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d3cc75ef3e17490042c47e0523aee1bcc4eacd2482796107fd59dd1100a44bc"},
{file = "websockets-15.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b89504227a5311610e4be16071465885a0a3d6b0e82e305ef46d9b064ce5fb72"},
{file = "websockets-15.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56e3efe356416bc67a8e093607315951d76910f03d2b3ad49c4ade9207bf710d"},
{file = "websockets-15.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0f2205cdb444a42a7919690238fb5979a05439b9dbb73dd47c863d39640d85ab"},
{file = "websockets-15.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:aea01f40995fa0945c020228ab919b8dfc93fc8a9f2d3d705ab5b793f32d9e99"},
{file = "websockets-15.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a9f8e33747b1332db11cf7fcf4a9512bef9748cb5eb4d3f7fbc8c30d75dc6ffc"},
{file = "websockets-15.0-cp312-cp312-win32.whl", hash = "sha256:32e02a2d83f4954aa8c17e03fe8ec6962432c39aca4be7e8ee346b05a3476904"},
{file = "websockets-15.0-cp312-cp312-win_amd64.whl", hash = "sha256:ffc02b159b65c05f2ed9ec176b715b66918a674bd4daed48a9a7a590dd4be1aa"},
{file = "websockets-15.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:d2244d8ab24374bed366f9ff206e2619345f9cd7fe79aad5225f53faac28b6b1"},
{file = "websockets-15.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3a302241fbe825a3e4fe07666a2ab513edfdc6d43ce24b79691b45115273b5e7"},
{file = "websockets-15.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:10552fed076757a70ba2c18edcbc601c7637b30cdfe8c24b65171e824c7d6081"},
{file = "websockets-15.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c53f97032b87a406044a1c33d1e9290cc38b117a8062e8a8b285175d7e2f99c9"},
{file = "websockets-15.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1caf951110ca757b8ad9c4974f5cac7b8413004d2f29707e4d03a65d54cedf2b"},
{file = "websockets-15.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8bf1ab71f9f23b0a1d52ec1682a3907e0c208c12fef9c3e99d2b80166b17905f"},
{file = "websockets-15.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bfcd3acc1a81f106abac6afd42327d2cf1e77ec905ae11dc1d9142a006a496b6"},
{file = "websockets-15.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:c8c5c8e1bac05ef3c23722e591ef4f688f528235e2480f157a9cfe0a19081375"},
{file = "websockets-15.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:86bfb52a9cfbcc09aba2b71388b0a20ea5c52b6517c0b2e316222435a8cdab72"},
{file = "websockets-15.0-cp313-cp313-win32.whl", hash = "sha256:26ba70fed190708551c19a360f9d7eca8e8c0f615d19a574292b7229e0ae324c"},
{file = "websockets-15.0-cp313-cp313-win_amd64.whl", hash = "sha256:ae721bcc8e69846af00b7a77a220614d9b2ec57d25017a6bbde3a99473e41ce8"},
{file = "websockets-15.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c348abc5924caa02a62896300e32ea80a81521f91d6db2e853e6b1994017c9f6"},
{file = "websockets-15.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5294fcb410ed0a45d5d1cdedc4e51a60aab5b2b3193999028ea94afc2f554b05"},
{file = "websockets-15.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c24ba103ecf45861e2e1f933d40b2d93f5d52d8228870c3e7bf1299cd1cb8ff1"},
{file = "websockets-15.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc8821a03bcfb36e4e4705316f6b66af28450357af8a575dc8f4b09bf02a3dee"},
{file = "websockets-15.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffc5ae23ada6515f31604f700009e2df90b091b67d463a8401c1d8a37f76c1d7"},
{file = "websockets-15.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ac67b542505186b3bbdaffbc303292e1ee9c8729e5d5df243c1f20f4bb9057e"},
{file = "websockets-15.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:c86dc2068f1c5ca2065aca34f257bbf4f78caf566eb230f692ad347da191f0a1"},
{file = "websockets-15.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:30cff3ef329682b6182c01c568f551481774c476722020b8f7d0daacbed07a17"},
{file = "websockets-15.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:98dcf978d4c6048965d1762abd534c9d53bae981a035bfe486690ba11f49bbbb"},
{file = "websockets-15.0-cp39-cp39-win32.whl", hash = "sha256:37d66646f929ae7c22c79bc73ec4074d6db45e6384500ee3e0d476daf55482a9"},
{file = "websockets-15.0-cp39-cp39-win_amd64.whl", hash = "sha256:24d5333a9b2343330f0f4eb88546e2c32a7f5c280f8dd7d3cc079beb0901781b"},
{file = "websockets-15.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:b499caef4bca9cbd0bd23cd3386f5113ee7378094a3cb613a2fa543260fe9506"},
{file = "websockets-15.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:17f2854c6bd9ee008c4b270f7010fe2da6c16eac5724a175e75010aacd905b31"},
{file = "websockets-15.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:89f72524033abbfde880ad338fd3c2c16e31ae232323ebdfbc745cbb1b3dcc03"},
{file = "websockets-15.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1657a9eecb29d7838e3b415458cc494e6d1b194f7ac73a34aa55c6fb6c72d1f3"},
{file = "websockets-15.0-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e413352a921f5ad5d66f9e2869b977e88d5103fc528b6deb8423028a2befd842"},
{file = "websockets-15.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:8561c48b0090993e3b2a54db480cab1d23eb2c5735067213bb90f402806339f5"},
{file = "websockets-15.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:190bc6ef8690cd88232a038d1b15714c258f79653abad62f7048249b09438af3"},
{file = "websockets-15.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:327adab7671f3726b0ba69be9e865bba23b37a605b585e65895c428f6e47e766"},
{file = "websockets-15.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd8ef197c87afe0a9009f7a28b5dc613bfc585d329f80b7af404e766aa9e8c7"},
{file = "websockets-15.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:789c43bf4a10cd067c24c321238e800b8b2716c863ddb2294d2fed886fa5a689"},
{file = "websockets-15.0-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7394c0b7d460569c9285fa089a429f58465db930012566c03046f9e3ab0ed181"},
{file = "websockets-15.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2ea4f210422b912ebe58ef0ad33088bc8e5c5ff9655a8822500690abc3b1232d"},
{file = "websockets-15.0-py3-none-any.whl", hash = "sha256:51ffd53c53c4442415b613497a34ba0aa7b99ac07f1e4a62db5dcd640ae6c3c3"},
{file = "websockets-15.0.tar.gz", hash = "sha256:ca36151289a15b39d8d683fd8b7abbe26fc50be311066c5f8dcf3cb8cee107ab"},
]
[[package]]
@@ -3164,4 +3184,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"]
[metadata]
lock-version = "2.1"
python-versions = ">=3.10,<3.13"
content-hash = "b3a6142d6495bc4c8741e9411d29352af219851e4b84b263f991e1bb6db1614e"
content-hash = "2d0a953383901fe12e97f6f56a76a9d8008788695425792eedbf739a18585188"

View File

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
[project]
name = "auto-archiver"
version = "0.13.2"
version = "0.13.4"
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
requires-python = ">=3.10,<3.13"
@@ -63,6 +63,7 @@ dependencies = [
pytest = "^8.3.4"
autopep8 = "^2.3.1"
pytest-loguru = "^0.4.0"
pytest-mock = "^3.14.0"
[tool.poetry.group.docs.dependencies]
sphinx = "^8.1.3"

View File

@@ -3,7 +3,7 @@ from auto_archiver.core.orchestrator import ArchivingOrchestrator
import sys
def main():
ArchivingOrchestrator().run(sys.argv[1:])
for _ in ArchivingOrchestrator()._command_line_run(sys.argv[1:]): pass
if __name__ == "__main__":
main()

View File

@@ -3,7 +3,7 @@
"""
from .metadata import Metadata
from .media import Media
from .module import BaseModule
from .base_module import BaseModule
# cannot import ArchivingOrchestrator/Config to avoid circular dep
# from .orchestrator import ArchivingOrchestrator

View File

@@ -1,13 +1,18 @@
from urllib.parse import urlparse
from typing import Mapping, Any
from __future__ import annotations
from typing import Mapping, Any, Type, TYPE_CHECKING
from abc import ABC
from copy import deepcopy, copy
from tempfile import TemporaryDirectory
from auto_archiver.utils import url as UrlUtil
from auto_archiver.core.consts import MODULE_TYPES as CONF_MODULE_TYPES
from loguru import logger
if TYPE_CHECKING:
from .module import ModuleFactory
class BaseModule(ABC):
"""
@@ -17,41 +22,24 @@ class BaseModule(ABC):
however modules can have a .setup() method to run any setup code
(e.g. logging in to a site, spinning up a browser etc.)
See BaseModule.MODULE_TYPES for the types of modules you can create, noting that
See consts.MODULE_TYPES for the types of modules you can create, noting that
a subclass can be of multiple types. For example, a module that extracts data from
a website and stores it in a database would be both an 'extractor' and a 'database' module.
Each module is a python package, and should have a __manifest__.py file in the
same directory as the module file. The __manifest__.py specifies the module information
like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the
like name, author, version, dependencies etc. See DEFAULT_MANIFEST for the
default manifest structure.
"""
MODULE_TYPES = [
'feeder',
'extractor',
'enricher',
'database',
'storage',
'formatter'
]
_DEFAULT_MANIFEST = {
'name': '', # the display name of the module
'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
'type': [], # the type of the module, can be one or more of BaseModule.MODULE_TYPES
'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional softare
'description': '', # a description of the module
'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
'version': '1.0', # the version of the module
'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
}
MODULE_TYPES = CONF_MODULE_TYPES
# NOTE: these here are declard as class variables, but they are overridden by the instance variables in the __init__ method
config: Mapping[str, Any]
authentication: Mapping[str, Mapping[str, str]]
name: str
module_factory: ModuleFactory
# this is set by the orchestrator prior to archiving
tmp_dir: TemporaryDirectory = None
@@ -63,12 +51,6 @@ class BaseModule(ABC):
def config_setup(self, config: dict):
authentication = config.get('authentication', {})
# extract out concatenated sites
for key, val in copy(authentication).items():
if "," in key:
for site in key.split(","):
authentication[site] = val
del authentication[key]
# this is important. Each instance is given its own deepcopied config, so modules cannot
# change values to affect other modules
@@ -89,16 +71,21 @@ class BaseModule(ABC):
Returns the authentication information for a given site. This is used to authenticate
with a site before extracting data. The site should be the domain of the site, e.g. 'twitter.com'
extract_cookies: bool - whether or not to extract cookies from the given browser and return the
cookie jar (disabling can speed up) processing if you don't actually need the cookies jar
:param site: the domain of the site to get authentication information for
:param extract_cookies: whether or not to extract cookies from the given browser/file and return the cookie jar (disabling can speed up processing if you don't actually need the cookies jar).
Currently, the dict can have keys of the following types:
- username: str - the username to use for login
- password: str - the password to use for login
- api_key: str - the API key to use for login
- api_secret: str - the API secret to use for login
- cookie: str - a cookie string to use for login (specific to this site)
- cookies_jar: YoutubeDLCookieJar | http.cookiejar.MozillaCookieJar - a cookie jar compatible with requests (e.g. `requests.get(cookies=cookie_jar)`)
:returns: authdict dict of login information for the given site
**Global options:**\n
* cookies_from_browser: str - the name of the browser to extract cookies from (e.g. 'chrome', 'firefox' - uses ytdlp under the hood to extract\n
* cookies_file: str - the path to a cookies file to use for login\n
**Currently, the sites dict can have keys of the following types:**\n
* username: str - the username to use for login\n
* password: str - the password to use for login\n
* api_key: str - the API key to use for login\n
* api_secret: str - the API secret to use for login\n
* cookie: str - a cookie string to use for login (specific to this site)\n
"""
# TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
# for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?

View File

@@ -11,7 +11,7 @@ from ruamel.yaml import YAML, CommentedMap, add_representer
from loguru import logger
from copy import deepcopy
from .module import BaseModule
from auto_archiver.core.consts import MODULE_TYPES
from typing import Any, List, Type, Tuple
@@ -21,7 +21,7 @@ EMPTY_CONFIG = _yaml.load("""
# Auto Archiver Configuration
# Steps are the modules that will be run in the order they are defined
steps:""" + "".join([f"\n {module}s: []" for module in BaseModule.MODULE_TYPES]) + \
steps:""" + "".join([f"\n {module}s: []" for module in MODULE_TYPES]) + \
"""
# Global configuration
@@ -129,6 +129,11 @@ def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap:
yaml_subdict[key] = value
continue
if key == 'steps':
for module_type, modules in value.items():
# overwrite the 'steps' from the config file with the ones from the CLI
yaml_subdict[key][module_type] = modules
if is_dict_type(value):
update_dict(value, yaml_subdict[key])
elif is_list_type(value):
@@ -137,7 +142,6 @@ def merge_dicts(dotdict: dict, yaml_dict: CommentedMap) -> CommentedMap:
yaml_subdict[key] = value
update_dict(from_dot_notation(dotdict), yaml_dict)
return yaml_dict
def read_yaml(yaml_filename: str) -> CommentedMap:
@@ -159,6 +163,11 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
config_to_save = deepcopy(config)
auth_dict = config_to_save.get("authentication", {})
if auth_dict and auth_dict.get('load_from_file'):
# remove all other values from the config, don't want to store it in the config file
auth_dict = {"load_from_file": auth_dict["load_from_file"]}
config_to_save.pop('urls', None)
with open(yaml_filename, "w", encoding="utf-8") as outf:
_yaml.dump(config_to_save, outf)

View File

@@ -0,0 +1,23 @@
MODULE_TYPES = [
'feeder',
'extractor',
'enricher',
'database',
'storage',
'formatter'
]
MANIFEST_FILE = "__manifest__.py"
DEFAULT_MANIFEST = {
'name': '', # the display name of the module
'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
'type': [], # the type of the module, can be one or more of MODULE_TYPES
'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional softare
'description': '', # a description of the module
'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
'version': '1.0', # the version of the module
'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
}

View File

@@ -6,7 +6,7 @@ by handling user configuration, validating the steps properties, and implementin
from __future__ import annotations
from dataclasses import dataclass
from typing import List
from typing import List, TYPE_CHECKING
import shutil
import ast
import copy
@@ -16,99 +16,113 @@ import os
from os.path import join
from loguru import logger
import auto_archiver
from .base_module import BaseModule
from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE
_LAZY_LOADED_MODULES = {}
MANIFEST_FILE = "__manifest__.py"
if TYPE_CHECKING:
from .base_module import BaseModule
def setup_paths(paths: list[str]) -> None:
"""
Sets up the paths for the modules to be loaded from
This is necessary for the modules to be imported correctly
"""
for path in paths:
# check path exists, if it doesn't, log a warning
if not os.path.exists(path):
logger.warning(f"Path '{path}' does not exist. Skipping...")
continue
HAS_SETUP_PATHS = False
# see odoo/module/module.py -> initialize_sys_path
if path not in auto_archiver.modules.__path__:
auto_archiver.modules.__path__.append(path)
class ModuleFactory:
# sort based on the length of the path, so that the longest path is last in the list
auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)
def __init__(self):
self._lazy_modules = {}
def get_module(module_name: str, config: dict) -> BaseModule:
"""
Gets and sets up a module using the provided config
This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy)
"""
return get_module_lazy(module_name).load(config)
def setup_paths(self, paths: list[str]) -> None:
"""
Sets up the paths for the modules to be loaded from
This is necessary for the modules to be imported correctly
"""
global HAS_SETUP_PATHS
def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBaseModule:
"""
Lazily loads a module, returning a LazyBaseModule
This has all the information about the module, but does not load the module itself or its dependencies
To load an actual module, call .setup() on a lazy module
"""
if module_name in _LAZY_LOADED_MODULES:
return _LAZY_LOADED_MODULES[module_name]
available = available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
if not available:
raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?")
return available[0]
def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
# search through all valid 'modules' paths. Default is 'modules' in the current directory
# see odoo/modules/module.py -> get_modules
def is_really_module(module_path):
if os.path.isfile(join(module_path, MANIFEST_FILE)):
return True
all_modules = []
for module_folder in auto_archiver.modules.__path__:
# walk through each module in module_folder and check if it has a valid manifest
try:
possible_modules = os.listdir(module_folder)
except FileNotFoundError:
logger.warning(f"Module folder {module_folder} does not exist")
continue
for possible_module in possible_modules:
if limit_to_modules and possible_module not in limit_to_modules:
for path in paths:
# check path exists, if it doesn't, log a warning
if not os.path.exists(path):
logger.warning(f"Path '{path}' does not exist. Skipping...")
continue
possible_module_path = join(module_folder, possible_module)
if not is_really_module(possible_module_path):
# see odoo/module/module.py -> initialize_sys_path
if path not in auto_archiver.modules.__path__:
if HAS_SETUP_PATHS == True:
logger.warning(f"You are attempting to re-initialise the module paths with: '{path}' for a 2nd time. \
This could lead to unexpected behaviour. It is recommended to only use a single modules path. \
If you wish to load modules from different paths then load a 2nd python interpreter (e.g. using multiprocessing).")
auto_archiver.modules.__path__.append(path)
# sort based on the length of the path, so that the longest path is last in the list
auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)
HAS_SETUP_PATHS = True
def get_module(self, module_name: str, config: dict) -> BaseModule:
"""
Gets and sets up a module using the provided config
This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy)
"""
return self.get_module_lazy(module_name).load(config)
def get_module_lazy(self, module_name: str, suppress_warnings: bool = False) -> LazyBaseModule:
"""
Lazily loads a module, returning a LazyBaseModule
This has all the information about the module, but does not load the module itself or its dependencies
To load an actual module, call .setup() on a lazy module
"""
if module_name in self._lazy_modules:
return self._lazy_modules[module_name]
available = self.available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
if not available:
raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?")
return available[0]
def available_modules(self, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
# search through all valid 'modules' paths. Default is 'modules' in the current directory
# see odoo/modules/module.py -> get_modules
def is_really_module(module_path):
if os.path.isfile(join(module_path, MANIFEST_FILE)):
return True
all_modules = []
for module_folder in auto_archiver.modules.__path__:
# walk through each module in module_folder and check if it has a valid manifest
try:
possible_modules = os.listdir(module_folder)
except FileNotFoundError:
logger.warning(f"Module folder {module_folder} does not exist")
continue
if _LAZY_LOADED_MODULES.get(possible_module):
continue
lazy_module = LazyBaseModule(possible_module, possible_module_path)
_LAZY_LOADED_MODULES[possible_module] = lazy_module
for possible_module in possible_modules:
if limit_to_modules and possible_module not in limit_to_modules:
continue
all_modules.append(lazy_module)
if not suppress_warnings:
for module in limit_to_modules:
if not any(module == m.name for m in all_modules):
logger.warning(f"Module '{module}' not found. Are you sure it's installed?")
possible_module_path = join(module_folder, possible_module)
if not is_really_module(possible_module_path):
continue
if self._lazy_modules.get(possible_module):
continue
lazy_module = LazyBaseModule(possible_module, possible_module_path, factory=self)
return all_modules
self._lazy_modules[possible_module] = lazy_module
all_modules.append(lazy_module)
if not suppress_warnings:
for module in limit_to_modules:
if not any(module == m.name for m in all_modules):
logger.warning(f"Module '{module}' not found. Are you sure it's installed?")
return all_modules
@dataclass
class LazyBaseModule:
@@ -123,14 +137,16 @@ class LazyBaseModule:
type: list
description: str
path: str
module_factory: ModuleFactory
_manifest: dict = None
_instance: BaseModule = None
_entry_point: str = None
def __init__(self, module_name, path):
def __init__(self, module_name, path, factory: ModuleFactory):
self.name = module_name
self.path = path
self.module_factory = factory
@property
def entry_point(self):
@@ -161,7 +177,7 @@ class LazyBaseModule:
return self._manifest
# print(f"Loading manifest for module {module_path}")
# load the manifest file
manifest = copy.deepcopy(BaseModule._DEFAULT_MANIFEST)
manifest = copy.deepcopy(DEFAULT_MANIFEST)
with open(join(self.path, MANIFEST_FILE)) as f:
try:
@@ -189,13 +205,14 @@ class LazyBaseModule:
# clear out any empty strings that a user may have erroneously added
continue
if not check(dep):
logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \
Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
exit(1)
def check_python_dep(dep):
# first check if it's a module:
try:
m = get_module_lazy(dep, suppress_warnings=True)
m = self.module_factory.get_module_lazy(dep, suppress_warnings=True)
try:
# we must now load this module and set it up with the config
m.load(config)
@@ -230,19 +247,21 @@ class LazyBaseModule:
__import__(f'{qualname}.{file_name}', fromlist=[self.entry_point])
# finally, get the class instance
instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)()
if not getattr(instance, 'name', None):
instance.name = self.name
if not getattr(instance, 'display_name', None):
instance.display_name = self.display_name
self._instance = instance
# set the name, display name and module factory
instance.name = self.name
instance.display_name = self.display_name
instance.module_factory = self.module_factory
# merge the default config with the user config
default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default'))
config[self.name] = default_config | config.get(self.name, {})
instance.config_setup(config)
instance.setup()
# save the instance for future easy loading
self._instance = instance
return instance
def __repr__(self):

View File

@@ -5,9 +5,10 @@
"""
from __future__ import annotations
from typing import Generator, Union, List, Type
from typing import Generator, Union, List, Type, TYPE_CHECKING
from urllib.parse import urlparse
from ipaddress import ip_address
from copy import copy
import argparse
import os
import sys
@@ -21,12 +22,14 @@ from rich_argparse import RichHelpFormatter
from .metadata import Metadata, Media
from auto_archiver.version import __version__
from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
from .module import available_modules, LazyBaseModule, get_module, setup_paths
from .module import ModuleFactory, LazyBaseModule
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
from .module import BaseModule
from .consts import MODULE_TYPES
from loguru import logger
if TYPE_CHECKING:
from .base_module import BaseModule
from .module import LazyBaseModule
DEFAULT_CONFIG_FILE = "orchestration.yaml"
@@ -43,30 +46,50 @@ class AuthenticationJsonParseAction(JsonParseAction):
def __call__(self, parser, namespace, values, option_string=None):
super().__call__(parser, namespace, values, option_string)
auth_dict = getattr(namespace, self.dest)
if isinstance(auth_dict, str):
# if it's a string
def load_from_file(path):
try:
with open(auth_dict, 'r') as f:
with open(path, 'r') as f:
try:
auth_dict = json.load(f)
except json.JSONDecodeError:
f.seek(0)
# maybe it's yaml, try that
auth_dict = _yaml.load(f)
if auth_dict.get('authentication'):
auth_dict = auth_dict['authentication']
auth_dict['load_from_file'] = path
return auth_dict
except:
pass
return None
if isinstance(auth_dict, dict) and auth_dict.get('from_file'):
auth_dict = load_from_file(auth_dict['from_file'])
elif isinstance(auth_dict, str):
# if it's a string
auth_dict = load_from_file(auth_dict)
if not isinstance(auth_dict, dict):
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
for site, auth in auth_dict.items():
if not isinstance(site, str) or not isinstance(auth, dict):
raise argparse.ArgumentTypeError("Authentication must be a dictionary of site names and their authentication methods")
global_options = ['cookies_from_browser', 'cookies_file', 'load_from_file']
for key, auth in auth_dict.items():
if key in global_options:
continue
if not isinstance(key, str) or not isinstance(auth, dict):
raise argparse.ArgumentTypeError(f"Authentication must be a dictionary of site names and their authentication methods. Valid global configs are {global_options}")
# extract out concatenated sites
for key, val in copy(auth_dict).items():
if "," in key:
for site in key.split(","):
auth_dict[site] = val
del auth_dict[key]
setattr(namespace, self.dest, auth_dict)
class UniqueAppendAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
if not hasattr(namespace, self.dest):
setattr(namespace, self.dest, [])
for value in values:
if value not in getattr(namespace, self.dest):
getattr(namespace, self.dest).append(value)
@@ -74,6 +97,12 @@ class UniqueAppendAction(argparse.Action):
class ArchivingOrchestrator:
# instance variables
module_factory: ModuleFactory
setup_finished: bool
logger_id: int
# instance variables, used for convenience to access modules by step
feeders: List[Type[Feeder]]
extractors: List[Type[Extractor]]
enrichers: List[Type[Enricher]]
@@ -81,6 +110,11 @@ class ArchivingOrchestrator:
storages: List[Type[Storage]]
formatters: List[Type[Formatter]]
def __init__(self):
self.module_factory = ModuleFactory()
self.setup_finished = False
self.logger_id = None
def setup_basic_parser(self):
parser = argparse.ArgumentParser(
prog="auto-archiver",
@@ -104,36 +138,50 @@ class ArchivingOrchestrator:
return parser
def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:
# modules parser to get the overridden 'steps' values
modules_parser = argparse.ArgumentParser(
add_help=False,
)
self.add_modules_args(modules_parser)
cli_modules, unused_args = modules_parser.parse_known_args(unused_args)
for module_type in MODULE_TYPES:
yaml_config['steps'][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config['steps'].get(f"{module_type}s", [])
parser = DefaultValidatingParser(
add_help=False,
)
self.add_additional_args(parser)
# merge command line module args (--feeders, --enrichers etc.) and add them to the config
# check what mode we're in
# if we have a config file, use that to decide which modules to load
# if simple, we'll load just the modules that has requires_setup = False
# if full, we'll load all modules
# TODO: BUG** - basic_config won't have steps in it, since these args aren't added to 'basic_parser'
# but should we add them? Or should we just add them to the 'complete' parser?
if yaml_config != EMPTY_CONFIG:
# only load the modules enabled in config
# TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
enabled_modules = []
# first loads the modules from the config file, then from the command line
for config in [yaml_config['steps'], basic_config.__dict__]:
for module_type in BaseModule.MODULE_TYPES:
enabled_modules.extend(config.get(f"{module_type}s", []))
for module_type in MODULE_TYPES:
enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))
# clear out duplicates, but keep the order
enabled_modules = list(dict.fromkeys(enabled_modules))
avail_modules = available_modules(with_manifest=True, limit_to_modules=enabled_modules, suppress_warnings=True)
self.add_module_args(avail_modules, parser)
avail_modules = self.module_factory.available_modules(limit_to_modules=enabled_modules, suppress_warnings=True)
self.add_individual_module_args(avail_modules, parser)
elif basic_config.mode == 'simple':
simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup]
self.add_module_args(simple_modules, parser)
simple_modules = [module for module in self.module_factory.available_modules() if not module.requires_setup]
self.add_individual_module_args(simple_modules, parser)
# for simple mode, we use the cli_feeder and any modules that don't require setup
yaml_config['steps']['feeders'] = ['cli_feeder']
if not yaml_config['steps']['feeders']:
yaml_config['steps']['feeders'] = ['cli_feeder']
# add them to the config
for module in simple_modules:
@@ -141,30 +189,38 @@ class ArchivingOrchestrator:
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
else:
# load all modules, they're not using the 'simple' mode
self.add_module_args(available_modules(with_manifest=True), parser)
self.add_individual_module_args(self.module_factory.available_modules(), parser)
parser.set_defaults(**to_dot_notation(yaml_config))
# reload the parser with the new arguments, now that we have them
parsed, unknown = parser.parse_known_args(unused_args)
# merge the new config with the old one
self.config = merge_dicts(vars(parsed), yaml_config)
config = merge_dicts(vars(parsed), yaml_config)
# clean out args from the base_parser that we don't want in the config
for key in vars(basic_config):
self.config.pop(key, None)
config.pop(key, None)
# setup the logging
self.setup_logging()
self.setup_logging(config)
if unknown:
logger.warning(f"Ignoring unknown/unused arguments: {unknown}\nPerhaps you don't have this module enabled?")
if (self.config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
if (config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file):
logger.info(f"Storing configuration file to {basic_config.config_file}")
store_yaml(self.config, basic_config.config_file)
store_yaml(config, basic_config.config_file)
return self.config
return config
def add_modules_args(self, parser: argparse.ArgumentParser = None):
if not parser:
parser = self.parser
# Module loading from the command line
for module_type in MODULE_TYPES:
parser.add_argument(f'--{module_type}s', dest=f'{module_type}s', nargs='+', help=f'the {module_type}s to use', default=[], action=UniqueAppendAction)
def add_additional_args(self, parser: argparse.ArgumentParser = None):
if not parser:
@@ -173,30 +229,24 @@ class ArchivingOrchestrator:
# allow passing URLs directly on the command line
parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
parser.add_argument('--feeders', dest='steps.feeders', nargs='+', default=['cli_feeder'], help='the feeders to use', action=UniqueAppendAction)
parser.add_argument('--enrichers', dest='steps.enrichers', nargs='+', help='the enrichers to use', action=UniqueAppendAction)
parser.add_argument('--extractors', dest='steps.extractors', nargs='+', help='the extractors to use', action=UniqueAppendAction)
parser.add_argument('--databases', dest='steps.databases', nargs='+', help='the databases to use', action=UniqueAppendAction)
parser.add_argument('--storages', dest='steps.storages', nargs='+', help='the storages to use', action=UniqueAppendAction)
parser.add_argument('--formatters', dest='steps.formatters', nargs='+', help='the formatter to use', action=UniqueAppendAction)
parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
(token, username etc.) that extractors can use to log into \
a website. If passing this on the command line, use a JSON string. \
You may also pass a path to a valid JSON/YAML file which will be parsed.',
default={},
nargs="?",
action=AuthenticationJsonParseAction)
# logging arguments
parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO', type=str.upper)
parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
def add_individual_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
if not modules:
modules = available_modules(with_manifest=True)
module: LazyBaseModule
modules = self.module_factory.available_modules()
for module in modules:
if not module.configs:
@@ -226,21 +276,29 @@ class ArchivingOrchestrator:
arg.should_store = should_store
def show_help(self, basic_config: dict):
# for the help message, we want to load *all* possible modules and show the help
# for the help message, we want to load manifests from *all* possible modules and show their help/settings
# add configs as arg parser arguments
self.add_modules_args(self.basic_parser)
self.add_additional_args(self.basic_parser)
self.add_module_args(parser=self.basic_parser)
self.add_individual_module_args(parser=self.basic_parser)
self.basic_parser.print_help()
self.basic_parser.exit()
def setup_logging(self):
def setup_logging(self, config):
# setup loguru logging
logger.remove(0) # remove the default logger
logging_config = self.config['logging']
logger.add(sys.stderr, level=logging_config['level'])
if log_file := logging_config['file']:
logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
try:
logger.remove(0) # remove the default logger
except ValueError:
pass
logging_config = config['logging']
# add other logging info
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
self.logger_id = logger.add(sys.stderr, level=logging_config['level'])
if log_file := logging_config['file']:
logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
def install_modules(self, modules_by_type):
"""
@@ -250,7 +308,7 @@ class ArchivingOrchestrator:
"""
invalid_modules = []
for module_type in BaseModule.MODULE_TYPES:
for module_type in MODULE_TYPES:
step_items = []
modules_to_load = modules_by_type[f"{module_type}s"]
@@ -295,7 +353,7 @@ class ArchivingOrchestrator:
if module in invalid_modules:
continue
try:
loaded_module: BaseModule = get_module(module, self.config)
loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
except (KeyboardInterrupt, Exception) as e:
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
if module_type == 'extractor' and loaded_module.name == module:
@@ -317,8 +375,13 @@ class ArchivingOrchestrator:
exit()
return read_yaml(config_file)
def setup_config(self, args: list) -> dict:
"""
Sets up the configuration file, merging the default config with the user's config
def run(self, args: list) -> Generator[Metadata]:
This function should only ever be run once.
"""
self.setup_basic_parser()
@@ -326,24 +389,56 @@ class ArchivingOrchestrator:
basic_config, unused_args = self.basic_parser.parse_known_args(args)
# setup any custom module paths, so they'll show in the help and for arg parsing
setup_paths(basic_config.module_paths)
self.module_factory.setup_paths(basic_config.module_paths)
# if help flag was called, then show the help
if basic_config.help:
self.show_help(basic_config)
# merge command line --feeder etc. args with what's in the yaml config
yaml_config = self.load_config(basic_config.config_file)
self.setup_complete_parser(basic_config, yaml_config, unused_args)
return self.setup_complete_parser(basic_config, yaml_config, unused_args)
def setup(self, args: list):
"""
Function to configure all setup of the orchestrator: setup configs and load modules.
This method should only ever be called once
"""
if self.setup_finished:
logger.warning("The `setup_config()` function should only ever be run once. \
If you need to re-run the setup, please re-instantiate a new instance of the orchestrator. \
For code implementatations, you should call .setup_config() once then you may call .feed() \
multiple times to archive multiple URLs.")
return
self.setup_basic_parser()
self.config = self.setup_config(args)
logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
self.install_modules(self.config['steps'])
# log out the modules that were loaded
for module_type in BaseModule.MODULE_TYPES:
for module_type in MODULE_TYPES:
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
self.setup_finished = True
for result in self.feed():
yield result
def _command_line_run(self, args: list) -> Generator[Metadata]:
"""
This is the main entry point for the orchestrator, when run from the command line.
:param args: list of arguments to pass to the orchestrator - these are the command line args
You should not call this method from code implementations.
This method sets up the configuration, loads the modules, and runs the feed.
If you wish to make code invocations yourself, you should use the 'setup' and 'feed' methods separately.
To test configurations, without loading any modules you can also first call 'setup_configs'
"""
self.setup(args)
return self.feed()
def cleanup(self) -> None:
logger.info("Cleaning up")
@@ -351,7 +446,7 @@ class ArchivingOrchestrator:
e.cleanup()
def feed(self) -> Generator[Metadata]:
url_count = 0
for feeder in self.feeders:
for item in feeder:

View File

@@ -14,7 +14,7 @@ from auto_archiver.utils.misc import random_str
from auto_archiver.core import Media, BaseModule, Metadata
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
from auto_archiver.core.module import get_module
class Storage(BaseModule):
"""
@@ -74,7 +74,7 @@ class Storage(BaseModule):
filename = random_str(24)
elif filename_generator == "static":
# load the hash_enricher module
he = get_module(HashEnricher, self.config)
he = self.module_factory.get_module(HashEnricher, self.config)
hd = he.calculate_hash(media.filename)
filename = hd[:24]
else:

View File

@@ -1 +1 @@
from atlos_db import AtlosDb
from .atlos_db import AtlosDb

View File

@@ -0,0 +1 @@
from .atlos_storage import AtlosStorage

View File

@@ -280,7 +280,7 @@ class GenericExtractor(Extractor):
# set up auth
auth = self.auth_for_site(url, extract_cookies=False)
# order of importance: username/pasword -> api_key -> cookie -> cookie_from_browser -> cookies_file
# order of importance: username/pasword -> api_key -> cookie -> cookies_from_browser -> cookies_file
if auth:
if 'username' in auth and 'password' in auth:
logger.debug(f'Using provided auth username and password for {url}')
@@ -289,7 +289,7 @@ class GenericExtractor(Extractor):
elif 'cookie' in auth:
logger.debug(f'Using provided auth cookie for {url}')
yt_dlp.utils.std_headers['cookie'] = auth['cookie']
elif 'cookie_from_browser' in auth:
elif 'cookies_from_browser' in auth:
logger.debug(f'Using extracted cookies from browser {self.cookies_from_browser} for {url}')
ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser']
elif 'cookies_file' in auth:

View File

@@ -10,7 +10,7 @@
"sheet": {"default": None, "help": "name of the sheet to archive"},
"sheet_id": {
"default": None,
"help": "(alternative to sheet name) the id of the sheet to archive",
"help": "the id of the sheet to archive (alternative to 'sheet' config)",
},
"header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
"service_account": {

View File

@@ -10,7 +10,6 @@ from auto_archiver.version import __version__
from auto_archiver.core import Metadata, Media
from auto_archiver.core import Formatter
from auto_archiver.utils.misc import random_str
from auto_archiver.core.module import get_module
class HtmlFormatter(Formatter):
environment: Environment = None
@@ -50,7 +49,7 @@ class HtmlFormatter(Formatter):
final_media = Media(filename=html_path, _mimetype="text/html")
# get the already instantiated hash_enricher module
he = get_module('hash_enricher', self.config)
he = self.module_factory.get_module('hash_enricher', self.config)
if len(hd := he.calculate_hash(final_media.filename)):
final_media.set("hash", f"{he.algorithm}:{hd}")

View File

@@ -200,7 +200,7 @@
el.innerHTML = decodeCertificate(certificate);
let cyberChefUrl =
`https://gchq.github.io/CyberChef/#recipe=Parse_X.509_certificate('PEM')&input=${btoa(certificate)}`;
`https://gchq.github.io/CyberChef/#recipe=Parse_X.509_certificate('PEM')&input=${btoa(certificate).replace(/=+$/, '')}`;
// create a new anchor with this url and append after the code
let a = document.createElement("a");
a.href = cyberChefUrl;

View File

@@ -77,13 +77,14 @@ class InstagramTbotExtractor(Extractor):
chat, since_id = self._send_url_to_bot(url)
message = self._process_messages(chat, since_id, tmp_dir, result)
# This may be outdated and replaced by the below message, but keeping until confirmed
if "You must enter a URL to a post" in message:
logger.debug(f"invalid link {url=} for {self.name}: {message}")
return False
# # TODO: It currently returns this as a success - is that intentional?
# if "Media not found or unavailable" in message:
# logger.debug(f"invalid link {url=} for {self.name}: {message}")
# return False
if "Media not found or unavailable" in message:
logger.debug(f"No media found for link {url=} for {self.name}: {message}")
return False
if message:
result.set_content(message).set_title(message[:128])

View File

@@ -4,7 +4,6 @@
"requires_setup": True,
"dependencies": {
"python": ["loguru", "selenium"],
"bin": ["geckodriver"]
},
"configs": {
"width": {"default": 1280, "help": "width of the screenshots"},

View File

@@ -11,6 +11,10 @@ from auto_archiver.core import Media, Metadata
class ScreenshotEnricher(Enricher):
def __init__(self, webdriver_factory=None):
super().__init__()
self.webdriver_factory = webdriver_factory or Webdriver
def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url()
@@ -20,7 +24,8 @@ class ScreenshotEnricher(Enricher):
logger.debug(f"Enriching screenshot for {url=}")
auth = self.auth_for_site(url)
with Webdriver(self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
with self.webdriver_factory(
self.width, self.height, self.timeout, facebook_accept_cookies='facebook.com' in url,
http_proxy=self.http_proxy, print_options=self.print_options, auth=auth) as driver:
try:
driver.get(url)
@@ -38,3 +43,4 @@ class ScreenshotEnricher(Enricher):
logger.info("TimeoutException loading page for screenshot")
except Exception as e:
logger.error(f"Got error while loading webdriver for screenshot enricher: {e}")

View File

@@ -7,8 +7,12 @@
"bin": ["ffmpeg"]
},
"configs": {
"thumbnails_per_minute": {"default": 60, "help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"},
"max_thumbnails": {"default": 16, "help": "limit the number of thumbnails to generate per video, 0 means no limit"},
"thumbnails_per_minute": {"default": 60,
"type": "int",
"help": "how many thumbnails to generate per minute of video, can be limited by max_thumbnails"},
"max_thumbnails": {"default": 16,
"type": "int",
"help": "limit the number of thumbnails to generate per video, 0 means no limit"},
},
"description": """
Generates thumbnails for video files to provide visual previews.

View File

@@ -42,7 +42,7 @@ class ThumbnailEnricher(Enricher):
logger.error(f"error getting duration of video {m.filename}: {e}")
return
num_thumbs = int(min(max(1, duration * self.thumbnails_per_minute), self.max_thumbnails))
num_thumbs = int(min(max(1, (duration / 60) * self.thumbnails_per_minute), self.max_thumbnails))
timestamps = [duration / (num_thumbs + 1) * i for i in range(1, num_thumbs + 1)]
thumbnails_media = []

View File

@@ -4,7 +4,6 @@ from loguru import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, Media
from auto_archiver.core.module import get_module
class WhisperEnricher(Enricher):
"""
@@ -15,7 +14,7 @@ class WhisperEnricher(Enricher):
def setup(self) -> None:
self.stores = self.config['steps']['storages']
self.s3 = get_module("s3_storage", self.config)
self.s3 = self.module_factory.get_module("s3_storage", self.config)
if not "s3_storage" in self.stores:
logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
return
@@ -29,8 +28,7 @@ class WhisperEnricher(Enricher):
job_results = {}
for i, m in enumerate(to_enrich.media):
if m.is_video() or m.is_audio():
# TODO: this used to pass all storage items to store now
# Now only passing S3, the rest will get added later in the usual order (?)
# Only storing S3, the rest will get added later in the usual order (?)
m.store(url=url, metadata=to_enrich, storages=[self.s3])
try:
job_id = self.submit_job(m)

View File

@@ -46,7 +46,7 @@ def dump_payload(p):
def update_nested_dict(dictionary, update_dict):
# takes 2 dicts and overwrites the first with the second only on the changed balues
# takes 2 dicts and overwrites the first with the second only on the changed values
for key, value in update_dict.items():
if key in dictionary and isinstance(value, dict) and isinstance(dictionary[key], dict):
update_nested_dict(dictionary[key], value)

View File

@@ -3,12 +3,14 @@ pytest conftest file, for shared fixtures and configuration
"""
import os
import pickle
from datetime import datetime, timezone
from tempfile import TemporaryDirectory
from typing import Dict, Tuple
import hashlib
import pytest
from auto_archiver.core.metadata import Metadata
from auto_archiver.core.module import get_module, _LAZY_LOADED_MODULES
from auto_archiver.core.module import ModuleFactory
# Test names inserted into this list will be run last. This is useful for expensive/costly tests
# that you only want to run if everything else succeeds (e.g. API calls). The order here is important
@@ -20,19 +22,19 @@ TESTS_TO_RUN_LAST = ['test_twitter_api_archiver']
def setup_module(request):
def _setup_module(module_name, config={}):
module_factory = ModuleFactory()
if isinstance(module_name, type):
# get the module name:
# if the class does not have a .name, use the name of the parent folder
module_name = module_name.__module__.rsplit(".",2)[-2]
m = get_module(module_name, {module_name: config})
m = module_factory.get_module(module_name, {module_name: config})
# add the tmp_dir to the module
tmp_dir = TemporaryDirectory()
m.tmp_dir = tmp_dir.name
def cleanup():
_LAZY_LOADED_MODULES.pop(module_name)
tmp_dir.cleanup()
request.addfinalizer(cleanup)
@@ -122,10 +124,36 @@ def pytest_runtest_setup(item):
def unpickle():
"""
Returns a helper function that unpickles a file
** gets the file from the test_files directory: tests/data/test_files **
** gets the file from the test_files directory: tests/data/ **
"""
def _unpickle(path):
test_data_dir = os.path.join(os.path.dirname(__file__), "data", "test_files")
with open(os.path.join(test_data_dir, path), "rb") as f:
with open(os.path.join("tests/data", path), "rb") as f:
return pickle.load(f)
return _unpickle
return _unpickle
@pytest.fixture
def mock_binary_dependencies(mocker):
mock_shutil_which = mocker.patch("shutil.which")
# Mock all binary dependencies as available
mock_shutil_which.return_value = "/usr/bin/fake_binary"
return mock_shutil_which
@pytest.fixture
def sample_datetime():
return datetime(2023, 1, 1, 12, 0, tzinfo=timezone.utc)
@pytest.fixture(autouse=True)
def mock_sleep(mocker):
"""Globally mock time.sleep to avoid delays."""
return mocker.patch("time.sleep")
@pytest.fixture
def metadata():
metadata = Metadata()
metadata.set("_processed_at", "2021-01-01T00:00:00")
metadata.set_url("https://example.com")
return metadata

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,14 @@
{
"type": "service_account",
"project_id": "some-project-id",
"private_key_id": "some-private-key-id",
"private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQDPlcaFJgt7HzoC\n4z0b18PzI2R5c892mLnNwRO8DOKid5INt6z5RAWKDPdnIyHjRBx74qNZl6768pia\nztQNgnud7mKcmvOvGrpUbFx2BdAw8xTyAlRVMalOBhUS9RKvjP5WgSwR5EKwfvzy\nrGioC6ml/segz5EchSaIzgASwB17ir0w6IrymBxUeNelfzCGJpCRhqG5nG+eEjct\nUYU0QIyihRD1Lq0f3Z3D0xfTLLZ630iFBj/Wr0BCJHkl6hdVuGhnyn4S98sMX1Bd\ntaJF/lWi4jdt7SoXD3+FWv66kHPpFfINMpReuB9u0ogfYkORgiRBOMhYBkGGQjUG\nOnBTxEc3AgMBAAECgf9bKiK8DdSz0ALzQbRLhgj2B9485jHI49wjgINOyceZ23uS\nQYXaO+DFLcgLqBkVSGanuHMpU0+qCpeM0v9yXSTIW8RguWMnFd8ID/yLRktxfQa1\n1FAQh+NlF4/gnuUoM8N/FYSy6R5grfaxwU8Qfg66IQXUB52OezSVu5lxNO4G5Rwv\nJ2e/+XYBUv/H26BnQSmjFCzbJkdbtrOeThpaLwLexKcollvoHKGyus0jpWg4C9Ez\n9EJaE+on4nd+cM1Vd+dWaHXoZ9Db9IvxPBqFJE8fynap7RDBeZK678OuCvQntrp4\nrTsE9hW8073Jhl/LbhfbDC0lhFR0JUHygVGE01ECgYEA+g+ddpGGY90yhhM76bTr\nkU6WwislMmfS0WDdLPemNgzLwCtkC2vsQgzg/egxqkVF5dJ9upiFhVgpYxY7ap9U\nSGFemb6T1ASl/1yeNhd0yc4PZFsJ29k+kNgSIlJYm9KDCIMqS1wPoXvFQhbMitOf\n/gLCPugxl67c+qg6nfuODTkCgYEA1IPngESOJnV8oa2WReWrO6+u6xb/OhqdmBzI\n5yq1z3f5gb98XESZR/rCH2vAOmHIJPn3XdZHsznOuxhZwGr1oztiRIurLmBlxQoL\n7tq0jDOUVSD2yeyQwKt5LaBH94P598FiauGxXM4raREWKtcNBGoOX1u1+kEBsoL4\ntf10Z+8CgYEA3QFkB+ECR8y91KW3NAzEjj5JG/8J9wyv1IGpuQ5/hhG1Gni/CSEv\nRAkh6QaIrpZe+ooYuQwIJhwPKBYEGW4MDZSRCYzYFnCtTY5L/j6o55sJG4cipX3R\nwC5XiKIC0mUxjhpvDP+miPBdHNYNnT0AkH1btEF/YzIW+Coq9GnZ2HECgYAOOpax\ne+WYpZ0mphy9qVcBtA2eJ/gGx+ltWeAJuk5aCcpm6Y9GDkHFFAETYX+JaSqhbysk\n2UgLs/8nf8XioEa6GyvFMyTPAh1OSBHseDBGgt2XpZFgi7pVbCW87FJlPCzsbcJN\nLbdWY2d8rWwyihuRBBjaQaW5j8ixTxuf88xreQKBgQCST4Fr8C5CkpakTA+KOost\nLOlziUBm0534mTg7dTcOE1H1+gxtqpXlXcJylpGz1lUXRlHCIutN5iPJcN5cxFES\nsP7wBd7BhficsMKDiWPm9XbP2zXVZu0ldUxA1mONMsS1P4p7i3Dh4uzrRDmSkTUL\njUpppYDumg3oM7wSJ6sTQA==\n-----END PRIVATE KEY-----",
"client_email": "some-email",
"client_id": "some-client-email",
"auth_uri": "https://example.com/o/oauth2/auth",
"token_uri": "https://oauth2.example.com/token",
"auth_provider_x509_cert_url": "https://www.example.com/oauth2/v1/certs",
"client_x509_cert_url": "https://www.example.com/robot/v1/metadata/x509/some-email",
"universe_domain": "example.com"
}

View File

@@ -0,0 +1,59 @@
import pytest
from auto_archiver.core import Metadata
from auto_archiver.modules.api_db import AAApiDb
@pytest.fixture
def api_db(setup_module):
configs: dict = {
"api_endpoint": "https://api.example.com",
"api_token": "test-token",
"public": False,
"author_id": "Someone",
"group_id": "123",
"use_api_cache": True,
"store_results": True,
"tags": "[]",
}
return setup_module(AAApiDb, configs)
def test_fetch_no_cache(api_db, metadata):
# Test fetch
api_db.use_api_cache = False
assert api_db.fetch(metadata) is None
def test_fetch_fail_status(api_db, metadata, mocker):
# Test response fail in fetch method
mock_get = mocker.patch("auto_archiver.modules.api_db.api_db.requests.get")
mock_get.return_value.status_code = 400
mock_get.return_value.json.return_value = {}
mock_error = mocker.patch("loguru.logger.error")
assert api_db.fetch(metadata) is False
mock_error.assert_called_once_with("AA API FAIL (400): {}")
def test_fetch(api_db, metadata, mocker):
# Test successful fetch method
mock_get = mocker.patch("auto_archiver.modules.api_db.api_db.requests.get")
mock_datetime = mocker.patch("auto_archiver.core.metadata.datetime.datetime")
mock_datetime.now.return_value = "2021-01-01T00:00:00"
mock_get.return_value.status_code = 200
mock_get.return_value.json.return_value = [{"result": {}}, {"result":
{'media': [], 'metadata': {'_processed_at': '2021-01-01T00:00:00', 'url': 'https://example.com'},
'status': 'no archiver'}}]
assert api_db.fetch(metadata) == metadata
def test_done_success(api_db, metadata, mocker):
mock_post = mocker.patch("auto_archiver.modules.api_db.api_db.requests.post")
mock_post.return_value.status_code = 201
api_db.done(metadata)
mock_post.assert_called_once()
mock_post.assert_called_once_with("https://api.example.com/interop/submit-archive",
json={'author_id': 'Someone', 'url': 'https://example.com',
'public': False, 'group_id': '123', 'tags': ['[', ']'], 'result': '{"status": "no archiver", "metadata": {"_processed_at": "2021-01-01T00:00:00", "url": "https://example.com"}, "media": []}'},
headers={'Authorization': 'Bearer test-token'})

View File

@@ -0,0 +1,110 @@
import pytest
from datetime import datetime
from auto_archiver.core import Metadata
from auto_archiver.modules.atlos_db import AtlosDb
class FakeAPIResponse:
"""Simulate a response object."""
def __init__(self, data: dict, raise_error: bool = False) -> None:
self._data = data
self.raise_error = raise_error
def raise_for_status(self) -> None:
if self.raise_error:
raise Exception("HTTP error")
@pytest.fixture
def atlos_db(setup_module) -> AtlosDb:
"""Fixture for AtlosDb."""
configs: dict = {
"api_token": "abc123",
"atlos_url": "https://platform.atlos.org",
}
return setup_module("atlos_db", configs)
def test_failed_no_atlos_id(atlos_db, metadata, mocker):
"""Test failed() skips posting when no atlos_id present."""
post_mock = mocker.patch("requests.post")
atlos_db.failed(metadata, "failure reason")
post_mock.assert_not_called()
def test_failed_with_atlos_id(atlos_db, metadata, mocker):
"""Test failed() posts failure when atlos_id is present."""
metadata.set("atlos_id", 42)
fake_resp = FakeAPIResponse({}, raise_error=False)
post_mock = mocker.patch("requests.post", return_value=fake_resp)
atlos_db.failed(metadata, "failure reason")
expected_url = (
f"{atlos_db.atlos_url}/api/v2/source_material/metadata/42/auto_archiver"
)
expected_headers = {"Authorization": f"Bearer {atlos_db.api_token}"}
expected_json = {
"metadata": {"processed": True, "status": "error", "error": "failure reason"}
}
post_mock.assert_called_once_with(
expected_url, headers=expected_headers, json=expected_json
)
def test_failed_http_error(atlos_db, metadata, mocker):
"""Test failed() raises exception on HTTP error."""
metadata.set("atlos_id", 42)
fake_resp = FakeAPIResponse({}, raise_error=True)
mocker.patch("requests.post", return_value=fake_resp)
with pytest.raises(Exception, match="HTTP error"):
atlos_db.failed(metadata, "failure reason")
def test_fetch_returns_false(atlos_db):
"""Test fetch() always returns False."""
item = Metadata()
assert atlos_db.fetch(item) is False
def test_done_no_atlos_id(atlos_db, mocker):
"""Test done() skips posting when no atlos_id present."""
item = Metadata().set_url("http://example.com")
post_mock = mocker.patch("requests.post")
atlos_db.done(item)
post_mock.assert_not_called()
def test_done_with_atlos_id(atlos_db, metadata, mocker):
"""Test done() posts success when atlos_id is present."""
metadata.set("atlos_id", 99)
now = datetime.now()
metadata.set("timestamp", now)
fake_resp = FakeAPIResponse({}, raise_error=False)
post_mock = mocker.patch("requests.post", return_value=fake_resp)
atlos_db.done(metadata)
expected_url = (
f"{atlos_db.atlos_url}/api/v2/source_material/metadata/99/auto_archiver"
)
expected_headers = {"Authorization": f"Bearer {atlos_db.api_token}"}
expected_results = metadata.metadata.copy()
expected_results["timestamp"] = now.isoformat()
expected_json = {
"metadata": {
"processed": True,
"status": "success",
"results": expected_results,
}
}
post_mock.assert_called_once_with(
expected_url, headers=expected_headers, json=expected_json
)
def test_done_http_error(atlos_db, metadata, mocker):
"""Test done() raises exception on HTTP error."""
metadata.set("atlos_id", 123)
fake_resp = FakeAPIResponse({}, raise_error=True)
mocker.patch("requests.post", return_value=fake_resp)
with pytest.raises(Exception, match="HTTP error"):
atlos_db.done(metadata)

View File

@@ -1,6 +1,4 @@
from datetime import datetime, timezone
from unittest.mock import MagicMock, patch
import pytest
from auto_archiver.core import Metadata, Media
@@ -9,8 +7,8 @@ from auto_archiver.modules.gsheet_feeder import GWorksheet
@pytest.fixture
def mock_gworksheet():
mock_gworksheet = MagicMock(spec=GWorksheet)
def mock_gworksheet(mocker):
mock_gworksheet = mocker.MagicMock(spec=GWorksheet)
mock_gworksheet.col_exists.return_value = True
mock_gworksheet.get_cell.return_value = ""
mock_gworksheet.get_row.return_value = {}
@@ -18,14 +16,14 @@ def mock_gworksheet():
@pytest.fixture
def mock_metadata():
metadata: Metadata = MagicMock(spec=Metadata)
def mock_metadata(mocker):
metadata: Metadata = mocker.MagicMock(spec=Metadata)
metadata.get_url.return_value = "http://example.com"
metadata.status = "done"
metadata.get_title.return_value = "Example Title"
metadata.get.return_value = "Example Content"
metadata.get_timestamp.return_value = "2025-01-01T00:00:00"
metadata.get_final_media.return_value = MagicMock(spec=Media)
metadata.get_final_media.return_value = mocker.MagicMock(spec=Media)
metadata.get_all_media.return_value = []
metadata.get_media_by_id.return_value = None
metadata.get_first_image.return_value = None
@@ -47,21 +45,21 @@ def metadata():
@pytest.fixture
def mock_media():
def mock_media(mocker):
"""Fixture for a mock Media object."""
mock_media = MagicMock(spec=Media)
mock_media = mocker.MagicMock(spec=Media)
mock_media.urls = ["http://example.com/media"]
mock_media.get.return_value = "not-calculated"
return mock_media
@pytest.fixture
def gsheets_db(mock_gworksheet, setup_module):
def gsheets_db(mock_gworksheet, setup_module, mocker):
db = setup_module("gsheet_db", {
"allow_worksheets": "set()",
"block_worksheets": "set()",
"use_sheet_names_in_stored_paths": "True",
})
db._retrieve_gsheet = MagicMock(return_value=(mock_gworksheet, 1))
db._retrieve_gsheet = mocker.MagicMock(return_value=(mock_gworksheet, 1))
return db
@@ -109,27 +107,26 @@ def test_aborted(gsheets_db, mock_metadata, mock_gworksheet):
mock_gworksheet.set_cell.assert_called_once_with(1, 'status', '')
def test_done(gsheets_db, metadata, mock_gworksheet, expected_calls):
with patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00'):
gsheets_db.done(metadata)
def test_done(gsheets_db, metadata, mock_gworksheet, expected_calls, mocker):
mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
gsheets_db.done(metadata)
mock_gworksheet.batch_set_cell.assert_called_once_with(expected_calls)
def test_done_cached(gsheets_db, metadata, mock_gworksheet):
with patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00'):
gsheets_db.done(metadata, cached=True)
def test_done_cached(gsheets_db, metadata, mock_gworksheet, mocker):
mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
gsheets_db.done(metadata, cached=True)
# Verify the status message includes "[cached]"
call_args = mock_gworksheet.batch_set_cell.call_args[0][0]
assert any(call[2].startswith("[cached]") for call in call_args)
def test_done_missing_media(gsheets_db, metadata, mock_gworksheet):
def test_done_missing_media(gsheets_db, metadata, mock_gworksheet, mocker):
# clear media from metadata
metadata.media = []
with patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp",
return_value='2025-02-01T00:00:00+00:00'):
gsheets_db.done(metadata)
mocker.patch("auto_archiver.modules.gsheet_db.gsheet_db.get_current_timestamp", return_value='2025-02-01T00:00:00+00:00')
gsheets_db.done(metadata)
# Verify nothing media-related gets updated
call_args = mock_gworksheet.batch_set_cell.call_args[0][0]
media_fields = {'archive', 'screenshot', 'thumbnail', 'wacz', 'replaywebpage'}

View File

@@ -2,7 +2,7 @@ import pytest
from auto_archiver.modules.hash_enricher import HashEnricher
from auto_archiver.core import Metadata, Media
from auto_archiver.core.module import get_module_lazy
from auto_archiver.core.module import ModuleFactory
@pytest.mark.parametrize("algorithm, filename, expected_hash", [
("SHA-256", "tests/data/testfile_1.txt", "1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"),
@@ -22,7 +22,7 @@ def test_default_config_values(setup_module):
def test_config():
# test default config
c = get_module_lazy('hash_enricher').configs
c = ModuleFactory().get_module_lazy('hash_enricher').configs
assert c["algorithm"]["default"] == "SHA-256"
assert c["chunksize"]["default"] == 16000000
assert c["algorithm"]["choices"] == ["SHA-256", "SHA3-512"]

View File

@@ -1,6 +1,5 @@
import datetime
from datetime import datetime, timedelta, timezone
from unittest.mock import MagicMock, patch
import pytest
@@ -9,29 +8,21 @@ from auto_archiver.modules.meta_enricher import MetaEnricher
@pytest.fixture
def mock_metadata():
def mock_metadata(mocker):
"""Creates a mock Metadata object."""
mock: Metadata = MagicMock(spec=Metadata)
mock: Metadata = mocker.MagicMock(spec=Metadata)
mock.get_url.return_value = "https://example.com"
mock.is_empty.return_value = False # Default to not empty
mock.get_all_media.return_value = []
return mock
@pytest.fixture
def mock_media():
def mock_media(mocker):
"""Creates a mock Media object."""
mock: Media = MagicMock(spec=Media)
mock: Media = mocker.MagicMock(spec=Media)
mock.filename = "mock_file.txt"
return mock
@pytest.fixture
def metadata():
m = Metadata()
m.set_url("https://example.com")
m.set_title("Test Title")
m.set_content("Test Content")
return m
@pytest.fixture(autouse=True)
def meta_enricher(setup_module):
@@ -90,14 +81,14 @@ def test_enrich_file_sizes_no_media(meta_enricher, metadata):
assert metadata.get("total_size") == "0.0 bytes"
def test_enrich_archive_duration(meta_enricher, metadata):
def test_enrich_archive_duration(meta_enricher, metadata, mocker):
# Set fixed "processed at" time in the past
processed_at = datetime.now(timezone.utc) - timedelta(minutes=10, seconds=30)
metadata.set("_processed_at", processed_at)
# patch datetime
with patch("datetime.datetime") as mock_datetime:
mock_now = datetime.now(timezone.utc)
mock_datetime.now.return_value = mock_now
meta_enricher.enrich_archive_duration(metadata)
mock_datetime = mocker.patch("datetime.datetime")
mock_now = datetime.now(timezone.utc)
mock_datetime.now.return_value = mock_now
meta_enricher.enrich_archive_duration(metadata)
assert metadata.get("archive_duration_seconds") == 630

View File

@@ -0,0 +1,88 @@
import pytest
from auto_archiver.core import Media
@pytest.fixture
def mock_media(mocker):
"""Creates a mock Media object."""
mock: Media = mocker.MagicMock(spec=Media)
mock.filename = "mock_file.txt"
return mock
@pytest.fixture
def enricher(setup_module, mock_binary_dependencies):
return setup_module("metadata_enricher", {})
@pytest.mark.parametrize(
"output,expected",
[
("Key1: Value1\nKey2: Value2", {"Key1": "Value1", "Key2": "Value2"}),
("InvalidLine", {}),
("", {}),
],
)
def test_get_metadata(enricher, output, expected, mocker):
mock_run = mocker.patch("subprocess.run")
mock_run.return_value.stdout = output
mock_run.return_value.stderr = ""
mock_run.return_value.returncode = 0
result = enricher.get_metadata("test.jpg")
assert result == expected
mock_run.assert_called_once_with(
["exiftool", "test.jpg"], capture_output=True, text=True
)
def test_get_metadata_exiftool_not_found(enricher, mocker):
mock_run = mocker.patch("subprocess.run")
mock_run.side_effect = FileNotFoundError
result = enricher.get_metadata("test.jpg")
assert result == {}
def test_enrich_sets_metadata(enricher, mocker):
media1 = mocker.Mock(filename="img1.jpg")
media2 = mocker.Mock(filename="img2.jpg")
metadata = mocker.Mock()
metadata.media = [media1, media2]
enricher.get_metadata = lambda f: {"key": "value"} if f == "img1.jpg" else {}
enricher.enrich(metadata)
media1.set.assert_called_once_with("metadata", {"key": "value"})
media2.set.assert_not_called()
assert metadata.media == [media1, media2]
def test_enrich_empty_media(enricher, mocker):
metadata = mocker.Mock()
metadata.media = []
# Should not raise errors
enricher.enrich(metadata)
def test_get_metadata_error_handling(enricher, mocker):
mocker.patch("subprocess.run", side_effect=Exception("Test error"))
mock_log = mocker.patch("loguru.logger.error")
result = enricher.get_metadata("test.jpg")
assert result == {}
assert "Error occurred: " in mock_log.call_args[0][0]
def test_metadata_pickle(enricher, unpickle, mocker):
mock_run = mocker.patch("subprocess.run")
# Uses pickled values
mock_run.return_value = unpickle("metadata_enricher_exif.pickle")
metadata = unpickle("metadata_enricher_ytshort_input.pickle")
expected = unpickle("metadata_enricher_ytshort_expected.pickle")
enricher.enrich(metadata)
expected_media = expected.media
actual_media = metadata.media
assert len(expected_media) == len(actual_media)
assert actual_media[0].properties.get("metadata") == expected_media[0].properties.get("metadata")

View File

@@ -0,0 +1,78 @@
import pytest
from PIL import UnidentifiedImageError
from auto_archiver.core import Metadata, Media
from auto_archiver.modules.pdq_hash_enricher import PdqHashEnricher
@pytest.fixture
def enricher(setup_module):
return setup_module("pdq_hash_enricher", {})
@pytest.fixture
def metadata_with_images():
m = Metadata()
m.set_url("https://example.com")
m.add_media(Media(filename="image1.jpg", key="image1"))
m.add_media(Media(filename="image2.jpg", key="image2"))
return m
def test_successful_enrich(metadata_with_images, mocker):
with (
mocker.patch("pdqhash.compute", return_value=([1, 0, 1, 0] * 64, 100)),
mocker.patch("PIL.Image.open"),
mocker.patch.object(Media, "is_image", return_value=True) as mock_is_image,
):
enricher = PdqHashEnricher()
enricher.enrich(metadata_with_images)
# Ensure the hash is set for image media
for media in metadata_with_images.media:
assert media.get("pdq_hash") is not None
def test_enrich_skip_non_image(metadata_with_images, mocker):
mocker.patch.object(Media, "is_image", return_value=False)
mock_pdq = mocker.patch("pdqhash.compute")
enricher = PdqHashEnricher()
enricher.enrich(metadata_with_images)
mock_pdq.assert_not_called()
def test_enrich_handles_corrupted_image(metadata_with_images, mocker):
mocker.patch("PIL.Image.open", side_effect=UnidentifiedImageError("Corrupted image"))
mock_pdq = mocker.patch("pdqhash.compute")
mock_logger = mocker.patch("loguru.logger.error")
enricher = PdqHashEnricher()
enricher.enrich(metadata_with_images)
assert mock_logger.call_count == len(metadata_with_images.media)
mock_pdq.assert_not_called()
@pytest.mark.parametrize(
"media_id, should_have_hash",
[
("screenshot", False),
("warc-file-123", False),
("regular-image", True),
]
)
def test_enrich_excludes_by_filetype(media_id, should_have_hash, mocker):
metadata = Metadata()
metadata.set_url("https://example.com")
metadata.add_media(Media(filename="image.jpg").set("id", media_id))
mocker.patch("pdqhash.compute", return_value=([1, 0, 1, 0] * 64, 100))
mocker.patch("PIL.Image.open")
mocker.patch.object(Media, "is_image", return_value=True)
enricher = PdqHashEnricher()
enricher.enrich(metadata)
media_item = metadata.media[0]
assert (media_item.get("pdq_hash") is not None) == should_have_hash

View File

@@ -0,0 +1,195 @@
import base64
import pytest
from selenium.common.exceptions import TimeoutException
from auto_archiver.core import Metadata, Media
from auto_archiver.modules.screenshot_enricher import ScreenshotEnricher
@pytest.fixture
def mock_selenium_env(mocker):
"""Patches Selenium calls and driver checks in one place."""
# Patch external dependencies
mock_which = mocker.patch("shutil.which")
mock_driver_class = mocker.patch("auto_archiver.utils.webdriver.CookieSettingDriver")
mock_binary_paths = mocker.patch("selenium.webdriver.common.selenium_manager.SeleniumManager.binary_paths")
mock_is_file = mocker.patch("pathlib.Path.is_file", return_value=True)
mock_popen = mocker.patch("subprocess.Popen")
mock_is_connectable = mocker.patch("selenium.webdriver.common.service.Service.is_connectable", return_value=True)
mock_firefox_options = mocker.patch("selenium.webdriver.FirefoxOptions")
# Define side effect for `shutil.which`
def mock_which_side_effect(dep):
return "/mock/geckodriver" if dep == "geckodriver" else None
mock_which.side_effect = mock_which_side_effect
# Mock binary paths
mock_binary_paths.return_value = {
"driver_path": "/mock/driver",
"browser_path": "/mock/browser",
}
# Mock `subprocess.Popen`
mock_proc = mocker.MagicMock()
mock_proc.poll.return_value = None
mock_popen.return_value = mock_proc
# Mock `CookieSettingDriver`
mock_driver = mocker.MagicMock()
mock_driver_class.return_value = mock_driver
# Mock `FirefoxOptions`
mock_options_instance = mocker.MagicMock()
mock_firefox_options.return_value = mock_options_instance
yield mock_driver, mock_driver_class, mock_options_instance
@pytest.fixture
def common_patches(tmp_path, mocker):
"""Patches common utilities used across multiple tests."""
mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=False)
mocker.patch("os.path.join", return_value=str(tmp_path / "test.png"))
mocker.patch("time.sleep")
yield
@pytest.fixture
def screenshot_enricher(setup_module, mock_binary_dependencies) -> ScreenshotEnricher:
configs: dict = {
"width": 1280,
"height": 720,
"timeout": 60,
"sleep_before_screenshot": 4,
"http_proxy": "",
"save_to_pdf": "False",
"print_options": {},
}
return setup_module("screenshot_enricher", configs)
@pytest.fixture
def metadata_with_video():
m = Metadata()
m.set_url("https://example.com")
m.add_media(Media(filename="video.mp4").set("id", "video1"))
return m
def test_enrich_adds_screenshot(
screenshot_enricher,
metadata_with_video,
mock_selenium_env,
common_patches,
tmp_path,
):
mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env
screenshot_enricher.enrich(metadata_with_video)
mock_driver_class.assert_called_once_with(
cookies=None,
cookiejar=None,
facebook_accept_cookies=False,
options=mock_options_instance,
)
# Verify the actual calls on the returned mock_driver
mock_driver.get.assert_called_once_with("https://example.com")
mock_driver.save_screenshot.assert_called_once_with(str(tmp_path / "test.png"))
# Check that the media was added (2 = original video + screenshot)
assert len(metadata_with_video.media) == 2
assert metadata_with_video.media[1].properties.get("id") == "screenshot"
@pytest.mark.parametrize(
"url,is_auth",
[
("https://example.com", False),
("https://private.com", True),
],
)
def test_enrich_auth_wall(
screenshot_enricher,
metadata_with_video,
mock_selenium_env,
common_patches,
url,
is_auth,
mocker
):
# Testing with and without is_auth_wall
mock_driver, mock_driver_class, _ = mock_selenium_env
mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=is_auth)
metadata_with_video.set_url(url)
screenshot_enricher.enrich(metadata_with_video)
if is_auth:
mock_driver.get.assert_not_called()
assert len(metadata_with_video.media) == 1
assert metadata_with_video.media[0].properties.get("id") == "video1"
else:
mock_driver.get.assert_called_once_with(url)
assert len(metadata_with_video.media) == 2
assert metadata_with_video.media[1].properties.get("id") == "screenshot"
def test_handle_timeout_exception(
screenshot_enricher, metadata_with_video, mock_selenium_env, mocker
):
mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env
mock_driver.get.side_effect = TimeoutException
mock_log = mocker.patch("loguru.logger.info")
screenshot_enricher.enrich(metadata_with_video)
mock_log.assert_called_once_with("TimeoutException loading page for screenshot")
assert len(metadata_with_video.media) == 1
def test_handle_general_exception(
screenshot_enricher, metadata_with_video, mock_selenium_env, mocker
):
"""Test proper handling of unexpected general exceptions"""
mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env
# Simulate a generic exception when save_screenshot is called
mock_driver.get.return_value = None
mock_driver.save_screenshot.side_effect = Exception("Unexpected Error")
mock_log = mocker.patch("loguru.logger.error")
screenshot_enricher.enrich(metadata_with_video)
# Verify that the exception was logged with the log
mock_log.assert_called_once_with(
"Got error while loading webdriver for screenshot enricher: Unexpected Error"
)
# And no new media was added due to the error
assert len(metadata_with_video.media) == 1
def test_pdf_creation(mocker, screenshot_enricher, metadata_with_video, mock_selenium_env):
"""Test PDF creation when save_to_pdf is enabled"""
mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env
# Override the save_to_pdf option
screenshot_enricher.save_to_pdf = True
# Mock the print_page method to return base64-encoded content
mock_driver.print_page.return_value = base64.b64encode(b"fake_pdf_content").decode("utf-8")
# Patch functions with mocker
mock_os_path_join = mocker.patch("os.path.join", side_effect=lambda *args: f"{args[-1]}")
mock_random_str = mocker.patch(
"auto_archiver.modules.screenshot_enricher.screenshot_enricher.random_str",
return_value="fixed123",
)
mock_open = mocker.patch("builtins.open", new_callable=mocker.mock_open)
mock_log_error = mocker.patch("loguru.logger.error")
screenshot_enricher.enrich(metadata_with_video)
# Verify screenshot and PDF creation
mock_driver.save_screenshot.assert_called_once()
mock_driver.print_page.assert_called_once_with(mock_driver.print_options)
# Check that PDF file was opened and written
mock_open.assert_any_call("pdf_fixed123.pdf", "wb")
# Ensure both screenshot and PDF were added as media
assert len(metadata_with_video.media) == 3
assert metadata_with_video.media[1].properties.get("id") == "screenshot"
assert metadata_with_video.media[2].properties.get("id") == "pdf"
@pytest.fixture(autouse=True)
def cleanup_files(tmp_path):
yield
for file in tmp_path.iterdir():
file.unlink()

View File

@@ -0,0 +1,54 @@
import ssl
import pytest
from auto_archiver.core import Metadata, Media
@pytest.fixture
def enricher(setup_module):
configs: dict = {
"skip_when_nothing_archived": "True",
}
return setup_module("ssl_enricher", configs)
@pytest.fixture
def metadata():
m = Metadata()
m.set_url("https://example.com")
m.add_media(Media("tests/data/testfile_1.txt"))
m.add_media(Media("tests/data/testfile_2.txt"))
return m
def test_http_raises(metadata, enricher):
metadata.set_url("http://example.com")
with pytest.raises(AssertionError) as exc_info:
enricher.enrich(metadata)
assert "Invalid URL scheme" in str(exc_info.value)
def test_empty_metadata(metadata, enricher):
metadata.media = []
assert enricher.enrich(metadata) is None
def test_ssl_enrich(metadata, enricher, mocker):
mocker.patch("ssl.get_server_certificate", return_value="TEST_CERT")
mock_file = mocker.patch("builtins.open", mocker.mock_open())
media_len_before = len(metadata.media)
enricher.enrich(metadata)
ssl.get_server_certificate.assert_called_once_with(("example.com", 443))
mock_file.assert_called_once_with(f"{enricher.tmp_dir}/example-com.pem", "w")
mock_file().write.assert_called_once_with("TEST_CERT")
assert len(metadata.media) == media_len_before + 1
# Ensure the certificate is added to metadata
assert any(media.filename.endswith("example-com.pem") for media in metadata.media)
def test_ssl_error_handling(enricher, metadata, mocker):
mocker.patch("ssl.get_server_certificate", side_effect=ssl.SSLError("SSL error"))
with pytest.raises(ssl.SSLError, match="SSL error"):
enricher.enrich(metadata)

View File

@@ -0,0 +1,148 @@
import pytest
from auto_archiver.core import Metadata, Media
from auto_archiver.modules.thumbnail_enricher import ThumbnailEnricher
@pytest.fixture
def thumbnail_enricher(setup_module, mock_binary_dependencies) -> ThumbnailEnricher:
config: dict = {
"thumbnails_per_minute": 60,
"max_thumbnails": 4,
}
return setup_module("thumbnail_enricher", config)
@pytest.fixture
def metadata_with_video():
m = Metadata()
m.set_url("https://example.com")
m.add_media(Media(filename="video.mp4").set("id", "video1"))
return m
@pytest.fixture
def mock_ffmpeg_environment(mocker):
# Mocking all the ffmpeg calls in one place
mock_ffmpeg_input = mocker.patch("ffmpeg.input")
mock_makedirs = mocker.patch("os.makedirs")
mocker.patch.object(Media, "is_video", return_value=True),
mock_probe = mocker.patch(
"ffmpeg.probe",
return_value={
"streams": [
{"codec_type": "video", "duration": "120"}
] # Default 2-minute duration, but can override in tests
},
)
mock_output = mocker.MagicMock()
mock_ffmpeg_input.return_value.filter.return_value.output.return_value = (
mock_output
)
return {
"mock_ffmpeg_input": mock_ffmpeg_input,
"mock_makedirs": mock_makedirs,
"mock_output": mock_output,
"mock_probe": mock_probe,
}
@pytest.mark.parametrize("thumbnails_per_minute, max_thumbnails, expected_count", [
(10, 5, 5), # Capped at max_thumbnails
(1, 10, 2), # Less than max_thumbnails
(60, 7, 7), # Matches exactly
])
def test_enrich_thumbnail_limits(
thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment,
thumbnails_per_minute, max_thumbnails, expected_count
):
thumbnail_enricher.thumbnails_per_minute = thumbnails_per_minute
thumbnail_enricher.max_thumbnails = max_thumbnails
thumbnail_enricher.enrich(metadata_with_video)
assert mock_ffmpeg_environment["mock_output"].run.call_count == expected_count
thumbnails = metadata_with_video.media[0].get("thumbnails")
assert len(thumbnails) == expected_count
def test_enrich_handles_probe_failure(thumbnail_enricher, metadata_with_video, mocker):
mocker.patch("ffmpeg.probe", side_effect=Exception("Probe error"))
mocker.patch("os.makedirs")
mock_logger = mocker.patch("loguru.logger.error")
mocker.patch.object(Media, "is_video", return_value=True)
thumbnail_enricher.enrich(metadata_with_video)
# Ensure error was logged
mock_logger.assert_called_with(
f"error getting duration of video video.mp4: Probe error"
)
# Ensure no thumbnails were created
thumbnails = metadata_with_video.media[0].get("thumbnails")
assert thumbnails is None
def test_enrich_skips_non_video_files(thumbnail_enricher, metadata_with_video, mocker):
mocker.patch.object(Media, "is_video", return_value=False)
mock_ffmpeg = mocker.patch("ffmpeg.input")
thumbnail_enricher.enrich(metadata_with_video)
mock_ffmpeg.assert_not_called()
@pytest.mark.parametrize("thumbnails_per_minute,max_thumbnails,expected_count", [
(60, 5, 5), # caught by max
(60, 20, 10), # caught by t/min
(0, 20, 1), # test min caught (1)
(11, 20, 1), # test min caught (1)
(12, 20, 2), # test caught by t/min
])
def test_enrich_handles_short_video(
thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment, thumbnails_per_minute, max_thumbnails, expected_count, mocker
):
# override mock duration
fake_duration = 10
mocker.patch(
"ffmpeg.probe",
return_value={ "streams": [{"codec_type": "video", "duration": str(fake_duration)}]},
)
thumbnail_enricher.thumbnails_per_minute = thumbnails_per_minute
thumbnail_enricher.max_thumbnails = max_thumbnails
thumbnail_enricher.enrich(metadata_with_video)
assert mock_ffmpeg_environment["mock_output"].run.call_count == expected_count
thumbnails = metadata_with_video.media[0].get("thumbnails")
assert len(thumbnails) == expected_count
def test_uses_existing_duration(
thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment
):
metadata_with_video.media[0].set("duration", 60)
thumbnail_enricher.enrich(metadata_with_video)
mock_ffmpeg_environment["mock_probe"].assert_not_called()
assert mock_ffmpeg_environment["mock_output"].run.call_count == 4
def test_enrich_metadata_structure(thumbnail_enricher, metadata_with_video, mock_ffmpeg_environment, mocker):
fake_duration = 120
mocker.patch("ffmpeg.probe", return_value={'streams': [{'codec_type': 'video', 'duration': str(fake_duration)}]})
thumbnail_enricher.thumbnails_per_minute = 2
thumbnail_enricher.max_thumbnails = 4
thumbnail_enricher.enrich(metadata_with_video)
media_item = metadata_with_video.media[0]
thumbnails = media_item.get("thumbnails")
# Assert normal metadata
assert media_item.get("id") == "video1"
assert media_item.get("duration") == fake_duration
# Evenly spaced timestamps
expected_timestamps = ["24.000s", "48.000s", "72.000s", "96.000s"]
assert thumbnails is not None
assert len(thumbnails) == 4
for index, thumbnail in enumerate(thumbnails):
assert thumbnail.filename is not None
assert thumbnail.properties.get("id") == f"thumbnail_{index}"
assert thumbnail.properties.get("timestamp") == expected_timestamps[index]

View File

@@ -0,0 +1,112 @@
import os
from zipfile import ZipFile
import pytest
from auto_archiver.core import Metadata, Media
@pytest.fixture
def wacz_enricher(setup_module, mock_binary_dependencies):
configs: dict = {
"profile": None,
"docker_commands": None,
"timeout": 120,
"extract_media": False,
"extract_screenshot": True,
"socks_proxy_host": None,
"socks_proxy_port": None,
"proxy_server": None,
}
wacz = setup_module("wacz_enricher", configs)
return wacz
def test_setup_without_docker(wacz_enricher, mocker):
mocker.patch.dict(os.environ, {"RUNNING_IN_DOCKER": "1"}, clear=True)
wacz_enricher.setup()
assert not wacz_enricher.docker_in_docker
def test_setup_with_docker(wacz_enricher, mocker):
mocker.patch.dict(os.environ, {"WACZ_ENABLE_DOCKER": "1"}, clear=True)
wacz_enricher.setup()
assert wacz_enricher.use_docker
def test_already_ran(wacz_enricher, metadata, mocker):
metadata.add_media(Media("test.wacz"), id="browsertrix")
mock_log = mocker.patch("loguru.logger.info")
assert wacz_enricher.enrich(metadata) is True
assert "WACZ enricher had already been executed" in mock_log.call_args[0][0]
def test_basic_call_execution(wacz_enricher, mocker):
mock_run = mocker.patch("subprocess.run")
mock_run.return_value = mocker.Mock(returncode=0)
metadata = Metadata().set_url("https://example.com")
wacz_enricher.enrich(metadata)
assert mock_run.called
# Checks that the url is passed to the cmd
assert "--url https://example.com" in " ".join(mock_run.call_args[0][0])
def test_download_success(wacz_enricher, mocker) -> None:
"""Test download returns metadata on successful enrichment."""
basic_metadata = Metadata().set_url("https://example.com")
mocker.patch.object(wacz_enricher, "enrich", return_value=True)
result = wacz_enricher.download(basic_metadata)
assert result is not None
assert isinstance(result, Metadata)
assert result.status == "wacz: success"
def test_enrich_already_executed(wacz_enricher, mocker) -> None:
"""Test enrich if already executed."""
mock_log = mocker.patch("loguru.logger.info")
metadata = Metadata().set_url("https://example.com")
media = Media(filename="some_file.wacz")
metadata.add_media(media, id="browsertrix")
result = wacz_enricher.enrich(metadata)
assert result is True
assert "WACZ enricher had already been executed:" in mock_log.call_args[0][0]
def test_enrich_subprocess_exception(wacz_enricher, mocker, tmp_path) -> None:
"""Test enrich returns False when subprocess fails."""
wacz_enricher.tmp_dir = str(tmp_path)
wacz_enricher.extract_media = False
wacz_enricher.extract_screenshot = True
mocker.patch("auto_archiver.utils.misc.random_str", return_value="TESTCOL")
mocker.patch("subprocess.run", side_effect=Exception("fail"))
basic_metadata = Metadata().set_url("https://example.com")
result = wacz_enricher.enrich(basic_metadata)
assert result is False
def test_extract_media(wacz_enricher, metadata, tmp_path, mocker) -> None:
"""Test extract_media_from_wacz extracts screenshot media."""
wacz_enricher.tmp_dir = str(tmp_path)
# Create a *real* zip file so ZipFile won't fail.
wacz_file = tmp_path / "dummy.wacz"
with ZipFile(wacz_file, "w") as zf:
zf.writestr("dummy.txt", "test content")
mocker.patch("os.listdir", return_value=[])
warc_data = (
b"WARC/1.0\r\n"
b"WARC-Type: resource\r\n"
b"Content-Type: image/png\r\n"
b"WARC-Target-URI: http://example.com/image.png\r\n"
b"Content-Length: 12\r\n"
b"\r\n"
b"image-bytes"
b"\r\n\r\nWARC/1.0\r\n\r\n"
)
mock_file = mocker.mock_open(read_data=warc_data)
mocker.patch("builtins.open", mock_file)
metadata.add_media(Media("something.wacz"), "browsertrix")
wacz_enricher.extract_media_from_wacz(metadata, str(wacz_file))
assert len(metadata.media) == 2
assert metadata.media[1].properties.get("id") == "browsertrix-screenshot"

View File

@@ -0,0 +1,168 @@
import json
import requests
import pytest
from auto_archiver.modules.wayback_extractor_enricher import WaybackExtractorEnricher
from auto_archiver.core import Metadata
@pytest.fixture
def mock_is_auth_wall(mocker):
"""Fixture to mock is_auth_wall behavior."""
def _mock_is_auth_wall(return_value: bool):
return mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=return_value)
return _mock_is_auth_wall
@pytest.fixture
def mock_post_success(mocker):
"""Fixture to mock POST requests with a successful response."""
def _mock_post(json_data: dict = None, status_code: int = 200):
json_data = json_data or {"job_id": "job123"}
resp = mocker.Mock(status_code=status_code)
resp.json.return_value = json_data
return mocker.patch("requests.post", return_value=resp)
return _mock_post
@pytest.fixture
def mock_get_success(mocker):
"""Fixture to mock GET requests returning a completed archive status."""
def _mock_get(json_data: dict = None, status_code: int = 200):
json_data = json_data or {
"status": "success",
"timestamp": "20250101010101",
"original_url": "https://example.com"
}
resp = mocker.Mock(status_code=status_code)
resp.json.return_value = json_data
return mocker.patch("requests.get", return_value=resp)
return _mock_get
@pytest.fixture
def wayback_extractor_enricher(setup_module) -> WaybackExtractorEnricher:
configs: dict = {
"timeout": 5,
"if_not_archived_within": None,
"key": "somekey",
"secret": "secret",
"proxy_http": None,
"proxy_https": None,
}
return setup_module("wayback_extractor_enricher", configs)
def test_download_success(
wayback_extractor_enricher,
mock_is_auth_wall,
mock_post_success,
mock_get_success
):
mock_is_auth_wall(False)
mock_post_success()
mock_get_success()
# Basic metadata to allow merge
metadata = Metadata().set_url("https://example.com")
result = wayback_extractor_enricher.download(metadata)
assert result.get("wayback") == "https://web.archive.org/web/20250101010101/https://example.com"
def test_enrich_auth_wall(wayback_extractor_enricher, metadata, mock_is_auth_wall):
mock_is_auth_wall(True)
result = wayback_extractor_enricher.enrich(metadata)
assert result is None
def test_enrich_already_enriched(wayback_extractor_enricher, metadata):
metadata.set("wayback", "existing")
result = wayback_extractor_enricher.enrich(metadata)
assert result is True
def test_enrich_post_failure(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mock_post_success
):
mock_is_auth_wall(False)
mock_post_success(json_data={"error": "server error"}, status_code=500)
result = wayback_extractor_enricher.enrich(metadata)
assert result is False
assert "Internet archive failed with status of 500" in metadata.get("wayback")
def test_enrich_post_json_decode_error(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mocker
):
mock_is_auth_wall(False)
resp = mocker.Mock(status_code=200)
resp.json.side_effect = json.decoder.JSONDecodeError("msg", "doc", 0)
resp.text = "invalid json"
mocker.patch("requests.post", return_value=resp)
assert wayback_extractor_enricher.enrich(metadata) is False
def test_enrich_no_job_id(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mock_post_success
):
mock_is_auth_wall(False)
mock_post_success(json_data={})
assert wayback_extractor_enricher.enrich(metadata) is False
def test_enrich_get_success(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mock_post_success,
mock_get_success
):
mock_is_auth_wall(False)
mock_post_success()
mock_get_success()
assert wayback_extractor_enricher.enrich(metadata) is True
assert metadata.get("wayback") == "https://web.archive.org/web/20250101010101/https://example.com"
assert metadata.get("check wayback") == "https://web.archive.org/web/*/https://example.com"
def test_enrich_get_failure(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mock_post_success,
mock_get_success
):
mock_is_auth_wall(False)
mock_post_success()
mock_get_success(json_data={"status": "failed"}, status_code=400)
assert wayback_extractor_enricher.enrich(metadata) is False
def test_enrich_get_request_exception(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mock_post_success,
mocker
):
mock_is_auth_wall(False)
mock_post_success()
mocker.patch("requests.get", side_effect=requests.exceptions.RequestException("error"))
mocker.patch("time.sleep", return_value=None)
# check it still enriches the job_id information
assert wayback_extractor_enricher.enrich(metadata) is True
assert metadata.get("wayback").get("job_id") == "job123"
def test_enrich_get_json_decode_error(
wayback_extractor_enricher,
metadata,
mock_is_auth_wall,
mock_post_success,
mocker
):
mock_is_auth_wall(False)
mock_post_success()
resp = mocker.Mock()
resp.json.side_effect = json.decoder.JSONDecodeError("msg", "doc", 0)
resp.text = "invalid json"
mocker.patch("requests.get", return_value=resp)
mocker.patch("time.sleep", return_value=None)
# check it still enriches the job_id information
assert wayback_extractor_enricher.enrich(metadata) is True
assert metadata.get("wayback").get("job_id") == "job123"

View File

@@ -0,0 +1,133 @@
import pytest
from auto_archiver.core import Metadata, Media
from auto_archiver.modules.s3_storage import S3Storage
from auto_archiver.modules.whisper_enricher import WhisperEnricher
TEST_S3_URL = "http://cdn.example.com/test.mp4"
@pytest.fixture
def enricher(mocker):
"""Fixture with mocked S3 and API dependencies"""
config = {
"api_endpoint": "http://testapi",
"api_key": "whisper-key",
"include_srt": False,
"timeout": 5,
"action": "translate",
"steps": {"storages": ["s3_storage"]}
}
mock_s3 = mocker.MagicMock(spec=S3Storage)
mock_s3.get_cdn_url.return_value = TEST_S3_URL
instance = WhisperEnricher()
instance.name = "whisper_enricher"
instance.display_name = "Whisper Enricher"
instance.config_setup({instance.name: config})
# bypassing the setup method and mocking S3 setup
instance.stores = config['steps']['storages']
instance.s3 = mock_s3
yield instance, mock_s3
@pytest.fixture
def metadata():
metadata = Metadata()
metadata.set_url("http://test.url")
metadata.set_title("test title")
return metadata
@pytest.fixture
def mock_requests(mocker):
mock_requests = mocker.patch("auto_archiver.modules.whisper_enricher.whisper_enricher.requests")
mock_response = mocker.MagicMock()
mock_response.status_code = 201
mock_response.json.return_value = {"id": "job123"}
mock_requests.post.return_value = mock_response
yield mock_requests
def test_successful_job_submission(enricher, metadata, mock_requests, mocker):
"""Test successful media processing with S3 configured"""
whisper, mock_s3 = enricher
# Configure mock S3 URL to match test expectation
mock_s3.get_cdn_url.return_value = TEST_S3_URL
# Create test media with matching CDN URL
m = Media("test.mp4")
m.mimetype = "video/mp4"
m.add_url(mock_s3.get_cdn_url.return_value)
metadata.media = [m]
# Mock the complete API interaction chain
mock_status_response = mocker.MagicMock()
mock_status_response.status_code = 200
mock_status_response.json.return_value = {
"status": "success",
"meta": {}
}
mock_artifacts_response = mocker.MagicMock()
mock_artifacts_response.status_code = 200
mock_artifacts_response.json.return_value = [{
"data": [{"start": 0, "end": 5, "text": "test transcript"}]
}]
# Set up mock response sequence
mock_requests.get.side_effect = [
mock_status_response, # First call: status check
mock_artifacts_response # Second call: artifacts check
]
# Run enrichment (without opening file)
whisper.enrich(metadata)
# Check API interactions
mock_requests.post.assert_called_once_with(
"http://testapi/jobs",
json={"url": "http://cdn.example.com/test.mp4", "type": "translate"},
headers={"Authorization": "Bearer whisper-key"}
)
# Verify job status checks
assert mock_requests.get.call_count == 2
assert "artifact_0_text" in metadata.media[0].get("whisper_model")
assert metadata.media[0].get("whisper_model") == {'artifact_0_text': 'test transcript',
'job_artifacts_check': 'http://testapi/jobs/job123/artifacts',
'job_id': 'job123',
'job_status_check': 'http://testapi/jobs/job123'}
def test_submit_job(enricher, mocker):
"""Test job submission method"""
whisper, _ = enricher
m = Media("test.mp4")
m.add_url(TEST_S3_URL)
mock_requests = mocker.patch("auto_archiver.modules.whisper_enricher.whisper_enricher.requests")
mock_response = mocker.MagicMock()
mock_response.status_code = 201
mock_response.json.return_value = {"id": "job123"}
mock_requests.post.return_value = mock_response
job_id = whisper.submit_job(m)
assert job_id == "job123"
def test_submit_raises_status(enricher, mocker):
whisper, _ = enricher
m = Media("test.mp4")
m.add_url(TEST_S3_URL)
mock_requests = mocker.patch("auto_archiver.modules.whisper_enricher.whisper_enricher.requests")
mock_response = mocker.MagicMock()
mock_response.status_code = 400
mock_response.json.return_value = {"id": "job123"}
mock_requests.post.return_value = mock_response
with pytest.raises(AssertionError) as exc_info:
whisper.submit_job(m)
assert str(exc_info.value) == "calling the whisper api http://testapi returned a non-success code: 400"
# @pytest.mark.parametrize("test_url, status", ["http://cdn.example.com/test.mp4",])
def test_submit_job_fails(enricher):
"""Test assertion fails with non-S3 URL"""
whisper, mock_s3 = enricher
m = Media("test.mp4")
m.add_url("http://cdn.wrongurl.com/test.mp4")
with pytest.raises(AssertionError):
whisper.submit_job(m)

View File

@@ -9,6 +9,7 @@ import pytest
from auto_archiver.modules.generic_extractor.generic_extractor import GenericExtractor
from .test_extractor_base import TestExtractorBase
CI=os.getenv("GITHUB_ACTIONS", '') == 'true'
class TestGenericExtractor(TestExtractorBase):
"""Tests Generic Extractor
"""
@@ -77,10 +78,11 @@ class TestGenericExtractor(TestExtractorBase):
result = self.extractor.download(item)
assert not result
@pytest.mark.skipif(CI, reason="Currently no way to authenticate when on CI. Youtube (yt-dlp) doesn't support logging in with username/password.")
@pytest.mark.download
def test_youtube_download(self, make_item):
# url https://www.youtube.com/watch?v=5qap5aO4i9A
item = make_item("https://www.youtube.com/watch?v=J---aiyznGQ")
result = self.extractor.download(item)
assert result.get_url() == "https://www.youtube.com/watch?v=J---aiyznGQ"
@@ -114,6 +116,7 @@ class TestGenericExtractor(TestExtractorBase):
result = self.extractor.download(item)
assert result is not False
@pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
@pytest.mark.download
def test_truthsocial_download_video(self, make_item):
item = make_item("https://truthsocial.com/@DaynaTrueman/posts/110602446619561579")
@@ -121,18 +124,21 @@ class TestGenericExtractor(TestExtractorBase):
assert len(result.media) == 1
assert result is not False
@pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
@pytest.mark.download
def test_truthsocial_download_no_media(self, make_item):
item = make_item("https://truthsocial.com/@bbcnewa/posts/109598702184774628")
result = self.extractor.download(item)
assert result is not False
@pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
@pytest.mark.download
def test_truthsocial_download_poll(self, make_item):
item = make_item("https://truthsocial.com/@CNN_US/posts/113724326568555098")
result = self.extractor.download(item)
assert result is not False
@pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
@pytest.mark.download
def test_truthsocial_download_single_image(self, make_item):
item = make_item("https://truthsocial.com/@mariabartiromo/posts/113861116433335006")
@@ -140,6 +146,7 @@ class TestGenericExtractor(TestExtractorBase):
assert len(result.media) == 1
assert result is not False
@pytest.mark.skipif(CI, reason="Truth social blocks GH actions.")
@pytest.mark.download
def test_truthsocial_download_multiple_images(self, make_item):
item = make_item("https://truthsocial.com/@trrth/posts/113861302149349135")

View File

@@ -1,15 +1,12 @@
from datetime import datetime
from typing import Type
import pytest
from unittest.mock import patch, MagicMock
from auto_archiver.core import Metadata
from auto_archiver.modules.instagram_api_extractor.instagram_api_extractor import InstagramAPIExtractor
from .test_extractor_base import TestExtractorBase
@pytest.fixture
def mock_user_response():
return {
@@ -115,74 +112,74 @@ class TestInstagramAPIExtractor(TestExtractorBase):
# test gets text (metadata title)
pass
def test_download_profile_basic(self, metadata, mock_user_response):
def test_download_profile_basic(self, metadata, mock_user_response, mocker):
"""Test basic profile download without full_profile"""
with patch.object(self.extractor, 'call_api') as mock_call, \
patch.object(self.extractor, 'download_from_url') as mock_download:
# Mock API responses
mock_call.return_value = mock_user_response
mock_download.return_value = "profile.jpg"
mock_call = mocker.patch.object(self.extractor, 'call_api')
mock_download = mocker.patch.object(self.extractor, 'download_from_url')
# Mock API responses
mock_call.return_value = mock_user_response
mock_download.return_value = "profile.jpg"
result = self.extractor.download_profile(metadata, "test_user")
assert result.status == "insta profile: success"
assert result.get_title() == "Test User"
assert result.get("data") == self.extractor.cleanup_dict(mock_user_response["user"])
# Verify profile picture download
mock_call.assert_called_once_with("v2/user/by/username", {"username": "test_user"})
mock_download.assert_called_once_with("http://example.com/profile.jpg")
assert len(result.media) == 1
assert result.media[0].filename == "profile.jpg"
result = self.extractor.download_profile(metadata, "test_user")
assert result.status == "insta profile: success"
assert result.get_title() == "Test User"
assert result.get("data") == self.extractor.cleanup_dict(mock_user_response["user"])
# Verify profile picture download
mock_call.assert_called_once_with("v2/user/by/username", {"username": "test_user"})
mock_download.assert_called_once_with("http://example.com/profile.jpg")
assert len(result.media) == 1
assert result.media[0].filename == "profile.jpg"
def test_download_profile_full(self, metadata, mock_user_response, mock_story_response):
def test_download_profile_full(self, metadata, mock_user_response, mock_story_response, mocker):
"""Test full profile download with stories/posts"""
with patch.object(self.extractor, 'call_api') as mock_call, \
patch.object(self.extractor, 'download_all_posts') as mock_posts, \
patch.object(self.extractor, 'download_all_highlights') as mock_highlights, \
patch.object(self.extractor, 'download_all_tagged') as mock_tagged, \
patch.object(self.extractor, '_download_stories_reusable') as mock_stories:
mock_call = mocker.patch.object(self.extractor, 'call_api')
mock_posts = mocker.patch.object(self.extractor, 'download_all_posts')
mock_highlights = mocker.patch.object(self.extractor, 'download_all_highlights')
mock_tagged = mocker.patch.object(self.extractor, 'download_all_tagged')
mock_stories = mocker.patch.object(self.extractor, '_download_stories_reusable')
self.extractor.full_profile = True
mock_call.side_effect = [
mock_user_response,
mock_story_response
]
mock_highlights.return_value = None
mock_stories.return_value = mock_story_response
mock_posts.return_value = None
mock_tagged.return_value = None
self.extractor.full_profile = True
mock_call.side_effect = [
mock_user_response,
mock_story_response
]
mock_highlights.return_value = None
mock_stories.return_value = mock_story_response
mock_posts.return_value = None
mock_tagged.return_value = None
result = self.extractor.download_profile(metadata, "test_user")
assert result.get("#stories") == len(mock_story_response)
mock_posts.assert_called_once_with(result, "123")
assert "errors" not in result.metadata
result = self.extractor.download_profile(metadata, "test_user")
assert result.get("#stories") == len(mock_story_response)
mock_posts.assert_called_once_with(result, "123")
assert "errors" not in result.metadata
def test_download_profile_not_found(self, metadata):
def test_download_profile_not_found(self, metadata, mocker):
"""Test profile not found error"""
with patch.object(self.extractor, 'call_api') as mock_call:
mock_call.return_value = {"user": None}
with pytest.raises(AssertionError) as exc_info:
self.extractor.download_profile(metadata, "invalid_user")
assert "User invalid_user not found" in str(exc_info.value)
mock_call = mocker.patch.object(self.extractor, 'call_api')
mock_call.return_value = {"user": None}
with pytest.raises(AssertionError) as exc_info:
self.extractor.download_profile(metadata, "invalid_user")
assert "User invalid_user not found" in str(exc_info.value)
def test_download_profile_error_handling(self, metadata, mock_user_response):
def test_download_profile_error_handling(self, metadata, mock_user_response, mocker):
"""Test error handling in full profile mode"""
with (patch.object(self.extractor, 'call_api') as mock_call, \
patch.object(self.extractor, 'download_all_highlights') as mock_highlights, \
patch.object(self.extractor, 'download_all_tagged') as mock_tagged, \
patch.object(self.extractor, '_download_stories_reusable') as stories_tagged, \
patch.object(self.extractor, 'download_all_posts') as mock_posts
):
self.extractor.full_profile = True
mock_call.side_effect = [
mock_user_response,
Exception("Stories API failed"),
Exception("Posts API failed")
]
mock_highlights.return_value = None
mock_tagged.return_value = None
stories_tagged.return_value = None
mock_posts.return_value = None
result = self.extractor.download_profile(metadata, "test_user")
mock_call = mocker.patch.object(self.extractor, 'call_api')
mock_highlights = mocker.patch.object(self.extractor, 'download_all_highlights')
mock_tagged = mocker.patch.object(self.extractor, 'download_all_tagged')
stories_tagged = mocker.patch.object(self.extractor, '_download_stories_reusable')
mock_posts = mocker.patch.object(self.extractor, 'download_all_posts')
assert result.is_success()
assert "Error downloading stories for test_user" in result.metadata["errors"]
self.extractor.full_profile = True
mock_call.side_effect = [
mock_user_response,
Exception("Stories API failed"),
Exception("Posts API failed")
]
mock_highlights.return_value = None
mock_tagged.return_value = None
stories_tagged.return_value = None
mock_posts.return_value = None
result = self.extractor.download_profile(metadata, "test_user")
assert result.is_success()
assert "Error downloading stories for test_user" in result.metadata["errors"]

View File

@@ -1,94 +1,108 @@
import os
from typing import Type
from unittest.mock import patch, MagicMock
import pytest
from auto_archiver.core import Metadata
from auto_archiver.core.extractor import Extractor
from auto_archiver.modules.instagram_tbot_extractor import InstagramTbotExtractor
from tests.extractors.test_extractor_base import TestExtractorBase
TESTFILES = os.path.join(os.path.dirname(__file__), "testfiles")
@pytest.fixture
def session_file(tmpdir):
"""Fixture to create a test session file."""
session_file = os.path.join(tmpdir, "test_session.session")
with open(session_file, "w") as f:
f.write("mock_session_data")
return session_file.replace(".session", "")
def patch_extractor_methods(request, setup_module, mocker):
mocker.patch.object(InstagramTbotExtractor, '_prepare_session_file', return_value=None)
mocker.patch.object(InstagramTbotExtractor, '_initialize_telegram_client', return_value=None)
yield
@pytest.fixture(autouse=True)
def patch_extractor_methods(request, setup_module):
with patch.object(InstagramTbotExtractor, '_prepare_session_file', return_value=None), \
patch.object(InstagramTbotExtractor, '_initialize_telegram_client', return_value=None):
if hasattr(request, 'cls') and hasattr(request.cls, 'config'):
request.cls.extractor = setup_module("instagram_tbot_extractor", request.cls.config)
yield
@pytest.fixture
def metadata_sample():
m = Metadata()
m.set_title("Test Title")
m.set_timestamp("2021-01-01T00:00:00Z")
m.set_timestamp("2021-01-01T00:00:00")
m.set_url("https://www.instagram.com/p/1234567890")
return m
class TestInstagramTbotExtractor:
@pytest.fixture
def mock_telegram_client(mocker):
"""Fixture to mock TelegramClient interactions."""
mock_client = mocker.patch("auto_archiver.modules.instagram_tbot_extractor.client")
instance = mocker.MagicMock()
mock_client.return_value = instance
return instance
@pytest.fixture
def extractor(setup_module, patch_extractor_methods, mocker):
extractor_module = "instagram_tbot_extractor"
extractor: InstagramTbotExtractor
config = {
"api_id": 12345,
"api_hash": "test_api_hash",
"session_file": "test_session",
"timeout": 4
}
extractor = setup_module(extractor_module, config)
extractor.client = mocker.MagicMock()
extractor.session_file = "test_session"
return extractor
def test_non_instagram_url(extractor, metadata_sample):
metadata_sample.set_url("https://www.youtube.com")
assert extractor.download(metadata_sample) is False
def test_download_success(extractor, metadata_sample, mocker):
mocker.patch.object(extractor, "_send_url_to_bot", return_value=(mocker.MagicMock(), 101))
mocker.patch.object(extractor, "_process_messages", return_value="Sample Instagram post caption")
result = extractor.download(metadata_sample)
assert result.is_success()
assert result.status == "insta-via-bot: success"
assert result.metadata.get("title") == "Sample Instagram post caption"
def test_download_invalid(extractor, metadata_sample, mocker):
mocker.patch.object(extractor, "_send_url_to_bot", return_value=(mocker.MagicMock(), 101))
mocker.patch.object(extractor, "_process_messages", return_value="You must enter a URL to a post")
assert extractor.download(metadata_sample) is False
@pytest.mark.skip(reason="Requires authentication.")
class TestInstagramTbotExtractorReal(TestExtractorBase):
# To run these tests set the TELEGRAM_API_ID and TELEGRAM_API_HASH environment variables, and ensure the session file exists.
# Note these are true at this point in time, but changes to source media could be reason for failure.
extractor_module = "instagram_tbot_extractor"
extractor: InstagramTbotExtractor
config = {
"api_id": os.environ.get("TELEGRAM_API_ID"),
"api_hash": os.environ.get("TELEGRAM_API_HASH"),
"session_file": "secrets/anon-insta",
}
@pytest.fixture
def mock_telegram_client(self):
"""Fixture to mock TelegramClient interactions."""
with patch("auto_archiver.modules.instagram_tbot_extractor._initialize_telegram_client") as mock_client:
instance = MagicMock()
mock_client.return_value = instance
yield instance
def test_extractor_is_initialized(self):
assert self.extractor is not None
@patch("time.sleep")
@pytest.mark.parametrize("url, expected_status, bot_responses", [
("https://www.instagram.com/p/C4QgLbrIKXG", "insta-via-bot: success", [MagicMock(id=101, media=None, message="Are you new to Bellingcat? - The way we share our investigations is different. 💭\nWe want you to read our story but also learn ou")]),
("https://www.instagram.com/reel/DEVLK8qoIbg/", "insta-via-bot: success", [MagicMock(id=101, media=None, message="Our volunteer community is at the centre of many incredible Bellingcat investigations and tools. Stephanie Ladel is one such vol")]),
# todo tbot not working for stories :(
("https://www.instagram.com/stories/bellingcatofficial/3556336382743057476/", False, [MagicMock(id=101, media=None, message="Media not found or unavailable")]),
("https://www.youtube.com/watch?v=ymCMy8OffHM", False, []),
("https://www.instagram.com/p/INVALID", False, [MagicMock(id=101, media=None, message="You must enter a URL to a post")]),
@pytest.mark.parametrize("url, expected_status, message, len_media", [
("https://www.instagram.com/p/C4QgLbrIKXG", "insta-via-bot: success",
"Are you new to Bellingcat? - The way we share our investigations is different. 💭\nWe want you to read our story but also learn ou",
6),
("https://www.instagram.com/reel/DEVLK8qoIbg/", "insta-via-bot: success",
"Our volunteer community is at the centre of many incredible Bellingcat investigations and tools. Stephanie Ladel is one such vol",
3),
# instagram tbot not working (potentially intermittently?) for stories - replace with a live story to retest
# ("https://www.instagram.com/stories/bellingcatofficial/3556336382743057476/", False, "Media not found or unavailable"),
# Seems to be working intermittently for highlights
# ("https://www.instagram.com/stories/highlights/17868810693068139/", "insta-via-bot: success", None, 50),
# Marking invalid url as success
("https://www.instagram.com/p/INVALID", "insta-via-bot: success", "Media not found or unavailable", 0),
("https://www.youtube.com/watch?v=ymCMy8OffHM", False, None, 0),
])
def test_download(self, mock_sleep, url, expected_status, bot_responses, metadata_sample):
def test_download(self, url, expected_status, message, len_media, metadata_sample):
"""Test the `download()` method with various Instagram URLs."""
metadata_sample.set_url(url)
self.extractor.client = MagicMock()
result = self.extractor.download(metadata_sample)
pass
# TODO fully mock or use as authenticated test
# if expected_status:
# assert result.is_success()
# assert result.status == expected_status
# assert result.metadata.get("title") in [msg.message[:128] for msg in bot_responses if msg.message]
# else:
# assert result is False
# Test story
# Test expired story
# Test requires login/ access (?)
# Test post
# Test multiple images?
if expected_status:
assert result.is_success()
assert result.status == expected_status
assert result.metadata.get("title") == message
assert len(result.media) == len_media
else:
assert result is False

View File

@@ -23,7 +23,6 @@ class TestTwitterApiExtractor(TestExtractorBase):
}
@pytest.mark.parametrize("url, expected", [
("https://t.co/yl3oOJatFp", "https://www.bellingcat.com/category/resources/"), # t.co URL
("https://x.com/bellingcat/status/1874097816571961839", "https://x.com/bellingcat/status/1874097816571961839"), # x.com urls unchanged
("https://twitter.com/bellingcat/status/1874097816571961839", "https://twitter.com/bellingcat/status/1874097816571961839"), # twitter urls unchanged
("https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w", "https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w"), # don't strip params from twitter urls (changed Jan 2025)
@@ -32,7 +31,11 @@ class TestTwitterApiExtractor(TestExtractorBase):
])
def test_sanitize_url(self, url, expected):
assert expected == self.extractor.sanitize_url(url)
@pytest.mark.download
def test_sanitize_url_download(self):
assert "https://www.bellingcat.com/category/resources/" == self.extractor.sanitize_url("https://t.co/yl3oOJatFp")
@pytest.mark.parametrize("url, exptected_username, exptected_tweetid", [
("https://twitter.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
("https://x.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),

View File

@@ -0,0 +1,108 @@
import pytest
from auto_archiver.modules.atlos_feeder import AtlosFeeder
class FakeAPIResponse:
"""Simulate a response object."""
def __init__(self, data: dict, raise_error: bool = False) -> None:
self._data = data
self.raise_error = raise_error
def json(self) -> dict:
return self._data
def raise_for_status(self) -> None:
if self.raise_error:
raise Exception("HTTP error")
@pytest.fixture
def atlos_feeder(setup_module) -> AtlosFeeder:
"""Fixture for AtlosFeeder."""
configs: dict = {
"api_token": "abc123",
"atlos_url": "https://platform.atlos.org",
}
return setup_module("atlos_feeder", configs)
@pytest.fixture
def mock_atlos_api(mocker):
"""Fixture to mock requests to Atlos API."""
def _mock_responses(responses):
mocker.patch(
"requests.get",
side_effect=[FakeAPIResponse(data) for data in responses],
)
return _mock_responses
def test_atlos_feeder_iter_yields_valid_metadata(atlos_feeder, mock_atlos_api):
"""Test valid items are yielded and invalid ones ignored."""
mock_atlos_api([
{
"next": None,
"results": [
{"source_url": "http://example.com", "id": 1,
"metadata": {"auto_archiver": {"processed": False}},
"visibility": "visible", "status": "complete"},
{"source_url": "", "id": 2,
"metadata": {"auto_archiver": {"processed": False}},
"visibility": "visible", "status": "complete"},
{"source_url": "http://example.org", "id": 3,
"metadata": {"auto_archiver": {"processed": True}},
"visibility": "visible", "status": "complete"},
],
}
])
items = list(atlos_feeder)
assert len(items) == 1
assert items[0].get_url() == "http://example.com"
assert items[0].get("atlos_id") == 1
def test_atlos_feeder_multiple_pages(atlos_feeder, mock_atlos_api):
"""Test iteration over multiple pages with valid items."""
mock_atlos_api([
{
"next": "cursor2",
"results": [
{"source_url": "http://example1.com", "id": 10,
"metadata": {"auto_archiver": {"processed": False}},
"visibility": "visible", "status": "complete"},
],
},
{
"next": None,
"results": [
{"source_url": "http://example2.com", "id": 20,
"metadata": {"auto_archiver": {"processed": False}},
"visibility": "visible", "status": "complete"},
],
},
])
items = list(atlos_feeder)
assert len(items) == 2
assert items[0].get_url() == "http://example1.com"
assert items[0].get("atlos_id") == 10
assert items[1].get_url() == "http://example2.com"
assert items[1].get("atlos_id") == 20
def test_atlos_feeder_no_results(atlos_feeder, mock_atlos_api):
"""Test iteration stops when no results are returned."""
mock_atlos_api([{"next": None, "results": []}])
assert list(atlos_feeder) == []
def test_atlos_feeder_http_error(atlos_feeder, mocker):
"""Test raises an exception on HTTP error."""
mocker.patch(
"requests.get",
return_value=FakeAPIResponse({"next": None, "results": []}, raise_error=True),
)
with pytest.raises(Exception, match="HTTP error"):
list(atlos_feeder)

View File

@@ -2,27 +2,23 @@ from typing import Type
import gspread
import pytest
from unittest.mock import patch, MagicMock
from auto_archiver.modules.gsheet_feeder import GsheetsFeeder
from auto_archiver.core import Metadata, Feeder
def test_setup_without_sheet_and_sheet_id(setup_module):
def test_setup_without_sheet_and_sheet_id(setup_module, mocker):
# Ensure setup() raises AssertionError if neither sheet nor sheet_id is set.
with patch("gspread.service_account"):
with pytest.raises(AssertionError):
setup_module(
"gsheet_feeder",
{"service_account": "dummy.json", "sheet": None, "sheet_id": None},
)
mocker.patch("gspread.service_account")
with pytest.raises(AssertionError):
setup_module(
"gsheet_feeder",
{"service_account": "dummy.json", "sheet": None, "sheet_id": None},
)
@pytest.fixture
def gsheet_feeder(setup_module) -> GsheetsFeeder:
with patch("gspread.service_account"):
feeder = setup_module(
"gsheet_feeder",
{
def gsheet_feeder(setup_module, mocker) -> GsheetsFeeder:
config: dict = {
"service_account": "dummy.json",
"sheet": "test-auto-archiver",
"sheet_id": None,
@@ -46,9 +42,13 @@ def gsheet_feeder(setup_module) -> GsheetsFeeder:
"allow_worksheets": set(),
"block_worksheets": set(),
"use_sheet_names_in_stored_paths": True,
},
)
feeder.gsheets_client = MagicMock()
}
mocker.patch("gspread.service_account")
feeder = setup_module(
"gsheet_feeder",
config
)
feeder.gsheets_client = mocker.MagicMock()
return feeder
@@ -129,56 +129,56 @@ def test__set_metadata_with_folder(gsheet_feeder: GsheetsFeeder):
],
)
def test_open_sheet_with_name_or_id(
setup_module, sheet, sheet_id, expected_method, expected_arg, description
setup_module, sheet, sheet_id, expected_method, expected_arg, description, mocker
):
"""Ensure open_sheet() correctly opens by name or ID based on configuration."""
with patch("gspread.service_account") as mock_service_account:
mock_client = MagicMock()
mock_service_account.return_value = mock_client
mock_client.open.return_value = "MockSheet"
mock_client.open_by_key.return_value = "MockSheet"
mock_service_account = mocker.patch("gspread.service_account")
mock_client = mocker.MagicMock()
mock_service_account.return_value = mock_client
mock_client.open.return_value = "MockSheet"
mock_client.open_by_key.return_value = "MockSheet"
# Setup module with parameterized values
feeder = setup_module(
"gsheet_feeder",
{"service_account": "dummy.json", "sheet": sheet, "sheet_id": sheet_id},
)
sheet_result = feeder.open_sheet()
# Validate the correct method was called
getattr(mock_client, expected_method).assert_called_once_with(
expected_arg
), f"Failed: {description}"
assert sheet_result == "MockSheet", f"Failed: {description}"
# Setup module with parameterized values
feeder = setup_module(
"gsheet_feeder",
{"service_account": "dummy.json", "sheet": sheet, "sheet_id": sheet_id},
)
sheet_result = feeder.open_sheet()
# Validate the correct method was called
getattr(mock_client, expected_method).assert_called_once_with(
expected_arg
), f"Failed: {description}"
assert sheet_result == "MockSheet", f"Failed: {description}"
@pytest.mark.usefixtures("setup_module")
def test_open_sheet_with_sheet_id(setup_module):
def test_open_sheet_with_sheet_id(setup_module, mocker):
"""Ensure open_sheet() correctly opens a sheet by ID."""
with patch("gspread.service_account") as mock_service_account:
mock_client = MagicMock()
mock_service_account.return_value = mock_client
mock_client.open_by_key.return_value = "MockSheet"
feeder = setup_module(
"gsheet_feeder",
{"service_account": "dummy.json", "sheet": None, "sheet_id": "ABC123"},
)
sheet = feeder.open_sheet()
mock_client.open_by_key.assert_called_once_with("ABC123")
assert sheet == "MockSheet"
mock_service_account = mocker.patch("gspread.service_account")
mock_client = mocker.MagicMock()
mock_service_account.return_value = mock_client
mock_client.open_by_key.return_value = "MockSheet"
feeder = setup_module(
"gsheet_feeder",
{"service_account": "dummy.json", "sheet": None, "sheet_id": "ABC123"},
)
sheet = feeder.open_sheet()
mock_client.open_by_key.assert_called_once_with("ABC123")
assert sheet == "MockSheet"
def test_should_process_sheet(setup_module):
with patch("gspread.service_account"):
gdb = setup_module(
"gsheet_feeder",
{
"service_account": "dummy.json",
"sheet": "TestSheet",
"sheet_id": None,
"allow_worksheets": {"TestSheet", "Sheet2"},
"block_worksheets": {"Sheet3"},
},
)
def test_should_process_sheet(setup_module, mocker):
mocker.patch("gspread.service_account")
gdb = setup_module(
"gsheet_feeder",
{
"service_account": "dummy.json",
"sheet": "TestSheet",
"sheet_id": None,
"allow_worksheets": {"TestSheet", "Sheet2"},
"block_worksheets": {"Sheet3"},
},
)
assert gdb.should_process_sheet("TestSheet") == True
assert gdb.should_process_sheet("Sheet3") == False
# False if allow_worksheets is set

View File

@@ -1,13 +1,13 @@
# Note this isn't a feeder, but contained as utility of the gsheet feeder module
import pytest
from unittest.mock import MagicMock
from auto_archiver.modules.gsheet_feeder import GWorksheet
class TestGWorksheet:
@pytest.fixture
def mock_worksheet(self):
mock_ws = MagicMock()
def mock_worksheet(self, mocker):
mock_ws = mocker.MagicMock()
mock_ws.get_values.return_value = [
["Link", "Archive Status", "Archive Location", "Archive Date"],
["url1", "archived", "filepath1", "2023-01-01"],
@@ -136,8 +136,8 @@ class TestGWorksheet:
assert gworksheet.to_a1(row, col) == expected
# Test empty worksheet
def test_empty_worksheet_initialization(self):
mock_ws = MagicMock()
def test_empty_worksheet_initialization(self, mocker):
mock_ws = mocker.MagicMock()
mock_ws.get_values.return_value = []
g = GWorksheet(mock_ws)
assert g.headers == []

View File

@@ -1,6 +1,5 @@
from typing import Type
import pytest
from unittest.mock import MagicMock, patch
from auto_archiver.core import Media
from auto_archiver.modules.s3_storage import S3Storage
@@ -11,7 +10,6 @@ class TestS3Storage:
"""
module_name: str = "s3_storage"
storage: Type[S3Storage]
s3: MagicMock
config: dict = {
"path_generator": "flat",
"filename_generator": "static",
@@ -25,13 +23,14 @@ class TestS3Storage:
"private": False,
}
@patch('boto3.client')
@pytest.fixture(autouse=True)
def setup_storage(self, setup_module):
def setup_storage(self, setup_module, mocker):
self.s3 = S3Storage()
self.storage = setup_module(self.module_name, self.config)
def test_client_initialization(self):
"""Test that S3 client is initialized with correct parameters"""
assert self.storage.s3 is not None
assert self.storage.s3.meta.region_name == 'test-region'
@@ -44,81 +43,63 @@ class TestS3Storage:
media.key = "another/path.jpg"
assert self.storage.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg"
def test_uploadf_sets_acl_public(self):
def test_uploadf_sets_acl_public(self, mocker):
media = Media("test.txt")
mock_file = MagicMock()
with patch.object(self.storage.s3, 'upload_fileobj') as mock_s3_upload, \
patch.object(self.storage, 'is_upload_needed', return_value=True):
self.storage.uploadf(mock_file, media)
mock_s3_upload.assert_called_once_with(
mock_file,
Bucket='test-bucket',
Key=media.key,
ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/plain'}
)
mock_file = mocker.MagicMock()
mock_s3_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
mocker.patch.object(self.storage, 'is_upload_needed', return_value=True)
self.storage.uploadf(mock_file, media)
mock_s3_upload.assert_called_once_with(
mock_file,
Bucket='test-bucket',
Key=media.key,
ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/plain'}
)
def test_upload_decision_logic(self):
def test_upload_decision_logic(self, mocker):
"""Test is_upload_needed under different conditions"""
media = Media("test.txt")
# Test default state (random_no_duplicate=False)
assert self.storage.is_upload_needed(media) is True
# Set duplicate checking config to true:
self.storage.random_no_duplicate = True
with patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash') as mock_calc_hash, \
patch.object(self.storage, 'file_in_folder') as mock_file_in_folder:
mock_calc_hash.return_value = 'beepboop123beepboop123beepboop123'
mock_file_in_folder.return_value = 'existing_key.txt'
# Test duplicate result
assert self.storage.is_upload_needed(media) is False
assert media.key == 'existing_key.txt'
mock_file_in_folder.assert_called_with(
# (first 24 chars of hash)
'no-dups/beepboop123beepboop123be'
)
mock_calc_hash = mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value='beepboop123beepboop123beepboop123')
mock_file_in_folder = mocker.patch.object(self.storage, 'file_in_folder', return_value='existing_key.txt')
assert self.storage.is_upload_needed(media) is False
assert media.key == 'existing_key.txt'
mock_file_in_folder.assert_called_with('no-dups/beepboop123beepboop123be')
@patch.object(S3Storage, 'file_in_folder')
def test_skips_upload_when_duplicate_exists(self, mock_file_in_folder):
def test_skips_upload_when_duplicate_exists(self, mocker):
"""Test that upload skips when file_in_folder finds existing object"""
self.storage.random_no_duplicate = True
mock_file_in_folder.return_value = "existing_folder/existing_file.txt"
# Create test media with calculated hash
mock_file_in_folder = mocker.patch.object(S3Storage, 'file_in_folder', return_value="existing_folder/existing_file.txt")
media = Media("test.txt")
media.key = "original_path.txt"
with patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash') as mock_calculate_hash:
mock_calculate_hash.return_value = "beepboop123beepboop123beepboop123"
# Verify upload
assert self.storage.is_upload_needed(media) is False
assert media.key == "existing_folder/existing_file.txt"
assert media.get("previously archived") is True
with patch.object(self.storage.s3, 'upload_fileobj') as mock_upload:
result = self.storage.uploadf(None, media)
mock_upload.assert_not_called()
assert result is True
mock_calculate_hash = mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value="beepboop123beepboop123beepboop123")
assert self.storage.is_upload_needed(media) is False
assert media.key == "existing_folder/existing_file.txt"
assert media.get("previously archived") is True
mock_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
result = self.storage.uploadf(None, media)
mock_upload.assert_not_called()
assert result is True
@patch.object(S3Storage, 'is_upload_needed')
def test_uploads_with_correct_parameters(self, mock_upload_needed):
def test_uploads_with_correct_parameters(self, mocker):
media = Media("test.txt")
media.key = "original_key.txt"
mock_upload_needed.return_value = True
mocker.patch.object(S3Storage, 'is_upload_needed', return_value=True)
media.mimetype = 'image/png'
mock_file = MagicMock()
mock_file = mocker.MagicMock()
mock_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
self.storage.uploadf(mock_file, media)
mock_upload.assert_called_once_with(
mock_file,
Bucket='test-bucket',
Key='original_key.txt',
ExtraArgs={
'ACL': 'public-read',
'ContentType': 'image/png'
}
)
with patch.object(self.storage.s3, 'upload_fileobj') as mock_upload:
self.storage.uploadf(mock_file, media)
# verify call occured with these params
mock_upload.assert_called_once_with(
mock_file,
Bucket='test-bucket',
Key='original_key.txt',
ExtraArgs={
'ACL': 'public-read',
'ContentType': 'image/png'
}
)
def test_file_in_folder_exists(self):
with patch.object(self.storage.s3, 'list_objects') as mock_list_objects:
mock_list_objects.return_value = {'Contents': [{'Key': 'path/to/file.txt'}]}
assert self.storage.file_in_folder('path/to/') == 'path/to/file.txt'
def test_file_in_folder_exists(self, mocker):
mock_list_objects = mocker.patch.object(self.storage.s3, 'list_objects', return_value={'Contents': [{'Key': 'path/to/file.txt'}]})
assert self.storage.file_in_folder('path/to/') == 'path/to/file.txt'

View File

@@ -0,0 +1,142 @@
import os
import hashlib
import pytest
from auto_archiver.core import Media, Metadata
from auto_archiver.modules.atlos_storage import AtlosStorage
class FakeAPIResponse:
"""Simulate a response object."""
def __init__(self, data: dict, raise_error: bool = False) -> None:
self._data = data
self.raise_error = raise_error
def json(self) -> dict:
return self._data
def raise_for_status(self) -> None:
if self.raise_error:
raise Exception("HTTP error")
@pytest.fixture
def atlos_storage(setup_module) -> AtlosStorage:
"""Fixture for AtlosStorage."""
configs: dict = {
"api_token": "abc123",
"atlos_url": "https://platform.atlos.org",
}
return setup_module("atlos_storage", configs)
@pytest.fixture
def media(tmp_path) -> Media:
"""Fixture for Media."""
content = b"media content"
file_path = tmp_path / "media.txt"
file_path.write_bytes(content)
media = Media(filename=str(file_path))
media.properties = {"something": "Title"}
media.key = "key"
return media
def test_get_cdn_url(atlos_storage: AtlosStorage) -> None:
"""Test get_cdn_url returns the configured atlos_url."""
media = Media(filename="dummy.mp4")
url = atlos_storage.get_cdn_url(media)
assert url == atlos_storage.atlos_url
def test_hash(tmp_path, atlos_storage: AtlosStorage) -> None:
"""Test _hash() computes the correct SHA-256 hash of a file."""
content = b"hello world"
file_path = tmp_path / "test.txt"
file_path.write_bytes(content)
media = Media(filename="dummy.mp4")
media.filename = str(file_path)
expected_hash = hashlib.sha256(content).hexdigest()
assert atlos_storage._hash(media) == expected_hash
def test_upload_no_atlos_id(tmp_path, atlos_storage: AtlosStorage, media: Media, mocker) -> None:
"""Test upload() returns False when metadata lacks atlos_id."""
metadata = Metadata() # atlos_id not set
post_mock = mocker.patch("requests.post")
result = atlos_storage.upload(media, metadata)
assert result is False
post_mock.assert_not_called()
def test_upload_already_uploaded(atlos_storage: AtlosStorage,
metadata: Metadata,
media: Media,
tmp_path,
mocker) -> None:
"""Test upload() returns True if media hash already exists."""
content = b"media content"
metadata.set("atlos_id", 101)
media_hash = hashlib.sha256(content).hexdigest()
fake_get = FakeAPIResponse({
"result": {"artifacts": [{"file_hash_sha256": media_hash}]}
})
get_mock = mocker.patch("requests.get", return_value=fake_get)
post_mock = mocker.patch("requests.post")
result = atlos_storage.upload(media, metadata)
assert result is True
get_mock.assert_called_once()
post_mock.assert_not_called()
def test_upload_not_uploaded(tmp_path, atlos_storage: AtlosStorage,
metadata: Metadata,
media: Media,
mocker) -> None:
"""Test upload() uploads media when not already present."""
metadata.set("atlos_id", 202)
fake_get = FakeAPIResponse({
"result": {"artifacts": [{"file_hash_sha256": "different_hash"}]}
})
get_mock = mocker.patch("requests.get", return_value=fake_get)
fake_post = FakeAPIResponse({}, raise_error=False)
post_mock = mocker.patch("requests.post", return_value=fake_post)
result = atlos_storage.upload(media, metadata)
assert result is True
get_mock.assert_called_once()
post_mock.assert_called_once()
expected_url = f"{atlos_storage.atlos_url}/api/v2/source_material/upload/202"
expected_headers = {"Authorization": f"Bearer {atlos_storage.api_token}"}
expected_params = {"title": media.properties}
call_kwargs = post_mock.call_args.kwargs
assert call_kwargs["headers"] == expected_headers
assert call_kwargs["params"] == expected_params
# Verify the URL passed to requests.post.
posted_url = call_kwargs.get("url") or post_mock.call_args.args[0]
assert posted_url == expected_url
# Verify files parameter contains the correct filename.
file_tuple = call_kwargs["files"]["file"]
assert file_tuple[0] == os.path.basename(media.filename)
def test_upload_post_http_error(tmp_path,
atlos_storage: AtlosStorage,
metadata: Metadata,
media: Media,
mocker) -> None:
"""Test upload() propagates HTTP error during POST."""
metadata.set("atlos_id", 303)
fake_get = FakeAPIResponse({
"result": {"artifacts": []}
})
mocker.patch("requests.get", return_value=fake_get)
fake_post = FakeAPIResponse({}, raise_error=True)
mocker.patch("requests.post", return_value=fake_post)
with pytest.raises(Exception, match="HTTP error"):
atlos_storage.upload(media, metadata)
def test_uploadf_not_implemented(atlos_storage: AtlosStorage) -> None:
"""Test uploadf() returns None (not implemented)."""
result = atlos_storage.uploadf(None, "dummy")
assert result is None

View File

@@ -1,44 +1,57 @@
from typing import Type
import pytest
from unittest.mock import MagicMock, patch
from oauth2client import service_account
from auto_archiver.core import Media
from auto_archiver.modules.gdrive_storage import GDriveStorage
from auto_archiver.core.metadata import Metadata
from tests.storages.test_storage_base import TestStorageBase
class TestGDriveStorage:
"""
Test suite for GDriveStorage.
"""
@pytest.fixture
def gdrive_storage(setup_module, mocker):
module_name: str = "gdrive_storage"
storage: Type[GDriveStorage]
storage: GDriveStorage
config: dict = {'path_generator': 'url',
'filename_generator': 'static',
'root_folder_id': "fake_root_folder_id",
'oauth_token': None,
'service_account': 'fake_service_account.json'
}
@pytest.fixture(autouse=True)
def gdrive(self, setup_module):
with patch('google.oauth2.service_account.Credentials.from_service_account_file') as mock_creds:
self.storage = setup_module(self.module_name, self.config)
def test_initialize_fails_with_non_existent_creds(self):
"""
Test that the Google Drive service raises a FileNotFoundError when the service account file does not exist.
"""
# Act and Assert
with pytest.raises(FileNotFoundError) as exc_info:
self.storage.setup()
assert "No such file or directory" in str(exc_info.value)
mocker.patch('google.oauth2.service_account.Credentials.from_service_account_file')
return setup_module(module_name, config)
def test_path_parts(self):
media = Media(filename="test.jpg")
media.key = "folder1/folder2/test.jpg"
def test_initialize_fails_with_non_existent_creds(setup_module):
"""Test that the Google Drive service raises a FileNotFoundError when the service account file does not exist.
(and isn't mocked)
"""
config: dict = {'path_generator': 'url',
'filename_generator': 'static',
'root_folder_id': "fake_root_folder_id",
'oauth_token': None,
'service_account': 'fake_service_account.json'
}
with pytest.raises(FileNotFoundError) as exc_info:
setup_module("gdrive_storage", config)
assert "No such file or directory" in str(exc_info.value)
def test_get_id_from_parent_and_name(gdrive_storage, mocker):
"""Test _get_id_from_parent_and_name returns correct id from an API result."""
fake_list = mocker.MagicMock()
fake_list.execute.return_value = {"files": [{"id": "123", "name": "testname"}]}
fake_service = mocker.MagicMock()
# mock the files.list return value
fake_service.files.return_value.list.return_value = fake_list
gdrive_storage.service = fake_service
result = gdrive_storage._get_id_from_parent_and_name("parent", "mock", retries=1, use_mime_type=False)
assert result == "123"
def test_path_parts():
media = Media(filename="test.jpg")
media.key = "folder1/folder2/test.jpg"
@pytest.mark.skip(reason="Requires real credentials")

View File

@@ -0,0 +1,54 @@
import os
from pathlib import Path
import pytest
from auto_archiver.core import Media
from auto_archiver.modules.local_storage import LocalStorage
@pytest.fixture
def local_storage(setup_module) -> LocalStorage:
configs: dict = {
"path_generator": "flat",
"filename_generator": "static",
"save_to": "./local_archive",
"save_absolute": False,
}
return setup_module("local_storage", configs)
@pytest.fixture
def sample_media(tmp_path) -> Media:
"""Fixture creating a Media object with temporary source file"""
src_file = tmp_path / "source.txt"
src_file.write_text("test content")
return Media(key="subdir/test.txt", filename=str(src_file))
def test_get_cdn_url_relative(local_storage):
media = Media(key="test.txt", filename="dummy.txt")
expected = os.path.join(local_storage.save_to, media.key)
assert local_storage.get_cdn_url(media) == expected
def test_get_cdn_url_absolute(local_storage):
media = Media(key="test.txt", filename="dummy.txt")
local_storage.save_absolute = True
expected = os.path.abspath(os.path.join(local_storage.save_to, media.key))
assert local_storage.get_cdn_url(media) == expected
def test_upload_file_contents_and_metadata(local_storage, sample_media):
dest = os.path.join(local_storage.save_to, sample_media.key)
assert local_storage.upload(sample_media) is True
assert Path(sample_media.filename).read_text() == Path(dest).read_text()
def test_upload_nonexistent_source(local_storage):
media = Media(key="missing.txt", filename="nonexistent.txt")
with pytest.raises(FileNotFoundError):
local_storage.upload(media)

View File

@@ -60,3 +60,15 @@ def test_run_auto_archiver_empty_file(caplog, autoarchiver, orchestration_file):
# should treat an empty file as if there is no file at all
assert " No URLs provided. Please provide at least one URL via the com" in caplog.text
def test_call_autoarchiver_main(caplog, monkeypatch, tmp_path):
from auto_archiver.__main__ import main
# monkey patch to change the current working directory, so that we don't use the user's real config file
monkeypatch.chdir(tmp_path)
with monkeypatch.context() as m:
m.setattr(sys, "argv", ["auto-archiver"])
with pytest.raises(SystemExit):
main()
assert "No URLs provided. Please provide at least one" in caplog.text

View File

@@ -162,4 +162,25 @@ def test_get_context():
def test_choose_most_complete():
pass
m_more = Metadata()
m_more.set_title("Title 1")
m_more.set_content("Content 1")
m_more.set_url("https://example.com")
m_less = Metadata()
m_less.set_title("Title 2")
m_less.set_content("Content 2")
m_less.set_url("https://example.com")
m_less.set_context("key", "value")
res = Metadata.choose_most_complete([m_more, m_less])
assert res.metadata.get("title") == "Title 1"
def test_choose_most_complete_from_pickles(unpickle):
# test most complete from pickles before and after an enricher has run
# Only compares length of media, not the actual media
m_before_enriching = unpickle("metadata_enricher_ytshort_input.pickle")
m_after_enriching = unpickle("metadata_enricher_ytshort_expected.pickle")
# Iterates `for r in results[1:]:`
res = Metadata.choose_most_complete([Metadata(), m_after_enriching, m_before_enriching])
assert res.media == m_after_enriching.media

View File

@@ -1,24 +1,18 @@
import sys
import pytest
from auto_archiver.core.module import get_module_lazy, BaseModule, LazyBaseModule, _LAZY_LOADED_MODULES
from auto_archiver.core.module import ModuleFactory, LazyBaseModule
from auto_archiver.core.base_module import BaseModule
@pytest.fixture
def example_module():
import auto_archiver
module_factory = ModuleFactory()
previous_path = auto_archiver.modules.__path__
auto_archiver.modules.__path__.append("tests/data/test_modules/")
module = get_module_lazy("example_module")
yield module
# cleanup
try:
del module._manifest
except AttributeError:
pass
del _LAZY_LOADED_MODULES["example_module"]
sys.modules.pop("auto_archiver.modules.example_module.example_module", None)
auto_archiver.modules.__path__ = previous_path
return module_factory.get_module_lazy("example_module")
def test_get_module_lazy(example_module):
assert example_module.name == "example_module"
@@ -46,12 +40,14 @@ def test_module_dependency_check_loads_module(example_module):
# monkey patch the manifest to include a nonexistnet dependency
example_module.manifest["dependencies"]["python"] = ["hash_enricher"]
module_factory = example_module.module_factory
loaded_module = example_module.load({})
assert loaded_module is not None
# check the dependency is loaded
assert _LAZY_LOADED_MODULES["hash_enricher"] is not None
assert _LAZY_LOADED_MODULES["hash_enricher"]._instance is not None
assert module_factory._lazy_modules["hash_enricher"] is not None
assert module_factory._lazy_modules["hash_enricher"]._instance is not None
def test_load_module(example_module):
@@ -69,7 +65,7 @@ def test_load_module(example_module):
@pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"])
def test_load_modules(module_name):
# test that specific modules can be loaded
module = get_module_lazy(module_name)
module = ModuleFactory().get_module_lazy(module_name)
assert module is not None
assert isinstance(module, LazyBaseModule)
assert module.name == module_name
@@ -86,7 +82,7 @@ def test_load_modules(module_name):
@pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"])
def test_lazy_base_module(module_name):
lazy_module = get_module_lazy(module_name)
lazy_module = ModuleFactory().get_module_lazy(module_name)
assert lazy_module is not None
assert isinstance(lazy_module, LazyBaseModule)

View File

@@ -4,7 +4,7 @@ from argparse import ArgumentParser, ArgumentTypeError
from auto_archiver.core.orchestrator import ArchivingOrchestrator
from auto_archiver.version import __version__
from auto_archiver.core.config import read_yaml, store_yaml
from auto_archiver.core.module import _LAZY_LOADED_MODULES
TEST_ORCHESTRATION = "tests/data/test_orchestration.yaml"
TEST_MODULES = "tests/data/test_modules/"
@@ -17,22 +17,7 @@ def test_args():
@pytest.fixture
def orchestrator():
yield ArchivingOrchestrator()
# hack - the loguru logger starts with one logger, but if orchestrator has run before
# it'll remove the default logger, add it back in:
from loguru import logger
if not logger._core.handlers.get(0):
logger._core.handlers_count = 0
logger.add(sys.stderr)
# and remove the custom logger
if logger._core.handlers.get(1):
logger.remove(1)
# delete out any loaded modules
_LAZY_LOADED_MODULES.clear()
return ArchivingOrchestrator()
@pytest.fixture
def basic_parser(orchestrator) -> ArgumentParser:
@@ -75,18 +60,36 @@ def test_help(orchestrator, basic_parser, capsys):
orchestrator.show_help(args)
assert exit_error.value.code == 0
assert "Usage: auto-archiver [--help] [--version] [--config CONFIG_FILE]" in capsys.readouterr().out
logs = capsys.readouterr().out
assert "Usage: auto-archiver [--help] [--version] [--config CONFIG_FILE]" in logs
# basic config options
assert "--version" in logs
# setting modules options
assert "--feeders" in logs
assert "--extractors" in logs
# authentication options
assert "--authentication" in logs
# logging options
assert "--logging.level" in logs
# individual module configs
assert "--gsheet_feeder.sheet_id" in logs
def test_add_custom_modules_path(orchestrator, test_args):
orchestrator.run(test_args)
orchestrator.setup_config(test_args)
import auto_archiver
assert "tests/data/test_modules/" in auto_archiver.modules.__path__
def test_add_custom_modules_path_invalid(orchestrator, caplog, test_args):
orchestrator.run(test_args + # we still need to load the real path to get the example_module
orchestrator.setup_config(test_args + # we still need to load the real path to get the example_module
["--module_paths", "tests/data/invalid_test_modules/"])
assert caplog.records[0].message == "Path 'tests/data/invalid_test_modules/' does not exist. Skipping..."
@@ -97,7 +100,7 @@ def test_check_required_values(orchestrator, caplog, test_args):
test_args = test_args[:-2]
with pytest.raises(SystemExit) as exit_error:
orchestrator.run(test_args)
config = orchestrator.setup_config(test_args)
assert caplog.records[1].message == "the following arguments are required: --example_module.required_field"
@@ -111,24 +114,50 @@ def test_get_required_values_from_config(orchestrator, test_args, tmp_path):
store_yaml(test_yaml, tmp_file)
# run the orchestrator
orchestrator.run(["--config", tmp_file, "--module_paths", TEST_MODULES])
assert orchestrator.config is not None
config = orchestrator.setup_config(["--config", tmp_file, "--module_paths", TEST_MODULES])
assert config is not None
def test_load_authentication_string(orchestrator, test_args):
orchestrator.run(test_args + ["--authentication", '{"facebook.com": {"username": "my_username", "password": "my_password"}}'])
assert orchestrator.config['authentication'] == {"facebook.com": {"username": "my_username", "password": "my_password"}}
config = orchestrator.setup_config(test_args + ["--authentication", '{"facebook.com": {"username": "my_username", "password": "my_password"}}'])
assert config['authentication'] == {"facebook.com": {"username": "my_username", "password": "my_password"}}
def test_load_authentication_string_concat_site(orchestrator, test_args):
orchestrator.run(test_args + ["--authentication", '{"x.com,twitter.com": {"api_key": "my_key"}}'])
assert orchestrator.config['authentication'] == {"x.com": {"api_key": "my_key"},
config = orchestrator.setup_config(test_args + ["--authentication", '{"x.com,twitter.com": {"api_key": "my_key"}}'])
assert config['authentication'] == {"x.com": {"api_key": "my_key"},
"twitter.com": {"api_key": "my_key"}}
def test_load_invalid_authentication_string(orchestrator, test_args):
with pytest.raises(ArgumentTypeError):
orchestrator.run(test_args + ["--authentication", "{\''invalid_json"])
orchestrator.setup_config(test_args + ["--authentication", "{\''invalid_json"])
def test_load_authentication_invalid_dict(orchestrator, test_args):
with pytest.raises(ArgumentTypeError):
orchestrator.run(test_args + ["--authentication", "[true, false]"])
orchestrator.setup_config(test_args + ["--authentication", "[true, false]"])
def test_load_modules_from_commandline(orchestrator, test_args):
args = test_args + ["--feeders", "example_module", "--extractors", "example_module", "--databases", "example_module", "--enrichers", "example_module", "--formatters", "example_module"]
orchestrator.setup(args)
assert len(orchestrator.feeders) == 1
assert len(orchestrator.extractors) == 1
assert len(orchestrator.databases) == 1
assert len(orchestrator.enrichers) == 1
assert len(orchestrator.formatters) == 1
assert orchestrator.feeders[0].name == "example_module"
assert orchestrator.extractors[0].name == "example_module"
assert orchestrator.databases[0].name == "example_module"
assert orchestrator.enrichers[0].name == "example_module"
assert orchestrator.formatters[0].name == "example_module"
def test_load_settings_for_module_from_commandline(orchestrator, test_args):
args = test_args + ["--feeders", "gsheet_feeder", "--gsheet_feeder.sheet_id", "123", "--gsheet_feeder.service_account", "tests/data/test_service_account.json"]
orchestrator.setup(args)
assert len(orchestrator.feeders) == 1
assert orchestrator.feeders[0].name == "gsheet_feeder"
assert orchestrator.config['gsheet_feeder']['sheet_id'] == "123"

144
tests/utils/test_misc.py Normal file
View File

@@ -0,0 +1,144 @@
import hashlib
import json
from datetime import datetime, timezone
import pytest
from auto_archiver.utils.misc import (
mkdir_if_not_exists,
expand_url,
getattr_or,
DateTimeEncoder,
dump_payload,
get_datetime_from_str,
update_nested_dict,
calculate_file_hash,
random_str,
get_timestamp
)
@pytest.fixture
def sample_file(tmp_path):
file_path = tmp_path / "test.txt"
file_path.write_text("test content")
return file_path
class TestDirectoryUtils:
def test_mkdir_creates_new_directory(self, tmp_path):
new_dir = tmp_path / "new_folder"
mkdir_if_not_exists(new_dir)
assert new_dir.exists()
assert new_dir.is_dir()
def test_mkdir_exists_quietly(self, tmp_path):
existing_dir = tmp_path / "existing"
existing_dir.mkdir()
mkdir_if_not_exists(existing_dir)
assert existing_dir.exists()
class TestURLExpansion:
@pytest.mark.parametrize("input_url,expected", [
("https://example.com", "https://example.com"),
("https://t.co/test", "https://expanded.url")
])
def test_expand_url(self, input_url, expected, mocker):
mock_response = mocker.Mock()
mock_response.url = "https://expanded.url"
mocker.patch('requests.get', return_value=mock_response)
result = expand_url(input_url)
assert result == expected
def test_expand_url_handles_errors(self, caplog, mocker):
mocker.patch('requests.get', side_effect=Exception("Connection error"))
url = "https://t.co/error"
result = expand_url(url)
assert result == url
assert f"Failed to expand url {url}" in caplog.text
class TestAttributeHandling:
class Sample:
exists = "value"
none = None
@pytest.mark.parametrize("obj,attr,default,expected", [
(Sample(), "exists", "default", "value"),
(Sample(), "none", "default", "default"),
(Sample(), "missing", "default", "default"),
(None, "anything", "fallback", "fallback"),
])
def test_getattr_or(self, obj, attr, default, expected):
# Test gets attribute or returns a default value
assert getattr_or(obj, attr, default) == expected
class TestDateTimeHandling:
def test_datetime_encoder(self, sample_datetime):
result = json.dumps({"dt": sample_datetime}, cls=DateTimeEncoder)
loaded = json.loads(result)
assert loaded["dt"] == str(sample_datetime)
def test_dump_payload(self, sample_datetime):
payload = {"timestamp": sample_datetime}
result = dump_payload(payload)
assert str(sample_datetime) in result
@pytest.mark.parametrize("dt_str,fmt,expected", [
("2023-01-01 12:00:00+00:00", None, datetime(2023, 1, 1, 12, 0, tzinfo=timezone.utc)),
("20230101 120000", "%Y%m%d %H%M%S", datetime(2023, 1, 1, 12, 0)),
("invalid", None, None),
])
def test_datetime_from_string(self, dt_str, fmt, expected):
result = get_datetime_from_str(dt_str, fmt)
if expected is None:
assert result is None
else:
assert result == expected.replace(tzinfo=result.tzinfo)
class TestDictUtils:
@pytest.mark.parametrize("original,update,expected", [
({"a": 1}, {"b": 2}, {"a": 1, "b": 2}),
({"nested": {"a": 1}}, {"nested": {"b": 2}}, {"nested": {"a": 1, "b": 2}}),
({"a": {"b": {"c": 1}}}, {"a": {"b": {"c": 2}}}, {"a": {"b": {"c": 2}}}),
])
def test_update_nested_dict(self, original, update, expected):
update_nested_dict(original, update)
assert original == expected
class TestHashingUtils:
def test_file_hashing(self, sample_file):
expected = hashlib.sha256(b"test content").hexdigest()
assert calculate_file_hash(str(sample_file)) == expected
def test_large_file_hashing(self, tmp_path):
file_path = tmp_path / "large.bin"
content = b"0" * 16_000_000 * 2 # 32MB
file_path.write_bytes(content)
expected = hashlib.sha256(content).hexdigest()
assert calculate_file_hash(str(file_path)) == expected
class TestMiscUtils:
def test_random_str_length(self):
for length in [8, 16, 32]:
assert len(random_str(length)) == length
def test_random_str_raises_too_long(self):
with pytest.raises(AssertionError) as exc_info:
random_str(64)
assert "length must be less than 32 as UUID4 is used" == str(exc_info.value)
def test_random_str_uniqueness(self):
assert random_str() != random_str()
@pytest.mark.parametrize("ts_input,utc,iso,expected_type", [
(datetime.now(), True, True, str),
("2023-01-01T12:00:00+00:00", False, False, datetime),
(1672574400, True, True, str),
])
def test_timestamp_parsing(self, ts_input, utc, iso, expected_type):
result = get_timestamp(ts_input, utc=utc, iso=iso)
assert isinstance(result, expected_type)
def test_invalid_timestamp_returns_none(self):
assert get_timestamp("invalid-date") is None