mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-07 19:08:30 +03:00
Add documentation, pre-commit hook, more make commands and
This commit is contained in:
@@ -1,7 +1,10 @@
|
||||
# Run Ruff formatter on commits.
|
||||
repos:
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.9.10
|
||||
hooks:
|
||||
- id: ruff
|
||||
# args: [ --fix ]
|
||||
- id: ruff-format
|
||||
|
||||
# Runs Ruff linting - just checks without fixing, but blocks commit if errors are found.
|
||||
# - id: ruff
|
||||
# args: ["--output-format=concise"]
|
||||
36
Makefile
36
Makefile
@@ -9,34 +9,54 @@ help:
|
||||
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||
@echo "Additional Commands:"
|
||||
@echo " make test - Run all tests in 'tests/' with pytest"
|
||||
@echo " make lint - Run ruff linter and auto-fix issues"
|
||||
@echo " make ruff-check - Run Ruff linting and formatting checks (safe)"
|
||||
@echo " make ruff-clean - Auto-fix Ruff linting and formatting issues"
|
||||
@echo " make docs - Generate documentation (same as 'make html')"
|
||||
@echo " make clean_docs - Remove generated docs"
|
||||
@echo " make clean-docs - Remove generated docs"
|
||||
@echo " make docker-run - Run the Docker container"
|
||||
@echo " make show-docs - Build and open the documentation in a browser"
|
||||
|
||||
|
||||
|
||||
.PHONY: test
|
||||
test:
|
||||
@echo "Running tests..."
|
||||
@pytest tests --disable-warnings
|
||||
|
||||
.PHONY: lint
|
||||
lint:
|
||||
@echo "Linting with ruff..."
|
||||
@ruff check --fix .
|
||||
|
||||
.PHONY: ruff-check
|
||||
ruff-check:
|
||||
@echo "Checking code style with Ruff (safe)..."
|
||||
@ruff check .
|
||||
|
||||
|
||||
.PHONY: ruff-clean
|
||||
ruff-clean:
|
||||
@echo "Fixing lint and formatting issues with Ruff..."
|
||||
@ruff check . --fix
|
||||
@ruff format .
|
||||
|
||||
|
||||
.PHONY: docs
|
||||
docs:
|
||||
@echo "Building documentation..."
|
||||
@$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)"
|
||||
|
||||
.PHONY: clean_docs
|
||||
clean_docs:
|
||||
|
||||
.PHONY: clean-docs
|
||||
clean-docs:
|
||||
@echo "Cleaning up generated documentation files..."
|
||||
@$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||
@rm -rf "$(SOURCEDIR)/autoapi/" "$(SOURCEDIR)/modules/autogen/"
|
||||
@echo "Cleanup complete."
|
||||
|
||||
|
||||
.PHONY: show-docs
|
||||
show-docs:
|
||||
@echo "Opening documentation in browser..."
|
||||
@open "$(BUILDDIR)/html/index.html"
|
||||
|
||||
|
||||
# Run Docker with default settings
|
||||
.PHONY: docker-run
|
||||
docker-run:
|
||||
|
||||
@@ -32,4 +32,5 @@ testing
|
||||
docs
|
||||
release
|
||||
settings_page
|
||||
style_guide
|
||||
```
|
||||
39
docs/source/development/style_guide.md
Normal file
39
docs/source/development/style_guide.md
Normal file
@@ -0,0 +1,39 @@
|
||||
### Style Guide
|
||||
|
||||
The project uses [ruff](https://docs.astral.sh/ruff/) for linting and formatting.
|
||||
Our style configurations are set in the `pyproject.toml` file.
|
||||
|
||||
We have a pre-commit hook to run the formatter before you commit, but Ruff can also be [integrated with most editors](https://docs.astral.sh/ruff/editors/setup/) to run automatically.
|
||||
|
||||
We recommend you also run the linter before pushing code.
|
||||
|
||||
#### Running the linter
|
||||
|
||||
We provide Makefile commands for common tasks (note: on Windows you may need to install `make` first, or you can run ruff directly):
|
||||
|
||||
This outputs a report of any issues found:
|
||||
```shell
|
||||
make ruff-check
|
||||
```
|
||||
|
||||
This command will attempt to fix any issues it can:
|
||||
|
||||
⚠️ Warning: This can cause breaking changes. ⚠️
|
||||
|
||||
Review every modification this command makes before committing.
|
||||
```shell
|
||||
make ruff-clean
|
||||
```
|
||||
|
||||
**Note:** If you're on Windows you might not have `make` installed by default.
|
||||
This is included with [Git for Windows](https://gitforwindows.org/) or you can install make via [Chocolatey](https://chocolatey.org/):
|
||||
```shell
|
||||
choco install make
|
||||
```
|
||||
|
||||
**Running directly with ruff**
|
||||
|
||||
Alternatively, you can run the commands directly with ruff.
|
||||
|
||||
Our rules are quite lenient for general usage; if you want more rigorous checks, see the [ruff documentation](https://docs.astral.sh/ruff/configuration/) for the rules you can enable.
|
||||
You can then run checks to see more nuanced errors which you can review manually.
|
||||
129
poetry.lock
generated
129
poetry.lock
generated
@@ -481,6 +481,18 @@ files = [
|
||||
[package.dependencies]
|
||||
pycparser = "*"
|
||||
|
||||
[[package]]
|
||||
name = "cfgv"
|
||||
version = "3.4.0"
|
||||
description = "Validate configuration and produce human readable error messages."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["dev"]
|
||||
files = [
|
||||
{file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"},
|
||||
{file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "charset-normalizer"
|
||||
version = "3.4.1"
|
||||
@@ -696,6 +708,18 @@ calendars = ["convertdate (>=2.2.1)", "hijridate"]
|
||||
fasttext = ["fasttext (>=0.9.1)", "numpy (>=1.19.3,<2)"]
|
||||
langdetect = ["langdetect (>=1.0.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "distlib"
|
||||
version = "0.3.9"
|
||||
description = "Distribution utilities"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
groups = ["dev"]
|
||||
files = [
|
||||
{file = "distlib-0.3.9-py2.py3-none-any.whl", hash = "sha256:47f8c22fd27c27e25a65601af709b38e4f0a45ea4fc2e710f65755fa8caaaf87"},
|
||||
{file = "distlib-0.3.9.tar.gz", hash = "sha256:a60f20dea646b8a33f3e7772f74dc0b2d0772d2837ee1342a00645c81edf9403"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "docutils"
|
||||
version = "0.21.2"
|
||||
@@ -742,6 +766,23 @@ future = "*"
|
||||
[package.extras]
|
||||
dev = ["Sphinx (==2.1.0)", "future (==0.17.1)", "numpy (==1.16.4)", "pytest (==4.6.1)", "pytest-mock (==1.10.4)", "tox (==3.12.1)"]
|
||||
|
||||
[[package]]
|
||||
name = "filelock"
|
||||
version = "3.17.0"
|
||||
description = "A platform independent file lock."
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["dev"]
|
||||
files = [
|
||||
{file = "filelock-3.17.0-py3-none-any.whl", hash = "sha256:533dc2f7ba78dc2f0f531fc6c4940addf7b70a481e269a5a3b93be94ffbe8338"},
|
||||
{file = "filelock-3.17.0.tar.gz", hash = "sha256:ee4e77401ef576ebb38cd7f13b9b28893194acc20a8e68e18730ba9c0e54660e"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
docs = ["furo (>=2024.8.6)", "sphinx (>=8.1.3)", "sphinx-autodoc-typehints (>=3)"]
|
||||
testing = ["covdefaults (>=2.3)", "coverage (>=7.6.10)", "diff-cover (>=9.2.1)", "pytest (>=8.3.4)", "pytest-asyncio (>=0.25.2)", "pytest-cov (>=6)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.28.1)"]
|
||||
typing = ["typing-extensions (>=4.12.2)"]
|
||||
|
||||
[[package]]
|
||||
name = "future"
|
||||
version = "1.0.0"
|
||||
@@ -919,6 +960,21 @@ files = [
|
||||
[package.dependencies]
|
||||
pyparsing = {version = ">=2.4.2,<3.0.0 || >3.0.0,<3.0.1 || >3.0.1,<3.0.2 || >3.0.2,<3.0.3 || >3.0.3,<4", markers = "python_version > \"3.0\""}
|
||||
|
||||
[[package]]
|
||||
name = "identify"
|
||||
version = "2.6.9"
|
||||
description = "File identification library for Python"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["dev"]
|
||||
files = [
|
||||
{file = "identify-2.6.9-py2.py3-none-any.whl", hash = "sha256:c98b4322da415a8e5a70ff6e51fbc2d2932c015532d77e9f8537b4ba7813b150"},
|
||||
{file = "identify-2.6.9.tar.gz", hash = "sha256:d40dfe3142a1421d8518e3d3985ef5ac42890683e32306ad614a29490abeb6bf"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
license = ["ukkonen"]
|
||||
|
||||
[[package]]
|
||||
name = "idna"
|
||||
version = "3.10"
|
||||
@@ -1260,6 +1316,18 @@ rtd = ["ipython", "sphinx (>=7)", "sphinx-autodoc2 (>=0.5.0,<0.6.0)", "sphinx-bo
|
||||
testing = ["beautifulsoup4", "coverage[toml]", "defusedxml", "pygments (<2.19)", "pytest (>=8,<9)", "pytest-cov", "pytest-param-files (>=0.6.0,<0.7.0)", "pytest-regressions", "sphinx-pytest"]
|
||||
testing-docutils = ["pygments", "pytest (>=8,<9)", "pytest-param-files (>=0.6.0,<0.7.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "nodeenv"
|
||||
version = "1.9.1"
|
||||
description = "Node.js virtual environment builder"
|
||||
optional = false
|
||||
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
|
||||
groups = ["dev"]
|
||||
files = [
|
||||
{file = "nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9"},
|
||||
{file = "nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "numpy"
|
||||
version = "2.1.3"
|
||||
@@ -1513,6 +1581,23 @@ tests = ["check-manifest", "coverage (>=7.4.2)", "defusedxml", "markdown2", "ole
|
||||
typing = ["typing-extensions"]
|
||||
xmp = ["defusedxml"]
|
||||
|
||||
[[package]]
|
||||
name = "platformdirs"
|
||||
version = "4.3.6"
|
||||
description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["dev"]
|
||||
files = [
|
||||
{file = "platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb"},
|
||||
{file = "platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4)"]
|
||||
test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"]
|
||||
type = ["mypy (>=1.11.2)"]
|
||||
|
||||
[[package]]
|
||||
name = "pluggy"
|
||||
version = "1.5.0"
|
||||
@@ -1529,6 +1614,25 @@ files = [
|
||||
dev = ["pre-commit", "tox"]
|
||||
testing = ["pytest", "pytest-benchmark"]
|
||||
|
||||
[[package]]
|
||||
name = "pre-commit"
|
||||
version = "4.1.0"
|
||||
description = "A framework for managing and maintaining multi-language pre-commit hooks."
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["dev"]
|
||||
files = [
|
||||
{file = "pre_commit-4.1.0-py2.py3-none-any.whl", hash = "sha256:d29e7cb346295bcc1cc75fc3e92e343495e3ea0196c9ec6ba53f49f10ab6ae7b"},
|
||||
{file = "pre_commit-4.1.0.tar.gz", hash = "sha256:ae3f018575a588e30dfddfab9a05448bfbd6b73d78709617b5a2b853549716d4"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
cfgv = ">=2.0.0"
|
||||
identify = ">=1.0.0"
|
||||
nodeenv = ">=0.11.1"
|
||||
pyyaml = ">=5.1"
|
||||
virtualenv = ">=20.10.0"
|
||||
|
||||
[[package]]
|
||||
name = "proto-plus"
|
||||
version = "1.26.0"
|
||||
@@ -1902,7 +2006,7 @@ version = "6.0.2"
|
||||
description = "YAML parser and emitter for Python"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["docs"]
|
||||
groups = ["dev", "docs"]
|
||||
files = [
|
||||
{file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"},
|
||||
{file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"},
|
||||
@@ -2911,6 +3015,27 @@ typing-extensions = {version = ">=4.0", markers = "python_version < \"3.11\""}
|
||||
[package.extras]
|
||||
standard = ["colorama (>=0.4)", "httptools (>=0.6.3)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1)", "watchfiles (>=0.13)", "websockets (>=10.4)"]
|
||||
|
||||
[[package]]
|
||||
name = "virtualenv"
|
||||
version = "20.29.3"
|
||||
description = "Virtual Python Environment builder"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["dev"]
|
||||
files = [
|
||||
{file = "virtualenv-20.29.3-py3-none-any.whl", hash = "sha256:3e3d00f5807e83b234dfb6122bf37cfadf4be216c53a49ac059d02414f819170"},
|
||||
{file = "virtualenv-20.29.3.tar.gz", hash = "sha256:95e39403fcf3940ac45bc717597dba16110b74506131845d9b687d5e73d947ac"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
distlib = ">=0.3.7,<1"
|
||||
filelock = ">=3.12.2,<4"
|
||||
platformdirs = ">=3.9.1,<5"
|
||||
|
||||
[package.extras]
|
||||
docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
|
||||
test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"]
|
||||
|
||||
[[package]]
|
||||
name = "vk-api"
|
||||
version = "11.9.9"
|
||||
@@ -3213,4 +3338,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"]
|
||||
[metadata]
|
||||
lock-version = "2.1"
|
||||
python-versions = ">=3.10,<3.13"
|
||||
content-hash = "0feae518c3a51717bd80e90eea3cd3ed53925af656f00b662c856bae38a742bb"
|
||||
content-hash = "fbd6cdff4eb38021115a8cd361df7c292733028822f92f45cb667971c4bce901"
|
||||
|
||||
@@ -65,6 +65,7 @@ autopep8 = "^2.3.1"
|
||||
pytest-loguru = "^0.4.0"
|
||||
pytest-mock = "^3.14.0"
|
||||
ruff = "^0.9.10"
|
||||
pre-commit = "^4.1.0"
|
||||
|
||||
[tool.poetry.group.docs.dependencies]
|
||||
sphinx = "^8.1.3"
|
||||
@@ -96,23 +97,23 @@ markers = [
|
||||
#exclude = ["docs"]
|
||||
line-length = 120
|
||||
# Remove this for a more detailed lint report
|
||||
#output-format = "concise"
|
||||
output-format = "concise"
|
||||
|
||||
|
||||
[tool.ruff.lint]
|
||||
#add bugbear?
|
||||
# I : isort
|
||||
# UP : upgrade, e.g. use fstrings
|
||||
# ANN : annotations
|
||||
extend-select = ["B"]
|
||||
# Extend the rules to check for by adding them to this option:
|
||||
# See documentation for more details: https://docs.astral.sh/ruff/rules/
|
||||
#extend-select = ["B"]
|
||||
|
||||
# Ignore unused imports as some are currently required for lazy loading
|
||||
# This can be removed for a `lint check` run which is manually reviewed
|
||||
ignore = ["F401"]
|
||||
# This can be removed for a `ruff check` run which is manually reviewed
|
||||
#ignore = ["F401"]
|
||||
|
||||
[tool.ruff.lint.per-file-ignores]
|
||||
# Ignore import violations in __init__.py files
|
||||
"__init__.py" = ["F401", "F403"]
|
||||
# Ignore 'useless expression' in manifest files.
|
||||
"__manifest__.py" = ["B018"]
|
||||
|
||||
[tool.ruff.format]
|
||||
docstring-code-format = false
|
||||
|
||||
@@ -1,25 +1,19 @@
|
||||
class SetupError(ValueError):
|
||||
pass
|
||||
|
||||
MODULE_TYPES = [
|
||||
'feeder',
|
||||
'extractor',
|
||||
'enricher',
|
||||
'database',
|
||||
'storage',
|
||||
'formatter'
|
||||
]
|
||||
|
||||
MODULE_TYPES = ["feeder", "extractor", "enricher", "database", "storage", "formatter"]
|
||||
|
||||
MANIFEST_FILE = "__manifest__.py"
|
||||
|
||||
DEFAULT_MANIFEST = {
|
||||
'name': '', # the display name of the module
|
||||
'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
|
||||
'type': [], # the type of the module, can be one or more of MODULE_TYPES
|
||||
'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional software
|
||||
'description': '', # a description of the module
|
||||
'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
|
||||
'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
|
||||
'version': '1.0', # the version of the module
|
||||
'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
|
||||
"name": "", # the display name of the module
|
||||
"author": "Bellingcat", # creator of the module, leave this as Bellingcat or set your own name!
|
||||
"type": [], # the type of the module, can be one or more of MODULE_TYPES
|
||||
"requires_setup": True, # whether or not this module requires additional setup such as setting API Keys or installing additional software
|
||||
"description": "", # a description of the module
|
||||
"dependencies": {}, # external dependencies, e.g. python packages or binaries, in dictionary format
|
||||
"entry_point": "", # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
|
||||
"version": "1.0", # the version of the module
|
||||
"configs": {}, # any configuration options this module has, these will be exposed to the user in the config file or via the command line
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
""" Orchestrates all archiving steps, including feeding items,
|
||||
archiving them with specific archivers, enrichment, storage,
|
||||
formatting, database operations and clean up.
|
||||
"""Orchestrates all archiving steps, including feeding items,
|
||||
archiving them with specific archivers, enrichment, storage,
|
||||
formatting, database operations and clean up.
|
||||
|
||||
"""
|
||||
|
||||
@@ -19,8 +19,17 @@ import requests
|
||||
|
||||
from .metadata import Metadata, Media
|
||||
from auto_archiver.version import __version__
|
||||
from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, is_valid_config, \
|
||||
DefaultValidatingParser, UniqueAppendAction, AuthenticationJsonParseAction, DEFAULT_CONFIG_FILE
|
||||
from .config import (
|
||||
read_yaml,
|
||||
store_yaml,
|
||||
to_dot_notation,
|
||||
merge_dicts,
|
||||
is_valid_config,
|
||||
DefaultValidatingParser,
|
||||
UniqueAppendAction,
|
||||
AuthenticationJsonParseAction,
|
||||
DEFAULT_CONFIG_FILE,
|
||||
)
|
||||
from .module import ModuleFactory, LazyBaseModule
|
||||
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
|
||||
from .consts import MODULE_TYPES, SetupError
|
||||
@@ -30,8 +39,8 @@ if TYPE_CHECKING:
|
||||
from .base_module import BaseModule
|
||||
from .module import LazyBaseModule
|
||||
|
||||
class ArchivingOrchestrator:
|
||||
|
||||
class ArchivingOrchestrator:
|
||||
# instance variables
|
||||
module_factory: ModuleFactory
|
||||
setup_finished: bool
|
||||
@@ -61,30 +70,63 @@ class ArchivingOrchestrator:
|
||||
epilog="Check the code at https://github.com/bellingcat/auto-archiver",
|
||||
formatter_class=RichHelpFormatter,
|
||||
)
|
||||
parser.add_argument('--help', '-h', action='store_true', dest='help', help='show a full help message and exit')
|
||||
parser.add_argument('--version', action='version', version=__version__)
|
||||
parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE)
|
||||
parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple')
|
||||
parser.add_argument("--help", "-h", action="store_true", dest="help", help="show a full help message and exit")
|
||||
parser.add_argument("--version", action="version", version=__version__)
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
action="store",
|
||||
dest="config_file",
|
||||
help="the filename of the YAML configuration file (defaults to 'config.yaml')",
|
||||
default=DEFAULT_CONFIG_FILE,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mode",
|
||||
action="store",
|
||||
dest="mode",
|
||||
type=str,
|
||||
choices=["simple", "full"],
|
||||
help="the mode to run the archiver in",
|
||||
default="simple",
|
||||
)
|
||||
# override the default 'help' so we can inject all the configs and show those
|
||||
parser.add_argument('-s', '--store', dest='store', default=False, help='Store the created config in the config file', action=argparse.BooleanOptionalAction)
|
||||
parser.add_argument('--module_paths', dest='module_paths', nargs='+', default=[], help='additional paths to search for modules', action=UniqueAppendAction)
|
||||
parser.add_argument(
|
||||
"-s",
|
||||
"--store",
|
||||
dest="store",
|
||||
default=False,
|
||||
help="Store the created config in the config file",
|
||||
action=argparse.BooleanOptionalAction,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--module_paths",
|
||||
dest="module_paths",
|
||||
nargs="+",
|
||||
default=[],
|
||||
help="additional paths to search for modules",
|
||||
action=UniqueAppendAction,
|
||||
)
|
||||
|
||||
self.basic_parser = parser
|
||||
return parser
|
||||
|
||||
|
||||
def check_steps(self, config):
|
||||
for module_type in MODULE_TYPES:
|
||||
if not config['steps'].get(f"{module_type}s", []):
|
||||
if module_type == 'feeder' or module_type == 'formatter' and config['steps'].get(f"{module_type}"):
|
||||
raise SetupError(f"It appears you have '{module_type}' set under 'steps' in your configuration file, but as of version 0.13.0 of Auto Archiver, you must use '{module_type}s'. Change this in your configuration file and try again. \
|
||||
Here's how that would look: \n\nsteps:\n {module_type}s:\n - [your_{module_type}_name_here]\n {'extractors:...' if module_type == 'feeder' else '...'}\n")
|
||||
if module_type == 'extractor' and config['steps'].get('archivers'):
|
||||
raise SetupError("As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. \
|
||||
Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_here]\n enrichers:...\n")
|
||||
raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)")
|
||||
if not config["steps"].get(f"{module_type}s", []):
|
||||
if module_type == "feeder" or module_type == "formatter" and config["steps"].get(f"{module_type}"):
|
||||
raise SetupError(
|
||||
f"It appears you have '{module_type}' set under 'steps' in your configuration file, but as of version 0.13.0 of Auto Archiver, you must use '{module_type}s'. Change this in your configuration file and try again. \
|
||||
Here's how that would look: \n\nsteps:\n {module_type}s:\n - [your_{module_type}_name_here]\n {'extractors:...' if module_type == 'feeder' else '...'}\n"
|
||||
)
|
||||
if module_type == "extractor" and config["steps"].get("archivers"):
|
||||
raise SetupError(
|
||||
"As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. \
|
||||
Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_here]\n enrichers:...\n"
|
||||
)
|
||||
raise SetupError(
|
||||
f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
|
||||
)
|
||||
|
||||
def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:
|
||||
|
||||
# modules parser to get the overridden 'steps' values
|
||||
modules_parser = argparse.ArgumentParser(
|
||||
add_help=False,
|
||||
@@ -92,7 +134,9 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
self.add_modules_args(modules_parser)
|
||||
cli_modules, unused_args = modules_parser.parse_known_args(unused_args)
|
||||
for module_type in MODULE_TYPES:
|
||||
yaml_config['steps'][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config['steps'].get(f"{module_type}s", [])
|
||||
yaml_config["steps"][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config[
|
||||
"steps"
|
||||
].get(f"{module_type}s", [])
|
||||
|
||||
parser = DefaultValidatingParser(
|
||||
add_help=False,
|
||||
@@ -115,30 +159,32 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
enabled_modules = []
|
||||
# first loads the modules from the config file, then from the command line
|
||||
for module_type in MODULE_TYPES:
|
||||
enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))
|
||||
enabled_modules.extend(yaml_config["steps"].get(f"{module_type}s", []))
|
||||
|
||||
# clear out duplicates, but keep the order
|
||||
enabled_modules = list(dict.fromkeys(enabled_modules))
|
||||
avail_modules = self.module_factory.available_modules(limit_to_modules=enabled_modules, suppress_warnings=True)
|
||||
avail_modules = self.module_factory.available_modules(
|
||||
limit_to_modules=enabled_modules, suppress_warnings=True
|
||||
)
|
||||
self.add_individual_module_args(avail_modules, parser)
|
||||
elif basic_config.mode == 'simple':
|
||||
elif basic_config.mode == "simple":
|
||||
simple_modules = [module for module in self.module_factory.available_modules() if not module.requires_setup]
|
||||
self.add_individual_module_args(simple_modules, parser)
|
||||
|
||||
# add them to the config
|
||||
for module in simple_modules:
|
||||
for module_type in module.type:
|
||||
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
|
||||
yaml_config["steps"].setdefault(f"{module_type}s", []).append(module.name)
|
||||
else:
|
||||
# load all modules, they're not using the 'simple' mode
|
||||
all_modules = self.module_factory.available_modules()
|
||||
# add all the modules to the steps
|
||||
for module in all_modules:
|
||||
for module_type in module.type:
|
||||
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
|
||||
yaml_config["steps"].setdefault(f"{module_type}s", []).append(module.name)
|
||||
|
||||
self.add_individual_module_args(all_modules, parser)
|
||||
|
||||
|
||||
parser.set_defaults(**to_dot_notation(yaml_config))
|
||||
|
||||
# reload the parser with the new arguments, now that we have them
|
||||
@@ -164,43 +210,76 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
store_yaml(config, basic_config.config_file)
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def add_modules_args(self, parser: argparse.ArgumentParser = None):
|
||||
if not parser:
|
||||
parser = self.parser
|
||||
|
||||
# Module loading from the command line
|
||||
for module_type in MODULE_TYPES:
|
||||
parser.add_argument(f'--{module_type}s', dest=f'{module_type}s', nargs='+', help=f'the {module_type}s to use', default=[], action=UniqueAppendAction)
|
||||
parser.add_argument(
|
||||
f"--{module_type}s",
|
||||
dest=f"{module_type}s",
|
||||
nargs="+",
|
||||
help=f"the {module_type}s to use",
|
||||
default=[],
|
||||
action=UniqueAppendAction,
|
||||
)
|
||||
|
||||
def add_additional_args(self, parser: argparse.ArgumentParser = None):
|
||||
if not parser:
|
||||
parser = self.parser
|
||||
|
||||
parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
|
||||
parser.add_argument(
|
||||
"--authentication",
|
||||
dest="authentication",
|
||||
help="A dictionary of sites and their authentication methods \
|
||||
(token, username etc.) that extractors can use to log into \
|
||||
a website. If passing this on the command line, use a JSON string. \
|
||||
You may also pass a path to a valid JSON/YAML file which will be parsed.',
|
||||
default={},
|
||||
nargs="?",
|
||||
action=AuthenticationJsonParseAction)
|
||||
You may also pass a path to a valid JSON/YAML file which will be parsed.",
|
||||
default={},
|
||||
nargs="?",
|
||||
action=AuthenticationJsonParseAction,
|
||||
)
|
||||
|
||||
# logging arguments
|
||||
parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO', type=str.upper)
|
||||
parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
|
||||
parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
|
||||
|
||||
def add_individual_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
|
||||
parser.add_argument(
|
||||
"--logging.level",
|
||||
action="store",
|
||||
dest="logging.level",
|
||||
choices=["INFO", "DEBUG", "ERROR", "WARNING"],
|
||||
help="the logging level to use",
|
||||
default="INFO",
|
||||
type=str.upper,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--logging.file", action="store", dest="logging.file", help="the logging file to write to", default=None
|
||||
)
|
||||
parser.add_argument(
|
||||
"--logging.rotation",
|
||||
action="store",
|
||||
dest="logging.rotation",
|
||||
help="the logging rotation to use",
|
||||
default=None,
|
||||
)
|
||||
|
||||
def add_individual_module_args(
|
||||
self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None
|
||||
) -> None:
|
||||
if not modules:
|
||||
modules = self.module_factory.available_modules()
|
||||
|
||||
|
||||
for module in modules:
|
||||
if module.name == 'cli_feeder':
|
||||
if module.name == "cli_feeder":
|
||||
# special case. For the CLI feeder, allow passing URLs directly on the command line without setting --cli_feeder.urls=
|
||||
parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
|
||||
parser.add_argument(
|
||||
"urls",
|
||||
nargs="*",
|
||||
default=[],
|
||||
help="URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
|
||||
)
|
||||
continue
|
||||
|
||||
|
||||
if not module.configs:
|
||||
# this module has no configs, don't show anything in the help
|
||||
# (TODO: do we want to show something about this module though, like a description?)
|
||||
@@ -209,21 +288,21 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...")
|
||||
|
||||
for name, kwargs in module.configs.items():
|
||||
if not kwargs.get('metavar', None):
|
||||
if not kwargs.get("metavar", None):
|
||||
# make a nicer metavar, metavar is what's used in the help, e.g. --cli_feeder.urls [METAVAR]
|
||||
kwargs['metavar'] = name.upper()
|
||||
kwargs["metavar"] = name.upper()
|
||||
|
||||
if kwargs.get('required', False):
|
||||
if kwargs.get("required", False):
|
||||
# required args shouldn't have a 'default' value, remove it
|
||||
kwargs.pop('default', None)
|
||||
kwargs.pop("default", None)
|
||||
|
||||
kwargs.pop('cli_set', None)
|
||||
should_store = kwargs.pop('should_store', False)
|
||||
kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}"
|
||||
kwargs.pop("cli_set", None)
|
||||
should_store = kwargs.pop("should_store", False)
|
||||
kwargs["dest"] = f"{module.name}.{kwargs.pop('dest', name)}"
|
||||
try:
|
||||
kwargs['type'] = getattr(validators, kwargs.get('type', '__invalid__'))
|
||||
kwargs["type"] = getattr(validators, kwargs.get("type", "__invalid__"))
|
||||
except AttributeError:
|
||||
kwargs['type'] = __builtins__.get(kwargs.get('type'), str)
|
||||
kwargs["type"] = __builtins__.get(kwargs.get("type"), str)
|
||||
arg = group.add_argument(f"--{module.name}.{name}", **kwargs)
|
||||
arg.should_store = should_store
|
||||
|
||||
@@ -238,12 +317,11 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
self.basic_parser.exit()
|
||||
|
||||
def setup_logging(self, config):
|
||||
logging_config = config["logging"]
|
||||
|
||||
logging_config = config['logging']
|
||||
|
||||
if logging_config.get('enabled', True) is False:
|
||||
if logging_config.get("enabled", True) is False:
|
||||
# disabled logging settings, they're set on a higher level
|
||||
logger.disable('auto_archiver')
|
||||
logger.disable("auto_archiver")
|
||||
return
|
||||
|
||||
# setup loguru logging
|
||||
@@ -253,38 +331,45 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
pass
|
||||
|
||||
# add other logging info
|
||||
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
|
||||
self.logger_id = logger.add(sys.stderr, level=logging_config['level'])
|
||||
if log_file := logging_config['file']:
|
||||
logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
|
||||
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
|
||||
self.logger_id = logger.add(sys.stderr, level=logging_config["level"])
|
||||
if log_file := logging_config["file"]:
|
||||
logger.add(log_file) if not logging_config["rotation"] else logger.add(
|
||||
log_file, rotation=logging_config["rotation"]
|
||||
)
|
||||
|
||||
def install_modules(self, modules_by_type):
|
||||
"""
|
||||
Traverses all modules in 'steps' and loads them into the orchestrator, storing them in the
|
||||
Traverses all modules in 'steps' and loads them into the orchestrator, storing them in the
|
||||
orchestrator's attributes (self.feeders, self.extractors etc.). If no modules of a certain type
|
||||
are loaded, the program will exit with an error message.
|
||||
"""
|
||||
|
||||
invalid_modules = []
|
||||
for module_type in MODULE_TYPES:
|
||||
|
||||
step_items = []
|
||||
modules_to_load = modules_by_type[f"{module_type}s"]
|
||||
if not modules_to_load:
|
||||
raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)")
|
||||
raise SetupError(
|
||||
f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
|
||||
)
|
||||
|
||||
def check_steps_ok():
|
||||
if not len(step_items):
|
||||
if len(modules_to_load):
|
||||
logger.error(f"Unable to load any {module_type}s. Tried the following, but none were available: {modules_to_load}")
|
||||
raise SetupError(f"NO {module_type.upper()}S LOADED. Please check your configuration and try again.")
|
||||
|
||||
logger.error(
|
||||
f"Unable to load any {module_type}s. Tried the following, but none were available: {modules_to_load}"
|
||||
)
|
||||
raise SetupError(
|
||||
f"NO {module_type.upper()}S LOADED. Please check your configuration and try again."
|
||||
)
|
||||
|
||||
if (module_type == 'feeder' or module_type == 'formatter') and len(step_items) > 1:
|
||||
raise SetupError(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
|
||||
if (module_type == "feeder" or module_type == "formatter") and len(step_items) > 1:
|
||||
raise SetupError(
|
||||
f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}"
|
||||
)
|
||||
|
||||
for module in modules_to_load:
|
||||
|
||||
if module in invalid_modules:
|
||||
continue
|
||||
|
||||
@@ -293,7 +378,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
|
||||
except (KeyboardInterrupt, Exception) as e:
|
||||
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
|
||||
if loaded_module and module_type == 'extractor':
|
||||
if loaded_module and module_type == "extractor":
|
||||
loaded_module.cleanup()
|
||||
raise e
|
||||
|
||||
@@ -308,11 +393,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
|
||||
def load_config(self, config_file: str) -> dict:
|
||||
if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE:
|
||||
logger.error(f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
|
||||
logger.error(
|
||||
f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings."
|
||||
)
|
||||
raise FileNotFoundError(f"Configuration file {config_file} not found")
|
||||
|
||||
return read_yaml(config_file)
|
||||
|
||||
|
||||
def setup_config(self, args: list) -> dict:
|
||||
"""
|
||||
Sets up the configuration file, merging the default config with the user's config
|
||||
@@ -335,13 +422,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
yaml_config = self.load_config(basic_config.config_file)
|
||||
|
||||
return self.setup_complete_parser(basic_config, yaml_config, unused_args)
|
||||
|
||||
|
||||
def check_for_updates(self):
|
||||
response = requests.get("https://pypi.org/pypi/auto-archiver/json").json()
|
||||
latest_version = response['info']['version']
|
||||
latest_version = response["info"]["version"]
|
||||
# check version compared to current version
|
||||
if latest_version != __version__:
|
||||
if os.environ.get('RUNNING_IN_DOCKER'):
|
||||
if os.environ.get("RUNNING_IN_DOCKER"):
|
||||
update_cmd = "`docker pull bellingcat/auto-archiver:latest`"
|
||||
else:
|
||||
update_cmd = "`pip install --upgrade auto-archiver`"
|
||||
@@ -351,33 +438,36 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
logger.warning(f"Make sure to update to the latest version using: {update_cmd}")
|
||||
logger.warning("")
|
||||
|
||||
|
||||
def setup(self, args: list):
|
||||
"""
|
||||
Function to configure all setup of the orchestrator: setup configs and load modules.
|
||||
|
||||
|
||||
This method should only ever be called once
|
||||
"""
|
||||
|
||||
self.check_for_updates()
|
||||
|
||||
if self.setup_finished:
|
||||
logger.warning("The `setup_config()` function should only ever be run once. \
|
||||
logger.warning(
|
||||
"The `setup_config()` function should only ever be run once. \
|
||||
If you need to re-run the setup, please re-instantiate a new instance of the orchestrator. \
|
||||
For code implementatations, you should call .setup_config() once then you may call .feed() \
|
||||
multiple times to archive multiple URLs.")
|
||||
multiple times to archive multiple URLs."
|
||||
)
|
||||
return
|
||||
|
||||
self.setup_basic_parser()
|
||||
self.config = self.setup_config(args)
|
||||
|
||||
logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
|
||||
self.install_modules(self.config['steps'])
|
||||
self.install_modules(self.config["steps"])
|
||||
|
||||
# log out the modules that were loaded
|
||||
for module_type in MODULE_TYPES:
|
||||
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
|
||||
|
||||
logger.info(
|
||||
f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s"))
|
||||
)
|
||||
|
||||
self.setup_finished = True
|
||||
|
||||
def _command_line_run(self, args: list) -> Generator[Metadata]:
|
||||
@@ -385,9 +475,9 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
This is the main entry point for the orchestrator, when run from the command line.
|
||||
|
||||
:param args: list of arguments to pass to the orchestrator - these are the command line args
|
||||
|
||||
|
||||
You should not call this method from code implementations.
|
||||
|
||||
|
||||
This method sets up the configuration, loads the modules, and runs the feed.
|
||||
If you wish to make code invocations yourself, you should use the 'setup' and 'feed' methods separately.
|
||||
To test configurations, without loading any modules you can also first call 'setup_configs'
|
||||
@@ -396,7 +486,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
self.setup(args)
|
||||
return self.feed()
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
logger.error(e, exc_info=True)
|
||||
exit(1)
|
||||
|
||||
def cleanup(self) -> None:
|
||||
@@ -405,7 +495,6 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
e.cleanup()
|
||||
|
||||
def feed(self) -> Generator[Metadata]:
|
||||
|
||||
url_count = 0
|
||||
for feeder in self.feeders:
|
||||
for item in feeder:
|
||||
@@ -436,7 +525,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
self.cleanup()
|
||||
exit()
|
||||
except Exception as e:
|
||||
logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
|
||||
logger.error(f"Got unexpected error on item {item}: {e}\n{traceback.format_exc()}")
|
||||
for d in self.databases:
|
||||
if isinstance(e, AssertionError):
|
||||
d.failed(item, str(e))
|
||||
@@ -451,13 +540,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
|
||||
def archive(self, result: Metadata) -> Union[Metadata, None]:
|
||||
"""
|
||||
Runs the archiving process for a single URL
|
||||
1. Each archiver can sanitize its own URLs
|
||||
2. Check for cached results in Databases, and signal start to the databases
|
||||
3. Call Archivers until one succeeds
|
||||
4. Call Enrichers
|
||||
5. Store all downloaded/generated media
|
||||
6. Call selected Formatter and store formatted if needed
|
||||
Runs the archiving process for a single URL
|
||||
1. Each archiver can sanitize its own URLs
|
||||
2. Check for cached results in Databases, and signal start to the databases
|
||||
3. Call Archivers until one succeeds
|
||||
4. Call Enrichers
|
||||
5. Store all downloaded/generated media
|
||||
6. Call selected Formatter and store formatted if needed
|
||||
"""
|
||||
|
||||
original_url = result.get_url().strip()
|
||||
@@ -528,7 +617,6 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def setup_authentication(self, config: dict) -> dict:
|
||||
"""
|
||||
@@ -537,7 +625,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
Split up strings into multiple sites if they are comma separated
|
||||
"""
|
||||
|
||||
authentication = config.get('authentication', {})
|
||||
authentication = config.get("authentication", {})
|
||||
|
||||
# extract out concatenated sites
|
||||
for key, val in copy(authentication).items():
|
||||
@@ -546,8 +634,8 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
site = site.strip()
|
||||
authentication[site] = val
|
||||
del authentication[key]
|
||||
|
||||
config['authentication'] = authentication
|
||||
|
||||
config["authentication"] = authentication
|
||||
return config
|
||||
|
||||
# Helper Properties
|
||||
|
||||
@@ -32,16 +32,16 @@ from auto_archiver.utils.misc import random_str
|
||||
from auto_archiver.core import Media, BaseModule, Metadata
|
||||
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
|
||||
|
||||
|
||||
class Storage(BaseModule):
|
||||
|
||||
"""
|
||||
Base class for implementing storage modules in the media archiving framework.
|
||||
|
||||
Subclasses must implement the `get_cdn_url` and `uploadf` methods to define their behavior.
|
||||
"""
|
||||
|
||||
def store(self, media: Media, url: str, metadata: Metadata=None) -> None:
|
||||
if media.is_stored(in_storage=self):
|
||||
def store(self, media: Media, url: str, metadata: Metadata = None) -> None:
|
||||
if media.is_stored(in_storage=self):
|
||||
logger.debug(f"{media.key} already stored, skipping")
|
||||
return
|
||||
|
||||
@@ -73,18 +73,18 @@ class Storage(BaseModule):
|
||||
This method should not be called directly, but instead be called through the 'store' method,
|
||||
which sets up the media for storage.
|
||||
"""
|
||||
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
|
||||
with open(media.filename, 'rb') as f:
|
||||
logger.debug(f"[{self.__class__.__name__}] storing file {media.filename} with key {media.key}")
|
||||
with open(media.filename, "rb") as f:
|
||||
return self.uploadf(f, media, **kwargs)
|
||||
|
||||
def set_key(self, media: Media, url: str, metadata: Metadata) -> None:
|
||||
"""takes the media and optionally item info and generates a key"""
|
||||
|
||||
|
||||
if media.key is not None and len(media.key) > 0:
|
||||
# media key is already set
|
||||
return
|
||||
|
||||
folder = metadata.get_context('folder', '')
|
||||
folder = metadata.get_context("folder", "")
|
||||
filename, ext = os.path.splitext(media.filename)
|
||||
|
||||
# Handle path_generator logic
|
||||
@@ -104,12 +104,11 @@ class Storage(BaseModule):
|
||||
filename = random_str(24)
|
||||
elif filename_generator == "static":
|
||||
# load the hash_enricher module
|
||||
he = self.module_factory.get_module("hash_enricher", self.config)
|
||||
he: HashEnricher = self.module_factory.get_module("hash_enricher", self.config)
|
||||
hd = he.calculate_hash(media.filename)
|
||||
filename = hd[:24]
|
||||
else:
|
||||
raise ValueError(f"Invalid filename_generator: {filename_generator}")
|
||||
|
||||
key = os.path.join(folder, path, f"{filename}{ext}")
|
||||
|
||||
media._key = key
|
||||
key = os.path.join(folder, path, f"{filename}{ext}")
|
||||
media._key = key
|
||||
|
||||
@@ -12,9 +12,7 @@
|
||||
"default": None,
|
||||
"help": "the id of the sheet to archive (alternative to 'sheet' config)",
|
||||
},
|
||||
"header": {"default": 1,
|
||||
"help": "index of the header row (starts at 1)",
|
||||
"type": "int"},
|
||||
"header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
|
||||
"service_account": {
|
||||
"default": "secrets/service_account.json",
|
||||
"help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
|
||||
import shutil
|
||||
from typing import IO
|
||||
import os
|
||||
@@ -8,12 +7,13 @@ from auto_archiver.core import Media
|
||||
from auto_archiver.core import Storage
|
||||
from auto_archiver.core.consts import SetupError
|
||||
|
||||
|
||||
class LocalStorage(Storage):
|
||||
|
||||
|
||||
def setup(self) -> None:
|
||||
if len(self.save_to) > 200:
|
||||
raise SetupError("Your save_to path is too long, this will cause issues saving files on your computer. Please use a shorter path.")
|
||||
raise SetupError(
|
||||
"Your save_to path is too long, this will cause issues saving files on your computer. Please use a shorter path."
|
||||
)
|
||||
|
||||
def get_cdn_url(self, media: Media) -> str:
|
||||
dest = media.key
|
||||
@@ -25,18 +25,18 @@ class LocalStorage(Storage):
|
||||
def set_key(self, media, url, metadata):
|
||||
# clarify we want to save the file to the save_to folder
|
||||
|
||||
old_folder = metadata.get('folder', '')
|
||||
metadata.set_context('folder', os.path.join(self.save_to, metadata.get('folder', '')))
|
||||
old_folder = metadata.get("folder", "")
|
||||
metadata.set_context("folder", os.path.join(self.save_to, metadata.get("folder", "")))
|
||||
super().set_key(media, url, metadata)
|
||||
# don't impact other storages that might want a different 'folder' set
|
||||
metadata.set_context('folder', old_folder)
|
||||
metadata.set_context("folder", old_folder)
|
||||
|
||||
def upload(self, media: Media, **kwargs) -> bool:
|
||||
# override parent so that we can use shutil.copy2 and keep metadata
|
||||
dest = media.key
|
||||
|
||||
os.makedirs(os.path.dirname(dest), exist_ok=True)
|
||||
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}')
|
||||
logger.debug(f"[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}")
|
||||
|
||||
res = shutil.copy2(media.filename, dest)
|
||||
logger.info(res)
|
||||
@@ -44,4 +44,4 @@ class LocalStorage(Storage):
|
||||
|
||||
# must be implemented even if unused
|
||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
|
||||
pass
|
||||
pass
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
|
||||
from typing import IO
|
||||
|
||||
import boto3
|
||||
@@ -11,18 +10,20 @@ from auto_archiver.utils.misc import calculate_file_hash, random_str
|
||||
|
||||
NO_DUPLICATES_FOLDER = "no-dups/"
|
||||
|
||||
class S3Storage(Storage):
|
||||
|
||||
class S3Storage(Storage):
|
||||
def setup(self) -> None:
|
||||
self.s3 = boto3.client(
|
||||
's3',
|
||||
"s3",
|
||||
region_name=self.region,
|
||||
endpoint_url=self.endpoint_url.format(region=self.region),
|
||||
aws_access_key_id=self.key,
|
||||
aws_secret_access_key=self.secret
|
||||
aws_secret_access_key=self.secret,
|
||||
)
|
||||
if self.random_no_duplicate:
|
||||
logger.warning("random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`.")
|
||||
logger.warning(
|
||||
"random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`."
|
||||
)
|
||||
|
||||
def get_cdn_url(self, media: Media) -> str:
|
||||
return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
|
||||
@@ -32,13 +33,13 @@ class S3Storage(Storage):
|
||||
return True
|
||||
|
||||
extra_args = kwargs.get("extra_args", {})
|
||||
if not self.private and 'ACL' not in extra_args:
|
||||
extra_args['ACL'] = 'public-read'
|
||||
if not self.private and "ACL" not in extra_args:
|
||||
extra_args["ACL"] = "public-read"
|
||||
|
||||
if 'ContentType' not in extra_args:
|
||||
if "ContentType" not in extra_args:
|
||||
try:
|
||||
if media.mimetype:
|
||||
extra_args['ContentType'] = media.mimetype
|
||||
extra_args["ContentType"] = media.mimetype
|
||||
except Exception as e:
|
||||
logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
|
||||
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
|
||||
@@ -50,21 +51,21 @@ class S3Storage(Storage):
|
||||
hd = calculate_file_hash(media.filename)
|
||||
path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])
|
||||
|
||||
if existing_key:=self.file_in_folder(path):
|
||||
if existing_key := self.file_in_folder(path):
|
||||
media._key = existing_key
|
||||
media.set("previously archived", True)
|
||||
logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}")
|
||||
return False
|
||||
|
||||
|
||||
_, ext = os.path.splitext(media.key)
|
||||
media._key = os.path.join(path, f"{random_str(24)}{ext}")
|
||||
return True
|
||||
|
||||
def file_in_folder(self, path:str) -> str:
|
||||
def file_in_folder(self, path: str) -> str:
|
||||
# checks if path exists and is not an empty folder
|
||||
if not path.endswith('/'):
|
||||
path = path + '/'
|
||||
resp = self.s3.list_objects(Bucket=self.bucket, Prefix=path, Delimiter='/', MaxKeys=1)
|
||||
if 'Contents' in resp:
|
||||
return resp['Contents'][0]['Key']
|
||||
return False
|
||||
if not path.endswith("/"):
|
||||
path = path + "/"
|
||||
resp = self.s3.list_objects(Bucket=self.bucket, Prefix=path, Delimiter="/", MaxKeys=1)
|
||||
if "Contents" in resp:
|
||||
return resp["Contents"][0]["Key"]
|
||||
return False
|
||||
|
||||
@@ -1 +1 @@
|
||||
from .tiktok_tikwm_extractor import TiktokTikwmExtractor
|
||||
from .tiktok_tikwm_extractor import TiktokTikwmExtractor
|
||||
|
||||
@@ -2,10 +2,7 @@
|
||||
"name": "Tiktok Tikwm Extractor",
|
||||
"type": ["extractor"],
|
||||
"requires_setup": False,
|
||||
"dependencies": {
|
||||
"python": ["loguru", "requests"],
|
||||
"bin": []
|
||||
},
|
||||
"dependencies": {"python": ["loguru", "requests"], "bin": []},
|
||||
"description": """
|
||||
Uses an unofficial TikTok video download platform's API to download videos: https://tikwm.com/
|
||||
|
||||
@@ -19,5 +16,5 @@
|
||||
- If tikwm.com is down, this extractor will not work.
|
||||
- If tikwm.com changes their API, this extractor may break.
|
||||
- If no video is found, this extractor will consider the extraction failed.
|
||||
"""
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -12,11 +12,12 @@ class TiktokTikwmExtractor(Extractor):
|
||||
"""
|
||||
Extractor for TikTok that uses an unofficial API and can capture content that requires a login, like sensitive content.
|
||||
"""
|
||||
|
||||
TIKWM_ENDPOINT = "https://www.tikwm.com/api/?url={url}"
|
||||
|
||||
def download(self, item: Metadata) -> bool | Metadata:
|
||||
url = item.get_url()
|
||||
|
||||
|
||||
if not re.match(TikTokIE._VALID_URL, url):
|
||||
return False
|
||||
|
||||
@@ -33,7 +34,7 @@ class TiktokTikwmExtractor(Extractor):
|
||||
logger.error(f"failed to parse JSON response from tikwm.com for {url=}")
|
||||
return False
|
||||
|
||||
if not json_response.get('msg') == 'success' or not (api_data := json_response.get('data', {})):
|
||||
if not json_response.get("msg") == "success" or not (api_data := json_response.get("data", {})):
|
||||
logger.error(f"failed to get a valid response from tikwm.com for {url=}: {json_response}")
|
||||
return False
|
||||
|
||||
@@ -67,7 +68,7 @@ class TiktokTikwmExtractor(Extractor):
|
||||
if created_at := api_data.pop("create_time", None):
|
||||
result.set_timestamp(datetime.fromtimestamp(created_at, tz=timezone.utc))
|
||||
|
||||
if (author := api_data.pop("author", None)):
|
||||
if author := api_data.pop("author", None):
|
||||
result.set("author", author)
|
||||
|
||||
result.set("api_data", api_data)
|
||||
|
||||
@@ -14,9 +14,7 @@
|
||||
"help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles).",
|
||||
},
|
||||
"docker_commands": {"default": None, "help": "if a custom docker invocation is needed"},
|
||||
"timeout": {"default": 120,
|
||||
"help": "timeout for WACZ generation in seconds",
|
||||
"type": "int"},
|
||||
"timeout": {"default": 120, "help": "timeout for WACZ generation in seconds", "type": "int"},
|
||||
"extract_media": {
|
||||
"default": False,
|
||||
"type": "bool",
|
||||
|
||||
@@ -22,7 +22,9 @@ TESTS_TO_RUN_LAST = ["test_twitter_api_archiver"]
|
||||
|
||||
@pytest.fixture
|
||||
def setup_module(request):
|
||||
def _setup_module(module_name, config={}):
|
||||
def _setup_module(module_name, config=None):
|
||||
if config is None:
|
||||
config = {}
|
||||
module_factory = ModuleFactory()
|
||||
|
||||
if isinstance(module_name, type):
|
||||
|
||||
@@ -24,17 +24,20 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
|
||||
mock_logger = mocker.patch("auto_archiver.modules.tiktok_tikwm_extractor.tiktok_tikwm_extractor.logger")
|
||||
return mock_get, mock_logger
|
||||
|
||||
@pytest.mark.parametrize("url,valid_url", [
|
||||
("https://bellingcat.com", False),
|
||||
("https://youtube.com", False),
|
||||
("https://tiktok.co/", False),
|
||||
("https://tiktok.com/", False),
|
||||
("https://www.tiktok.com/", False),
|
||||
("https://api.cool.tiktok.com/", False),
|
||||
(VALID_EXAMPLE_URL, True),
|
||||
("https://www.tiktok.com/@bbcnews/video/7478038212070411542", True),
|
||||
("https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375", True),
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"url,valid_url",
|
||||
[
|
||||
("https://bellingcat.com", False),
|
||||
("https://youtube.com", False),
|
||||
("https://tiktok.co/", False),
|
||||
("https://tiktok.com/", False),
|
||||
("https://www.tiktok.com/", False),
|
||||
("https://api.cool.tiktok.com/", False),
|
||||
(VALID_EXAMPLE_URL, True),
|
||||
("https://www.tiktok.com/@bbcnews/video/7478038212070411542", True),
|
||||
("https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375", True),
|
||||
],
|
||||
)
|
||||
def test_valid_urls(self, mocker, make_item, url, valid_url):
|
||||
mock_get, mock_logger = self.get_mockers(mocker)
|
||||
if valid_url:
|
||||
@@ -53,17 +56,20 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
|
||||
mock_logger.error.assert_called_once()
|
||||
assert mock_logger.error.call_args[0][0].startswith("failed to parse JSON response")
|
||||
|
||||
mock_get.return_value.json.side_effect = Exception
|
||||
with pytest.raises(Exception):
|
||||
mock_get.return_value.json.side_effect = ValueError
|
||||
with pytest.raises(ValueError):
|
||||
self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
|
||||
mock_get.assert_called()
|
||||
assert mock_get.call_count == 2
|
||||
assert mock_get.return_value.json.call_count == 2
|
||||
|
||||
@pytest.mark.parametrize("response", [
|
||||
({"msg": "failure"}),
|
||||
({"msg": "success"}),
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"response",
|
||||
[
|
||||
({"msg": "failure"}),
|
||||
({"msg": "success"}),
|
||||
],
|
||||
)
|
||||
def test_unsuccessful_responses(self, mocker, make_item, response):
|
||||
mock_get, mock_logger = self.get_mockers(mocker)
|
||||
mock_get.return_value.status_code = 200
|
||||
@@ -74,11 +80,14 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
|
||||
mock_logger.error.assert_called_once()
|
||||
assert mock_logger.error.call_args[0][0].startswith("failed to get a valid response")
|
||||
|
||||
@pytest.mark.parametrize("response,has_vid", [
|
||||
({"data": {"id": 123}}, False),
|
||||
({"data": {"wmplay": "url"}}, True),
|
||||
({"data": {"play": "url"}}, True),
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"response,has_vid",
|
||||
[
|
||||
({"data": {"id": 123}}, False),
|
||||
({"data": {"wmplay": "url"}}, True),
|
||||
({"data": {"play": "url"}}, True),
|
||||
],
|
||||
)
|
||||
def test_correct_extraction(self, mocker, make_item, response, has_vid):
|
||||
mock_get, mock_logger = self.get_mockers(mocker)
|
||||
mock_get.return_value.status_code = 200
|
||||
@@ -102,16 +111,19 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
|
||||
def test_correct_data_extracted(self, mocker, make_item):
|
||||
mock_get, _ = self.get_mockers(mocker)
|
||||
mock_get.return_value.status_code = 200
|
||||
mock_get.return_value.json.return_value = {"msg": "success", "data": {
|
||||
"wmplay": "url",
|
||||
"origin_cover": "cover.jpg",
|
||||
"title": "Title",
|
||||
"id": 123,
|
||||
"duration": 60,
|
||||
"create_time": 1736301699,
|
||||
"author": "Author",
|
||||
"other": "data"
|
||||
}}
|
||||
mock_get.return_value.json.return_value = {
|
||||
"msg": "success",
|
||||
"data": {
|
||||
"wmplay": "url",
|
||||
"origin_cover": "cover.jpg",
|
||||
"title": "Title",
|
||||
"id": 123,
|
||||
"duration": 60,
|
||||
"create_time": 1736301699,
|
||||
"author": "Author",
|
||||
"other": "data",
|
||||
},
|
||||
}
|
||||
|
||||
result = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
|
||||
assert result.is_success()
|
||||
@@ -129,9 +141,12 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
|
||||
result = self.extractor.download(make_item(url))
|
||||
assert result.is_success()
|
||||
assert len(result.media) == 2
|
||||
assert result.get_title() == "The A23a iceberg is one of the world's oldest and it's so big you can see it from space. #Iceberg #A23a #Antarctica #Ice #ClimateChange #DavidAttenborough #Ocean #Sea #SouthGeorgia #BBCNews "
|
||||
assert (
|
||||
result.get_title()
|
||||
== "The A23a iceberg is one of the world's oldest and it's so big you can see it from space. #Iceberg #A23a #Antarctica #Ice #ClimateChange #DavidAttenborough #Ocean #Sea #SouthGeorgia #BBCNews "
|
||||
)
|
||||
assert result.get("author").get("unique_id") == "bbcnews"
|
||||
assert result.get("api_data").get("id") == '7478038212070411542'
|
||||
assert result.get("api_data").get("id") == "7478038212070411542"
|
||||
assert result.media[1].get("duration") == 59
|
||||
assert result.get("timestamp") == datetime.fromtimestamp(1741122000, tz=timezone.utc)
|
||||
|
||||
@@ -149,6 +164,6 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
|
||||
assert len(result.media) == 2
|
||||
assert result.get_title() == "Căng nhất lúc này #ggs68 #ggs68taiwan #taiwan #dailoan #tiktoknews"
|
||||
assert result.get("author").get("id") == "7197400619475649562"
|
||||
assert result.get("api_data").get("id") == '7441821351142362375'
|
||||
assert result.get("api_data").get("id") == "7441821351142362375"
|
||||
assert result.media[1].get("duration") == 34
|
||||
assert result.get("timestamp") == datetime.fromtimestamp(1732684060, tz=timezone.utc)
|
||||
|
||||
@@ -8,6 +8,7 @@ class TestS3Storage:
|
||||
"""
|
||||
Test suite for S3Storage.
|
||||
"""
|
||||
|
||||
module_name: str = "s3_storage"
|
||||
storage: Type[S3Storage]
|
||||
config: dict = {
|
||||
@@ -32,10 +33,10 @@ class TestS3Storage:
|
||||
"""Test that S3 client is initialized with correct parameters"""
|
||||
|
||||
assert self.storage.s3 is not None
|
||||
assert self.storage.s3.meta.region_name == 'test-region'
|
||||
assert self.storage.s3.meta.region_name == "test-region"
|
||||
|
||||
def test_get_cdn_url_generation(self):
|
||||
"""Test CDN URL formatting """
|
||||
"""Test CDN URL formatting"""
|
||||
media = Media("test.txt")
|
||||
media._key = "path/to/file.txt"
|
||||
url = self.storage.get_cdn_url(media)
|
||||
@@ -46,14 +47,14 @@ class TestS3Storage:
|
||||
def test_uploadf_sets_acl_public(self, mocker):
|
||||
media = Media("test.txt")
|
||||
mock_file = mocker.MagicMock()
|
||||
mock_s3_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
|
||||
mocker.patch.object(self.storage, 'is_upload_needed', return_value=True)
|
||||
mock_s3_upload = mocker.patch.object(self.storage.s3, "upload_fileobj")
|
||||
mocker.patch.object(self.storage, "is_upload_needed", return_value=True)
|
||||
self.storage.uploadf(mock_file, media)
|
||||
mock_s3_upload.assert_called_once_with(
|
||||
mock_file,
|
||||
Bucket='test-bucket',
|
||||
Bucket="test-bucket",
|
||||
Key=media.key,
|
||||
ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/plain'}
|
||||
ExtraArgs={"ACL": "public-read", "ContentType": "text/plain"},
|
||||
)
|
||||
|
||||
def test_upload_decision_logic(self, mocker):
|
||||
@@ -61,23 +62,29 @@ class TestS3Storage:
|
||||
media = Media("test.txt")
|
||||
assert self.storage.is_upload_needed(media) is True
|
||||
self.storage.random_no_duplicate = True
|
||||
mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value='beepboop123beepboop123beepboop123')
|
||||
mock_file_in_folder = mocker.patch.object(self.storage, 'file_in_folder', return_value='existing_key.txt')
|
||||
mocker.patch(
|
||||
"auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash",
|
||||
return_value="beepboop123beepboop123beepboop123",
|
||||
)
|
||||
mock_file_in_folder = mocker.patch.object(self.storage, "file_in_folder", return_value="existing_key.txt")
|
||||
assert self.storage.is_upload_needed(media) is False
|
||||
assert media.key == 'existing_key.txt'
|
||||
mock_file_in_folder.assert_called_with('no-dups/beepboop123beepboop123be')
|
||||
assert media.key == "existing_key.txt"
|
||||
mock_file_in_folder.assert_called_with("no-dups/beepboop123beepboop123be")
|
||||
|
||||
def test_skips_upload_when_duplicate_exists(self, mocker):
|
||||
"""Test that upload skips when file_in_folder finds existing object"""
|
||||
self.storage.random_no_duplicate = True
|
||||
mocker.patch.object(S3Storage, 'file_in_folder', return_value="existing_folder/existing_file.txt")
|
||||
mocker.patch.object(S3Storage, "file_in_folder", return_value="existing_folder/existing_file.txt")
|
||||
media = Media("test.txt")
|
||||
media._key = "original_path.txt"
|
||||
mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value="beepboop123beepboop123beepboop123")
|
||||
mocker.patch(
|
||||
"auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash",
|
||||
return_value="beepboop123beepboop123beepboop123",
|
||||
)
|
||||
assert self.storage.is_upload_needed(media) is False
|
||||
assert media.key == "existing_folder/existing_file.txt"
|
||||
assert media.get("previously archived") is True
|
||||
mock_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
|
||||
mock_upload = mocker.patch.object(self.storage.s3, "upload_fileobj")
|
||||
result = self.storage.uploadf(None, media)
|
||||
mock_upload.assert_not_called()
|
||||
assert result is True
|
||||
@@ -85,21 +92,18 @@ class TestS3Storage:
|
||||
def test_uploads_with_correct_parameters(self, mocker):
|
||||
media = Media("test.txt")
|
||||
media._key = "original_key.txt"
|
||||
mocker.patch.object(S3Storage, 'is_upload_needed', return_value=True)
|
||||
media.mimetype = 'image/png'
|
||||
mocker.patch.object(S3Storage, "is_upload_needed", return_value=True)
|
||||
media.mimetype = "image/png"
|
||||
mock_file = mocker.MagicMock()
|
||||
mock_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
|
||||
mock_upload = mocker.patch.object(self.storage.s3, "upload_fileobj")
|
||||
self.storage.uploadf(mock_file, media)
|
||||
mock_upload.assert_called_once_with(
|
||||
mock_file,
|
||||
Bucket='test-bucket',
|
||||
Key='original_key.txt',
|
||||
ExtraArgs={
|
||||
'ACL': 'public-read',
|
||||
'ContentType': 'image/png'
|
||||
}
|
||||
Bucket="test-bucket",
|
||||
Key="original_key.txt",
|
||||
ExtraArgs={"ACL": "public-read", "ContentType": "image/png"},
|
||||
)
|
||||
|
||||
def test_file_in_folder_exists(self, mocker):
|
||||
mocker.patch.object(self.storage.s3, 'list_objects', return_value={'Contents': [{'Key': 'path/to/file.txt'}]})
|
||||
assert self.storage.file_in_folder('path/to/') == 'path/to/file.txt'
|
||||
mocker.patch.object(self.storage.s3, "list_objects", return_value={"Contents": [{"Key": "path/to/file.txt"}]})
|
||||
assert self.storage.file_in_folder("path/to/") == "path/to/file.txt"
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
@@ -8,6 +7,7 @@ from auto_archiver.core import Media, Metadata
|
||||
from auto_archiver.modules.local_storage import LocalStorage
|
||||
from auto_archiver.core.consts import SetupError
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def local_storage(setup_module, tmp_path) -> LocalStorage:
|
||||
save_to = tmp_path / "local_archive"
|
||||
@@ -20,6 +20,7 @@ def local_storage(setup_module, tmp_path) -> LocalStorage:
|
||||
}
|
||||
return setup_module("local_storage", configs)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_media(tmp_path) -> Media:
|
||||
"""Fixture creating a Media object with temporary source file"""
|
||||
@@ -27,9 +28,11 @@ def sample_media(tmp_path) -> Media:
|
||||
src_file.write_text("test content")
|
||||
return Media(filename=str(src_file))
|
||||
|
||||
|
||||
def test_too_long_save_path(setup_module):
|
||||
with pytest.raises(SetupError):
|
||||
setup_module("local_storage", {"save_to": "long"*100})
|
||||
setup_module("local_storage", {"save_to": "long" * 100})
|
||||
|
||||
|
||||
def test_get_cdn_url_relative(local_storage):
|
||||
local_storage.filename_generator = "random"
|
||||
@@ -38,6 +41,7 @@ def test_get_cdn_url_relative(local_storage):
|
||||
expected = os.path.join(local_storage.save_to, media.key)
|
||||
assert local_storage.get_cdn_url(media) == expected
|
||||
|
||||
|
||||
def test_get_cdn_url_absolute(local_storage):
|
||||
local_storage.filename_generator = "random"
|
||||
|
||||
@@ -47,14 +51,14 @@ def test_get_cdn_url_absolute(local_storage):
|
||||
expected = os.path.abspath(os.path.join(local_storage.save_to, media.key))
|
||||
assert local_storage.get_cdn_url(media) == expected
|
||||
|
||||
|
||||
def test_upload_file_contents_and_metadata(local_storage, sample_media):
|
||||
local_storage.store(sample_media, "https://example.com", Metadata())
|
||||
dest = os.path.join(local_storage.save_to, sample_media.key)
|
||||
assert Path(sample_media.filename).read_text() == Path(dest).read_text()
|
||||
|
||||
|
||||
def test_upload_nonexistent_source(local_storage):
|
||||
media = Media(_key="missing.txt", filename="nonexistent.txt")
|
||||
with pytest.raises(FileNotFoundError):
|
||||
local_storage.upload(media)
|
||||
|
||||
|
||||
|
||||
@@ -6,32 +6,28 @@ from auto_archiver.core.metadata import Metadata, Media
|
||||
from auto_archiver.core.storage import Storage
|
||||
from auto_archiver.core.module import ModuleFactory
|
||||
|
||||
class TestStorageBase(object):
|
||||
|
||||
class TestStorageBase(object):
|
||||
module_name: str = None
|
||||
config: dict = None
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def setup_storage(self, setup_module):
|
||||
assert (
|
||||
self.module_name is not None
|
||||
), "self.module_name must be set on the subclass"
|
||||
assert self.module_name is not None, "self.module_name must be set on the subclass"
|
||||
assert self.config is not None, "self.config must be a dict set on the subclass"
|
||||
self.storage: Type[Storage] = setup_module(
|
||||
self.module_name, self.config
|
||||
)
|
||||
self.storage: Type[Storage] = setup_module(self.module_name, self.config)
|
||||
|
||||
|
||||
class TestBaseStorage(Storage):
|
||||
|
||||
name = "test_storage"
|
||||
|
||||
def get_cdn_url(self, media):
|
||||
return "cdn_url"
|
||||
|
||||
|
||||
def uploadf(self, file, key, **kwargs):
|
||||
return True
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def dummy_file(tmp_path):
|
||||
# create dummy.txt file
|
||||
@@ -39,16 +35,18 @@ def dummy_file(tmp_path):
|
||||
dummy_file.write_text("test content")
|
||||
return str(dummy_file)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def storage_base():
|
||||
def _storage_base(config):
|
||||
storage_base = TestBaseStorage()
|
||||
storage_base.config_setup({TestBaseStorage.name : config})
|
||||
storage_base.config_setup({TestBaseStorage.name: config})
|
||||
storage_base.module_factory = ModuleFactory()
|
||||
return storage_base
|
||||
|
||||
|
||||
return _storage_base
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"path_generator, filename_generator, url, expected_key",
|
||||
[
|
||||
@@ -58,11 +56,11 @@ def storage_base():
|
||||
("url", "random", "https://example.com/file/", "folder/https-example-com-file/pretend-random.txt"),
|
||||
("random", "static", "https://example.com/file/", "folder/pretend-random/6ae8a75555209fd6c44157c0.txt"),
|
||||
("random", "random", "https://example.com/file/", "folder/pretend-random/pretend-random.txt"),
|
||||
|
||||
],
|
||||
)
|
||||
def test_storage_name_generation(storage_base, path_generator, filename_generator, url,
|
||||
expected_key, mocker, tmp_path, dummy_file):
|
||||
def test_storage_name_generation(
|
||||
storage_base, path_generator, filename_generator, url, expected_key, mocker, tmp_path, dummy_file
|
||||
):
|
||||
mock_random = mocker.patch("auto_archiver.core.storage.random_str")
|
||||
mock_random.return_value = "pretend-random"
|
||||
|
||||
@@ -89,10 +87,10 @@ def test_really_long_name(storage_base, dummy_file):
|
||||
}
|
||||
storage: Storage = storage_base(config)
|
||||
|
||||
url = f"https://example.com/{'file'*100}"
|
||||
url = f"https://example.com/{'file' * 100}"
|
||||
media = Media(filename=dummy_file)
|
||||
storage.set_key(media, url, Metadata())
|
||||
assert media.key == f"https-example-com-{'file'*13}/6ae8a75555209fd6c44157c0.txt"
|
||||
assert media.key == f"https-example-com-{'file' * 13}/6ae8a75555209fd6c44157c0.txt"
|
||||
|
||||
|
||||
def test_storage_loads_hash_enricher(storage_base, dummy_file):
|
||||
|
||||
Reference in New Issue
Block a user