From e76551ba227b1666a68b266f57628bcdae8a9e51 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Thu, 13 Mar 2025 13:21:32 +0000 Subject: [PATCH] Add documentation, pre-commit hook, more make commands and --- .pre-commit-config.yaml | 7 +- Makefile | 36 ++- .../development/developer_guidelines.md | 1 + docs/source/development/style_guide.md | 39 +++ poetry.lock | 129 +++++++- pyproject.toml | 17 +- src/auto_archiver/core/consts.py | 28 +- src/auto_archiver/core/orchestrator.py | 288 ++++++++++++------ src/auto_archiver/core/storage.py | 21 +- .../modules/gsheet_feeder_db/__manifest__.py | 4 +- .../modules/local_storage/local_storage.py | 18 +- .../modules/s3_storage/s3_storage.py | 37 +-- .../tiktok_tikwm_extractor/__init__.py | 2 +- .../tiktok_tikwm_extractor/__manifest__.py | 7 +- .../tiktok_tikwm_extractor.py | 7 +- .../wacz_extractor_enricher/__manifest__.py | 4 +- tests/conftest.py | 4 +- .../extractors/test_tiktok_tikwm_extractor.py | 85 +++--- tests/storages/test_S3_storage.py | 52 ++-- tests/storages/test_local_storage.py | 12 +- tests/storages/test_storage_base.py | 30 +- 21 files changed, 558 insertions(+), 270 deletions(-) create mode 100644 docs/source/development/style_guide.md diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0fdf695..78421d7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,10 @@ +# Run Ruff formatter on commits. repos: - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.9.10 hooks: - - id: ruff -# args: [ --fix ] - id: ruff-format + + # Runs Ruff linting - just checks without fixing, but blocks commit if errors are found. 
+# - id: ruff +# args: ["--output-format=concise"] \ No newline at end of file diff --git a/Makefile b/Makefile index 72f2058..c59f272 100644 --- a/Makefile +++ b/Makefile @@ -9,34 +9,54 @@ help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) @echo "Additional Commands:" @echo " make test - Run all tests in 'tests/' with pytest" - @echo " make lint - Run ruff linter and auto-fix issues" + @echo " make ruff-check - Run Ruff linting and formatting checks (safe)" + @echo " make ruff-clean - Auto-fix Ruff linting and formatting issues" @echo " make docs - Generate documentation (same as 'make html')" - @echo " make clean_docs - Remove generated docs" + @echo " make clean-docs - Remove generated docs" @echo " make docker-run - Run the Docker container" + @echo " make show-docs - Build and open the documentation in a browser" + + .PHONY: test test: @echo "Running tests..." @pytest tests --disable-warnings -.PHONY: lint -lint: - @echo "Linting with ruff..." - @ruff check --fix . + +.PHONY: ruff-check +ruff-check: + @echo "Checking code style with Ruff (safe)..." + @ruff check . + + +.PHONY: ruff-clean +ruff-clean: + @echo "Fixing lint and formatting issues with Ruff..." + @ruff check . --fix + @ruff format . + .PHONY: docs docs: @echo "Building documentation..." @$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" -.PHONY: clean_docs -clean_docs: + +.PHONY: clean-docs +clean-docs: @echo "Cleaning up generated documentation files..." @$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) @rm -rf "$(SOURCEDIR)/autoapi/" "$(SOURCEDIR)/modules/autogen/" @echo "Cleanup complete." +.PHONY: show-docs +show-docs: + @echo "Opening documentation in browser..." 
+ @open "$(BUILDDIR)/html/index.html" + + # Run Docker with default settings .PHONY: docker-run docker-run: diff --git a/docs/source/development/developer_guidelines.md b/docs/source/development/developer_guidelines.md index 0014d8f..dd94c57 100644 --- a/docs/source/development/developer_guidelines.md +++ b/docs/source/development/developer_guidelines.md @@ -32,4 +32,5 @@ testing docs release settings_page +style_guide ``` \ No newline at end of file diff --git a/docs/source/development/style_guide.md b/docs/source/development/style_guide.md new file mode 100644 index 0000000..a73a6fc --- /dev/null +++ b/docs/source/development/style_guide.md @@ -0,0 +1,39 @@ +### Style Guide + +The project uses [ruff](https://docs.astral.sh/ruff/) for linting and formatting. +Our style configurations are set in the `pyproject.toml` file. + +We have a pre-commit hook to run the formatter before you commit, but Ruff can also be [integrated with most editors](https://docs.astral.sh/ruff/editors/setup/) to run automatically. + +We recommend you also run the linter before pushing code. + +# Running the linter + +We have Makefile commands to run common tasks (Note if you're on Windows you might need to install `make` first, or you can use ruff directly): + +This outputs a report of any issues found: +```shell +make ruff-check +``` + +This command will attempt to fix any issues it can: + +⚠️ Warning: This can cause breaking changes. ⚠️ + +Ensure you check any modifications by this before committing them. +```shell +make ruff-clean +``` + +**Note:** If you're on Windows you might not have `make` installed by default. +This is included with [Git for Windows](https://gitforwindows.org/) or you can install make via [Chocolatey](https://chocolatey.org/): +```shell +choco install make +``` + +**Running directly with ruff** + +Alternatively, you can run the commands directly with ruff. 
+ +Our rules are quite lenient for general usage, but if you want to explore more rigorous checks you can explore the [ruff documentation](https://docs.astral.sh/ruff/configuration/). +You can then run checks to see more nuanced errors which you can review manually. \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index b679542..d385bc2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -481,6 +481,18 @@ files = [ [package.dependencies] pycparser = "*" +[[package]] +name = "cfgv" +version = "3.4.0" +description = "Validate configuration and produce human readable error messages." +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"}, + {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"}, +] + [[package]] name = "charset-normalizer" version = "3.4.1" @@ -696,6 +708,18 @@ calendars = ["convertdate (>=2.2.1)", "hijridate"] fasttext = ["fasttext (>=0.9.1)", "numpy (>=1.19.3,<2)"] langdetect = ["langdetect (>=1.0.0)"] +[[package]] +name = "distlib" +version = "0.3.9" +description = "Distribution utilities" +optional = false +python-versions = "*" +groups = ["dev"] +files = [ + {file = "distlib-0.3.9-py2.py3-none-any.whl", hash = "sha256:47f8c22fd27c27e25a65601af709b38e4f0a45ea4fc2e710f65755fa8caaaf87"}, + {file = "distlib-0.3.9.tar.gz", hash = "sha256:a60f20dea646b8a33f3e7772f74dc0b2d0772d2837ee1342a00645c81edf9403"}, +] + [[package]] name = "docutils" version = "0.21.2" @@ -742,6 +766,23 @@ future = "*" [package.extras] dev = ["Sphinx (==2.1.0)", "future (==0.17.1)", "numpy (==1.16.4)", "pytest (==4.6.1)", "pytest-mock (==1.10.4)", "tox (==3.12.1)"] +[[package]] +name = "filelock" +version = "3.17.0" +description = "A platform independent file lock." 
+optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "filelock-3.17.0-py3-none-any.whl", hash = "sha256:533dc2f7ba78dc2f0f531fc6c4940addf7b70a481e269a5a3b93be94ffbe8338"}, + {file = "filelock-3.17.0.tar.gz", hash = "sha256:ee4e77401ef576ebb38cd7f13b9b28893194acc20a8e68e18730ba9c0e54660e"}, +] + +[package.extras] +docs = ["furo (>=2024.8.6)", "sphinx (>=8.1.3)", "sphinx-autodoc-typehints (>=3)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.6.10)", "diff-cover (>=9.2.1)", "pytest (>=8.3.4)", "pytest-asyncio (>=0.25.2)", "pytest-cov (>=6)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.28.1)"] +typing = ["typing-extensions (>=4.12.2)"] + [[package]] name = "future" version = "1.0.0" @@ -919,6 +960,21 @@ files = [ [package.dependencies] pyparsing = {version = ">=2.4.2,<3.0.0 || >3.0.0,<3.0.1 || >3.0.1,<3.0.2 || >3.0.2,<3.0.3 || >3.0.3,<4", markers = "python_version > \"3.0\""} +[[package]] +name = "identify" +version = "2.6.9" +description = "File identification library for Python" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "identify-2.6.9-py2.py3-none-any.whl", hash = "sha256:c98b4322da415a8e5a70ff6e51fbc2d2932c015532d77e9f8537b4ba7813b150"}, + {file = "identify-2.6.9.tar.gz", hash = "sha256:d40dfe3142a1421d8518e3d3985ef5ac42890683e32306ad614a29490abeb6bf"}, +] + +[package.extras] +license = ["ukkonen"] + [[package]] name = "idna" version = "3.10" @@ -1260,6 +1316,18 @@ rtd = ["ipython", "sphinx (>=7)", "sphinx-autodoc2 (>=0.5.0,<0.6.0)", "sphinx-bo testing = ["beautifulsoup4", "coverage[toml]", "defusedxml", "pygments (<2.19)", "pytest (>=8,<9)", "pytest-cov", "pytest-param-files (>=0.6.0,<0.7.0)", "pytest-regressions", "sphinx-pytest"] testing-docutils = ["pygments", "pytest (>=8,<9)", "pytest-param-files (>=0.6.0,<0.7.0)"] +[[package]] +name = "nodeenv" +version = "1.9.1" +description = "Node.js virtual environment builder" +optional = false +python-versions = 
"!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +groups = ["dev"] +files = [ + {file = "nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9"}, + {file = "nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f"}, +] + [[package]] name = "numpy" version = "2.1.3" @@ -1513,6 +1581,23 @@ tests = ["check-manifest", "coverage (>=7.4.2)", "defusedxml", "markdown2", "ole typing = ["typing-extensions"] xmp = ["defusedxml"] +[[package]] +name = "platformdirs" +version = "4.3.6" +description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb"}, + {file = "platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907"}, +] + +[package.extras] +docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"] +type = ["mypy (>=1.11.2)"] + [[package]] name = "pluggy" version = "1.5.0" @@ -1529,6 +1614,25 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "pre-commit" +version = "4.1.0" +description = "A framework for managing and maintaining multi-language pre-commit hooks." 
+optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "pre_commit-4.1.0-py2.py3-none-any.whl", hash = "sha256:d29e7cb346295bcc1cc75fc3e92e343495e3ea0196c9ec6ba53f49f10ab6ae7b"}, + {file = "pre_commit-4.1.0.tar.gz", hash = "sha256:ae3f018575a588e30dfddfab9a05448bfbd6b73d78709617b5a2b853549716d4"}, +] + +[package.dependencies] +cfgv = ">=2.0.0" +identify = ">=1.0.0" +nodeenv = ">=0.11.1" +pyyaml = ">=5.1" +virtualenv = ">=20.10.0" + [[package]] name = "proto-plus" version = "1.26.0" @@ -1902,7 +2006,7 @@ version = "6.0.2" description = "YAML parser and emitter for Python" optional = false python-versions = ">=3.8" -groups = ["docs"] +groups = ["dev", "docs"] files = [ {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, @@ -2911,6 +3015,27 @@ typing-extensions = {version = ">=4.0", markers = "python_version < \"3.11\""} [package.extras] standard = ["colorama (>=0.4)", "httptools (>=0.6.3)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1)", "watchfiles (>=0.13)", "websockets (>=10.4)"] +[[package]] +name = "virtualenv" +version = "20.29.3" +description = "Virtual Python Environment builder" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "virtualenv-20.29.3-py3-none-any.whl", hash = "sha256:3e3d00f5807e83b234dfb6122bf37cfadf4be216c53a49ac059d02414f819170"}, + {file = "virtualenv-20.29.3.tar.gz", hash = "sha256:95e39403fcf3940ac45bc717597dba16110b74506131845d9b687d5e73d947ac"}, +] + +[package.dependencies] +distlib = ">=0.3.7,<1" +filelock = ">=3.12.2,<4" +platformdirs = ">=3.9.1,<5" + +[package.extras] +docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", 
"towncrier (>=23.6)"] +test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"] + [[package]] name = "vk-api" version = "11.9.9" @@ -3213,4 +3338,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.13" -content-hash = "0feae518c3a51717bd80e90eea3cd3ed53925af656f00b662c856bae38a742bb" +content-hash = "fbd6cdff4eb38021115a8cd361df7c292733028822f92f45cb667971c4bce901" diff --git a/pyproject.toml b/pyproject.toml index 90ac56f..2defdb2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,6 +65,7 @@ autopep8 = "^2.3.1" pytest-loguru = "^0.4.0" pytest-mock = "^3.14.0" ruff = "^0.9.10" +pre-commit = "^4.1.0" [tool.poetry.group.docs.dependencies] sphinx = "^8.1.3" @@ -96,23 +97,23 @@ markers = [ #exclude = ["docs"] line-length = 120 # Remove this for a more detailed lint report -#output-format = "concise" +output-format = "concise" [tool.ruff.lint] -#add bugbear? -# I : isort -# UP : upgrade, e.g. use fstrings -# ANN : annotations -extend-select = ["B"] +# Extend the rules to check for by adding them to this option: +# See documentation for more details: https://docs.astral.sh/ruff/rules/ +#extend-select = ["B"] # Ignore unused imports as some are currently required for lazy loading -# This can be removed for a `lint check` run which is manually reviewed -ignore = ["F401"] +# This can be removed for a `ruff check` run which is manually reviewed +#ignore = ["F401"] [tool.ruff.lint.per-file-ignores] # Ignore import violations in __init__.py files "__init__.py" = ["F401", "F403"] +# Ignore 'useless expression' in manifest files. 
+"__manifest__.py" = ["B018"] [tool.ruff.format] docstring-code-format = false diff --git a/src/auto_archiver/core/consts.py b/src/auto_archiver/core/consts.py index 9a5e1e3..3b99496 100644 --- a/src/auto_archiver/core/consts.py +++ b/src/auto_archiver/core/consts.py @@ -1,25 +1,19 @@ class SetupError(ValueError): pass -MODULE_TYPES = [ - 'feeder', - 'extractor', - 'enricher', - 'database', - 'storage', - 'formatter' -] + +MODULE_TYPES = ["feeder", "extractor", "enricher", "database", "storage", "formatter"] MANIFEST_FILE = "__manifest__.py" DEFAULT_MANIFEST = { - 'name': '', # the display name of the module - 'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name! - 'type': [], # the type of the module, can be one or more of MODULE_TYPES - 'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional software - 'description': '', # a description of the module - 'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format - 'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName - 'version': '1.0', # the version of the module - 'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line + "name": "", # the display name of the module + "author": "Bellingcat", # creator of the module, leave this as Bellingcat or set your own name! + "type": [], # the type of the module, can be one or more of MODULE_TYPES + "requires_setup": True, # whether or not this module requires additional setup such as setting API Keys or installing additional software + "description": "", # a description of the module + "dependencies": {}, # external dependencies, e.g. 
python packages or binaries, in dictionary format + "entry_point": "", # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName + "version": "1.0", # the version of the module + "configs": {}, # any configuration options this module has, these will be exposed to the user in the config file or via the command line } diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index dca2f4a..8c7d112 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -1,6 +1,6 @@ -""" Orchestrates all archiving steps, including feeding items, - archiving them with specific archivers, enrichment, storage, - formatting, database operations and clean up. +"""Orchestrates all archiving steps, including feeding items, +archiving them with specific archivers, enrichment, storage, +formatting, database operations and clean up. """ @@ -19,8 +19,17 @@ import requests from .metadata import Metadata, Media from auto_archiver.version import __version__ -from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, is_valid_config, \ - DefaultValidatingParser, UniqueAppendAction, AuthenticationJsonParseAction, DEFAULT_CONFIG_FILE +from .config import ( + read_yaml, + store_yaml, + to_dot_notation, + merge_dicts, + is_valid_config, + DefaultValidatingParser, + UniqueAppendAction, + AuthenticationJsonParseAction, + DEFAULT_CONFIG_FILE, +) from .module import ModuleFactory, LazyBaseModule from . 
import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher from .consts import MODULE_TYPES, SetupError @@ -30,8 +39,8 @@ if TYPE_CHECKING: from .base_module import BaseModule from .module import LazyBaseModule -class ArchivingOrchestrator: +class ArchivingOrchestrator: # instance variables module_factory: ModuleFactory setup_finished: bool @@ -61,30 +70,63 @@ class ArchivingOrchestrator: epilog="Check the code at https://github.com/bellingcat/auto-archiver", formatter_class=RichHelpFormatter, ) - parser.add_argument('--help', '-h', action='store_true', dest='help', help='show a full help message and exit') - parser.add_argument('--version', action='version', version=__version__) - parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE) - parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple') + parser.add_argument("--help", "-h", action="store_true", dest="help", help="show a full help message and exit") + parser.add_argument("--version", action="version", version=__version__) + parser.add_argument( + "--config", + action="store", + dest="config_file", + help="the filename of the YAML configuration file (defaults to 'config.yaml')", + default=DEFAULT_CONFIG_FILE, + ) + parser.add_argument( + "--mode", + action="store", + dest="mode", + type=str, + choices=["simple", "full"], + help="the mode to run the archiver in", + default="simple", + ) # override the default 'help' so we can inject all the configs and show those - parser.add_argument('-s', '--store', dest='store', default=False, help='Store the created config in the config file', action=argparse.BooleanOptionalAction) - parser.add_argument('--module_paths', dest='module_paths', nargs='+', default=[], help='additional paths to search for modules', action=UniqueAppendAction) + 
parser.add_argument( + "-s", + "--store", + dest="store", + default=False, + help="Store the created config in the config file", + action=argparse.BooleanOptionalAction, + ) + parser.add_argument( + "--module_paths", + dest="module_paths", + nargs="+", + default=[], + help="additional paths to search for modules", + action=UniqueAppendAction, + ) self.basic_parser = parser return parser - + def check_steps(self, config): for module_type in MODULE_TYPES: - if not config['steps'].get(f"{module_type}s", []): - if module_type == 'feeder' or module_type == 'formatter' and config['steps'].get(f"{module_type}"): - raise SetupError(f"It appears you have '{module_type}' set under 'steps' in your configuration file, but as of version 0.13.0 of Auto Archiver, you must use '{module_type}s'. Change this in your configuration file and try again. \ -Here's how that would look: \n\nsteps:\n {module_type}s:\n - [your_{module_type}_name_here]\n {'extractors:...' if module_type == 'feeder' else '...'}\n") - if module_type == 'extractor' and config['steps'].get('archivers'): - raise SetupError("As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. \ -Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_here]\n enrichers:...\n") - raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)") + if not config["steps"].get(f"{module_type}s", []): + if module_type == "feeder" or module_type == "formatter" and config["steps"].get(f"{module_type}"): + raise SetupError( + f"It appears you have '{module_type}' set under 'steps' in your configuration file, but as of version 0.13.0 of Auto Archiver, you must use '{module_type}s'. Change this in your configuration file and try again. 
\ +Here's how that would look: \n\nsteps:\n {module_type}s:\n - [your_{module_type}_name_here]\n {'extractors:...' if module_type == 'feeder' else '...'}\n" + ) + if module_type == "extractor" and config["steps"].get("archivers"): + raise SetupError( + "As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. \ +Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_here]\n enrichers:...\n" + ) + raise SetupError( + f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)" + ) def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None: - # modules parser to get the overridden 'steps' values modules_parser = argparse.ArgumentParser( add_help=False, @@ -92,7 +134,9 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ self.add_modules_args(modules_parser) cli_modules, unused_args = modules_parser.parse_known_args(unused_args) for module_type in MODULE_TYPES: - yaml_config['steps'][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config['steps'].get(f"{module_type}s", []) + yaml_config["steps"][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config[ + "steps" + ].get(f"{module_type}s", []) parser = DefaultValidatingParser( add_help=False, @@ -115,30 +159,32 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ enabled_modules = [] # first loads the modules from the config file, then from the command line for module_type in MODULE_TYPES: - enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", [])) + enabled_modules.extend(yaml_config["steps"].get(f"{module_type}s", [])) # clear out duplicates, but keep the order enabled_modules = list(dict.fromkeys(enabled_modules)) - avail_modules = 
self.module_factory.available_modules(limit_to_modules=enabled_modules, suppress_warnings=True) + avail_modules = self.module_factory.available_modules( + limit_to_modules=enabled_modules, suppress_warnings=True + ) self.add_individual_module_args(avail_modules, parser) - elif basic_config.mode == 'simple': + elif basic_config.mode == "simple": simple_modules = [module for module in self.module_factory.available_modules() if not module.requires_setup] self.add_individual_module_args(simple_modules, parser) # add them to the config for module in simple_modules: for module_type in module.type: - yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name) + yaml_config["steps"].setdefault(f"{module_type}s", []).append(module.name) else: # load all modules, they're not using the 'simple' mode all_modules = self.module_factory.available_modules() # add all the modules to the steps for module in all_modules: for module_type in module.type: - yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name) + yaml_config["steps"].setdefault(f"{module_type}s", []).append(module.name) self.add_individual_module_args(all_modules, parser) - + parser.set_defaults(**to_dot_notation(yaml_config)) # reload the parser with the new arguments, now that we have them @@ -164,43 +210,76 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ store_yaml(config, basic_config.config_file) return config - + def add_modules_args(self, parser: argparse.ArgumentParser = None): if not parser: parser = self.parser # Module loading from the command line for module_type in MODULE_TYPES: - parser.add_argument(f'--{module_type}s', dest=f'{module_type}s', nargs='+', help=f'the {module_type}s to use', default=[], action=UniqueAppendAction) + parser.add_argument( + f"--{module_type}s", + dest=f"{module_type}s", + nargs="+", + help=f"the {module_type}s to use", + default=[], + action=UniqueAppendAction, + ) def add_additional_args(self, parser: 
argparse.ArgumentParser = None): if not parser: parser = self.parser - parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \ + parser.add_argument( + "--authentication", + dest="authentication", + help="A dictionary of sites and their authentication methods \ (token, username etc.) that extractors can use to log into \ a website. If passing this on the command line, use a JSON string. \ - You may also pass a path to a valid JSON/YAML file which will be parsed.', - default={}, - nargs="?", - action=AuthenticationJsonParseAction) + You may also pass a path to a valid JSON/YAML file which will be parsed.", + default={}, + nargs="?", + action=AuthenticationJsonParseAction, + ) # logging arguments - parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO', type=str.upper) - parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None) - parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None) - - def add_individual_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None: + parser.add_argument( + "--logging.level", + action="store", + dest="logging.level", + choices=["INFO", "DEBUG", "ERROR", "WARNING"], + help="the logging level to use", + default="INFO", + type=str.upper, + ) + parser.add_argument( + "--logging.file", action="store", dest="logging.file", help="the logging file to write to", default=None + ) + parser.add_argument( + "--logging.rotation", + action="store", + dest="logging.rotation", + help="the logging rotation to use", + default=None, + ) + def add_individual_module_args( + self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None + ) -> None: if not modules: 
modules = self.module_factory.available_modules() - + for module in modules: - if module.name == 'cli_feeder': + if module.name == "cli_feeder": # special case. For the CLI feeder, allow passing URLs directly on the command line without setting --cli_feeder.urls= - parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml') + parser.add_argument( + "urls", + nargs="*", + default=[], + help="URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml", + ) continue - + if not module.configs: # this module has no configs, don't show anything in the help # (TODO: do we want to show something about this module though, like a description?) @@ -209,21 +288,21 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...") for name, kwargs in module.configs.items(): - if not kwargs.get('metavar', None): + if not kwargs.get("metavar", None): # make a nicer metavar, metavar is what's used in the help, e.g. 
--cli_feeder.urls [METAVAR] - kwargs['metavar'] = name.upper() + kwargs["metavar"] = name.upper() - if kwargs.get('required', False): + if kwargs.get("required", False): # required args shouldn't have a 'default' value, remove it - kwargs.pop('default', None) + kwargs.pop("default", None) - kwargs.pop('cli_set', None) - should_store = kwargs.pop('should_store', False) - kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}" + kwargs.pop("cli_set", None) + should_store = kwargs.pop("should_store", False) + kwargs["dest"] = f"{module.name}.{kwargs.pop('dest', name)}" try: - kwargs['type'] = getattr(validators, kwargs.get('type', '__invalid__')) + kwargs["type"] = getattr(validators, kwargs.get("type", "__invalid__")) except AttributeError: - kwargs['type'] = __builtins__.get(kwargs.get('type'), str) + kwargs["type"] = __builtins__.get(kwargs.get("type"), str) arg = group.add_argument(f"--{module.name}.{name}", **kwargs) arg.should_store = should_store @@ -238,12 +317,11 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ self.basic_parser.exit() def setup_logging(self, config): + logging_config = config["logging"] - logging_config = config['logging'] - - if logging_config.get('enabled', True) is False: + if logging_config.get("enabled", True) is False: # disabled logging settings, they're set on a higher level - logger.disable('auto_archiver') + logger.disable("auto_archiver") return # setup loguru logging @@ -253,38 +331,45 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ pass # add other logging info - if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0 - self.logger_id = logger.add(sys.stderr, level=logging_config['level']) - if log_file := logging_config['file']: - logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation']) + if self.logger_id is None: # note - need direct comparison to 
None since need to consider falsy value 0 + self.logger_id = logger.add(sys.stderr, level=logging_config["level"]) + if log_file := logging_config["file"]: + logger.add(log_file) if not logging_config["rotation"] else logger.add( + log_file, rotation=logging_config["rotation"] + ) def install_modules(self, modules_by_type): """ - Traverses all modules in 'steps' and loads them into the orchestrator, storing them in the + Traverses all modules in 'steps' and loads them into the orchestrator, storing them in the orchestrator's attributes (self.feeders, self.extractors etc.). If no modules of a certain type are loaded, the program will exit with an error message. """ invalid_modules = [] for module_type in MODULE_TYPES: - step_items = [] modules_to_load = modules_by_type[f"{module_type}s"] if not modules_to_load: - raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)") + raise SetupError( + f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)" + ) def check_steps_ok(): if not len(step_items): if len(modules_to_load): - logger.error(f"Unable to load any {module_type}s. Tried the following, but none were available: {modules_to_load}") - raise SetupError(f"NO {module_type.upper()}S LOADED. Please check your configuration and try again.") - + logger.error( + f"Unable to load any {module_type}s. Tried the following, but none were available: {modules_to_load}" + ) + raise SetupError( + f"NO {module_type.upper()}S LOADED. Please check your configuration and try again." + ) - if (module_type == 'feeder' or module_type == 'formatter') and len(step_items) > 1: - raise SetupError(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. 
Please remove one of the following from your configuration file: {modules_to_load}") + if (module_type == "feeder" or module_type == "formatter") and len(step_items) > 1: + raise SetupError( + f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}" + ) for module in modules_to_load: - if module in invalid_modules: continue @@ -293,7 +378,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ loaded_module: BaseModule = self.module_factory.get_module(module, self.config) except (KeyboardInterrupt, Exception) as e: logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}") - if loaded_module and module_type == 'extractor': + if loaded_module and module_type == "extractor": loaded_module.cleanup() raise e @@ -308,11 +393,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ def load_config(self, config_file: str) -> dict: if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE: - logger.error(f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.") + logger.error( + f"The configuration file {config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings." 
+ ) raise FileNotFoundError(f"Configuration file {config_file} not found") return read_yaml(config_file) - + def setup_config(self, args: list) -> dict: """ Sets up the configuration file, merging the default config with the user's config @@ -335,13 +422,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ yaml_config = self.load_config(basic_config.config_file) return self.setup_complete_parser(basic_config, yaml_config, unused_args) - + def check_for_updates(self): response = requests.get("https://pypi.org/pypi/auto-archiver/json").json() - latest_version = response['info']['version'] + latest_version = response["info"]["version"] # check version compared to current version if latest_version != __version__: - if os.environ.get('RUNNING_IN_DOCKER'): + if os.environ.get("RUNNING_IN_DOCKER"): update_cmd = "`docker pull bellingcat/auto-archiver:latest`" else: update_cmd = "`pip install --upgrade auto-archiver`" @@ -351,33 +438,36 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ logger.warning(f"Make sure to update to the latest version using: {update_cmd}") logger.warning("") - def setup(self, args: list): """ Function to configure all setup of the orchestrator: setup configs and load modules. - + This method should only ever be called once """ self.check_for_updates() if self.setup_finished: - logger.warning("The `setup_config()` function should only ever be run once. \ + logger.warning( + "The `setup_config()` function should only ever be run once. \ If you need to re-run the setup, please re-instantiate a new instance of the orchestrator. \ For code implementatations, you should call .setup_config() once then you may call .feed() \ - multiple times to archive multiple URLs.") + multiple times to archive multiple URLs." 
+ ) return self.setup_basic_parser() self.config = self.setup_config(args) logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========") - self.install_modules(self.config['steps']) + self.install_modules(self.config["steps"]) # log out the modules that were loaded for module_type in MODULE_TYPES: - logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s"))) - + logger.info( + f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")) + ) + self.setup_finished = True def _command_line_run(self, args: list) -> Generator[Metadata]: @@ -385,9 +475,9 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ This is the main entry point for the orchestrator, when run from the command line. :param args: list of arguments to pass to the orchestrator - these are the command line args - + You should not call this method from code implementations. - + This method sets up the configuration, loads the modules, and runs the feed. If you wish to make code invocations yourself, you should use the 'setup' and 'feed' methods separately. 
To test configurations, without loading any modules you can also first call 'setup_configs' @@ -396,7 +486,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ self.setup(args) return self.feed() except Exception as e: - logger.error(e) + logger.error(e, exc_info=True) exit(1) def cleanup(self) -> None: @@ -405,7 +495,6 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ e.cleanup() def feed(self) -> Generator[Metadata]: - url_count = 0 for feeder in self.feeders: for item in feeder: @@ -436,7 +525,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ self.cleanup() exit() except Exception as e: - logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}') + logger.error(f"Got unexpected error on item {item}: {e}\n{traceback.format_exc()}") for d in self.databases: if isinstance(e, AssertionError): d.failed(item, str(e)) @@ -451,13 +540,13 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ def archive(self, result: Metadata) -> Union[Metadata, None]: """ - Runs the archiving process for a single URL - 1. Each archiver can sanitize its own URLs - 2. Check for cached results in Databases, and signal start to the databases - 3. Call Archivers until one succeeds - 4. Call Enrichers - 5. Store all downloaded/generated media - 6. Call selected Formatter and store formatted if needed + Runs the archiving process for a single URL + 1. Each archiver can sanitize its own URLs + 2. Check for cached results in Databases, and signal start to the databases + 3. Call Archivers until one succeeds + 4. Call Enrichers + 5. Store all downloaded/generated media + 6. 
Call selected Formatter and store formatted if needed """ original_url = result.get_url().strip() @@ -528,7 +617,6 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}") return result - def setup_authentication(self, config: dict) -> dict: """ @@ -537,7 +625,7 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ Split up strings into multiple sites if they are comma separated """ - authentication = config.get('authentication', {}) + authentication = config.get("authentication", {}) # extract out concatenated sites for key, val in copy(authentication).items(): @@ -546,8 +634,8 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_ site = site.strip() authentication[site] = val del authentication[key] - - config['authentication'] = authentication + + config["authentication"] = authentication return config # Helper Properties diff --git a/src/auto_archiver/core/storage.py b/src/auto_archiver/core/storage.py index c73e29c..3205f5a 100644 --- a/src/auto_archiver/core/storage.py +++ b/src/auto_archiver/core/storage.py @@ -32,16 +32,16 @@ from auto_archiver.utils.misc import random_str from auto_archiver.core import Media, BaseModule, Metadata from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher + class Storage(BaseModule): - """ Base class for implementing storage modules in the media archiving framework. Subclasses must implement the `get_cdn_url` and `uploadf` methods to define their behavior. 
""" - def store(self, media: Media, url: str, metadata: Metadata=None) -> None: - if media.is_stored(in_storage=self): + def store(self, media: Media, url: str, metadata: Metadata = None) -> None: + if media.is_stored(in_storage=self): logger.debug(f"{media.key} already stored, skipping") return @@ -73,18 +73,18 @@ class Storage(BaseModule): This method should not be called directly, but instead be called through the 'store' method, which sets up the media for storage. """ - logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}') - with open(media.filename, 'rb') as f: + logger.debug(f"[{self.__class__.__name__}] storing file {media.filename} with key {media.key}") + with open(media.filename, "rb") as f: return self.uploadf(f, media, **kwargs) def set_key(self, media: Media, url: str, metadata: Metadata) -> None: """takes the media and optionally item info and generates a key""" - + if media.key is not None and len(media.key) > 0: # media key is already set return - folder = metadata.get_context('folder', '') + folder = metadata.get_context("folder", "") filename, ext = os.path.splitext(media.filename) # Handle path_generator logic @@ -104,12 +104,11 @@ class Storage(BaseModule): filename = random_str(24) elif filename_generator == "static": # load the hash_enricher module - he = self.module_factory.get_module("hash_enricher", self.config) + he: HashEnricher = self.module_factory.get_module("hash_enricher", self.config) hd = he.calculate_hash(media.filename) filename = hd[:24] else: raise ValueError(f"Invalid filename_generator: {filename_generator}") - - key = os.path.join(folder, path, f"{filename}{ext}") - media._key = key \ No newline at end of file + key = os.path.join(folder, path, f"{filename}{ext}") + media._key = key diff --git a/src/auto_archiver/modules/gsheet_feeder_db/__manifest__.py b/src/auto_archiver/modules/gsheet_feeder_db/__manifest__.py index 6547233..5143218 100644 --- 
a/src/auto_archiver/modules/gsheet_feeder_db/__manifest__.py +++ b/src/auto_archiver/modules/gsheet_feeder_db/__manifest__.py @@ -12,9 +12,7 @@ "default": None, "help": "the id of the sheet to archive (alternative to 'sheet' config)", }, - "header": {"default": 1, - "help": "index of the header row (starts at 1)", - "type": "int"}, + "header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"}, "service_account": { "default": "secrets/service_account.json", "help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html", diff --git a/src/auto_archiver/modules/local_storage/local_storage.py b/src/auto_archiver/modules/local_storage/local_storage.py index 54f4a0e..fdc6978 100644 --- a/src/auto_archiver/modules/local_storage/local_storage.py +++ b/src/auto_archiver/modules/local_storage/local_storage.py @@ -1,4 +1,3 @@ - import shutil from typing import IO import os @@ -8,12 +7,13 @@ from auto_archiver.core import Media from auto_archiver.core import Storage from auto_archiver.core.consts import SetupError + class LocalStorage(Storage): - - def setup(self) -> None: if len(self.save_to) > 200: - raise SetupError("Your save_to path is too long, this will cause issues saving files on your computer. Please use a shorter path.") + raise SetupError( + "Your save_to path is too long, this will cause issues saving files on your computer. Please use a shorter path." 
+ ) def get_cdn_url(self, media: Media) -> str: dest = media.key @@ -25,18 +25,18 @@ class LocalStorage(Storage): def set_key(self, media, url, metadata): # clarify we want to save the file to the save_to folder - old_folder = metadata.get('folder', '') - metadata.set_context('folder', os.path.join(self.save_to, metadata.get('folder', ''))) + old_folder = metadata.get("folder", "") + metadata.set_context("folder", os.path.join(self.save_to, metadata.get("folder", ""))) super().set_key(media, url, metadata) # don't impact other storages that might want a different 'folder' set - metadata.set_context('folder', old_folder) + metadata.set_context("folder", old_folder) def upload(self, media: Media, **kwargs) -> bool: # override parent so that we can use shutil.copy2 and keep metadata dest = media.key os.makedirs(os.path.dirname(dest), exist_ok=True) - logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}') + logger.debug(f"[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}") res = shutil.copy2(media.filename, dest) logger.info(res) @@ -44,4 +44,4 @@ class LocalStorage(Storage): # must be implemented even if unused def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: - pass \ No newline at end of file + pass diff --git a/src/auto_archiver/modules/s3_storage/s3_storage.py b/src/auto_archiver/modules/s3_storage/s3_storage.py index bb87812..abac4f7 100644 --- a/src/auto_archiver/modules/s3_storage/s3_storage.py +++ b/src/auto_archiver/modules/s3_storage/s3_storage.py @@ -1,4 +1,3 @@ - from typing import IO import boto3 @@ -11,18 +10,20 @@ from auto_archiver.utils.misc import calculate_file_hash, random_str NO_DUPLICATES_FOLDER = "no-dups/" -class S3Storage(Storage): +class S3Storage(Storage): def setup(self) -> None: self.s3 = boto3.client( - 's3', + "s3", region_name=self.region, endpoint_url=self.endpoint_url.format(region=self.region), aws_access_key_id=self.key, - 
aws_secret_access_key=self.secret + aws_secret_access_key=self.secret, ) if self.random_no_duplicate: - logger.warning("random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`.") + logger.warning( + "random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`." + ) def get_cdn_url(self, media: Media) -> str: return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key) @@ -32,13 +33,13 @@ class S3Storage(Storage): return True extra_args = kwargs.get("extra_args", {}) - if not self.private and 'ACL' not in extra_args: - extra_args['ACL'] = 'public-read' + if not self.private and "ACL" not in extra_args: + extra_args["ACL"] = "public-read" - if 'ContentType' not in extra_args: + if "ContentType" not in extra_args: try: if media.mimetype: - extra_args['ContentType'] = media.mimetype + extra_args["ContentType"] = media.mimetype except Exception as e: logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}") self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args) @@ -50,21 +51,21 @@ class S3Storage(Storage): hd = calculate_file_hash(media.filename) path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24]) - if existing_key:=self.file_in_folder(path): + if existing_key := self.file_in_folder(path): media._key = existing_key media.set("previously archived", True) logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}") return False - + _, ext = os.path.splitext(media.key) media._key = os.path.join(path, f"{random_str(24)}{ext}") return True - def file_in_folder(self, path:str) -> str: + def file_in_folder(self, path: str) -> str: # checks if path exists and is not an empty folder - if not path.endswith('/'): - path = path + '/' - resp = self.s3.list_objects(Bucket=self.bucket, Prefix=path, Delimiter='/', MaxKeys=1) - if 'Contents' in resp: - return resp['Contents'][0]['Key'] - return 
False \ No newline at end of file + if not path.endswith("/"): + path = path + "/" + resp = self.s3.list_objects(Bucket=self.bucket, Prefix=path, Delimiter="/", MaxKeys=1) + if "Contents" in resp: + return resp["Contents"][0]["Key"] + return False diff --git a/src/auto_archiver/modules/tiktok_tikwm_extractor/__init__.py b/src/auto_archiver/modules/tiktok_tikwm_extractor/__init__.py index 25a20f5..e1008ad 100644 --- a/src/auto_archiver/modules/tiktok_tikwm_extractor/__init__.py +++ b/src/auto_archiver/modules/tiktok_tikwm_extractor/__init__.py @@ -1 +1 @@ -from .tiktok_tikwm_extractor import TiktokTikwmExtractor \ No newline at end of file +from .tiktok_tikwm_extractor import TiktokTikwmExtractor diff --git a/src/auto_archiver/modules/tiktok_tikwm_extractor/__manifest__.py b/src/auto_archiver/modules/tiktok_tikwm_extractor/__manifest__.py index 56d8e3e..7c46a87 100644 --- a/src/auto_archiver/modules/tiktok_tikwm_extractor/__manifest__.py +++ b/src/auto_archiver/modules/tiktok_tikwm_extractor/__manifest__.py @@ -2,10 +2,7 @@ "name": "Tiktok Tikwm Extractor", "type": ["extractor"], "requires_setup": False, - "dependencies": { - "python": ["loguru", "requests"], - "bin": [] - }, + "dependencies": {"python": ["loguru", "requests"], "bin": []}, "description": """ Uses an unofficial TikTok video download platform's API to download videos: https://tikwm.com/ @@ -19,5 +16,5 @@ - If tikwm.com is down, this extractor will not work. - If tikwm.com changes their API, this extractor may break. - If no video is found, this extractor will consider the extraction failed. 
- """ + """, } diff --git a/src/auto_archiver/modules/tiktok_tikwm_extractor/tiktok_tikwm_extractor.py b/src/auto_archiver/modules/tiktok_tikwm_extractor/tiktok_tikwm_extractor.py index e7ed91a..3264199 100644 --- a/src/auto_archiver/modules/tiktok_tikwm_extractor/tiktok_tikwm_extractor.py +++ b/src/auto_archiver/modules/tiktok_tikwm_extractor/tiktok_tikwm_extractor.py @@ -12,11 +12,12 @@ class TiktokTikwmExtractor(Extractor): """ Extractor for TikTok that uses an unofficial API and can capture content that requires a login, like sensitive content. """ + TIKWM_ENDPOINT = "https://www.tikwm.com/api/?url={url}" def download(self, item: Metadata) -> bool | Metadata: url = item.get_url() - + if not re.match(TikTokIE._VALID_URL, url): return False @@ -33,7 +34,7 @@ class TiktokTikwmExtractor(Extractor): logger.error(f"failed to parse JSON response from tikwm.com for {url=}") return False - if not json_response.get('msg') == 'success' or not (api_data := json_response.get('data', {})): + if not json_response.get("msg") == "success" or not (api_data := json_response.get("data", {})): logger.error(f"failed to get a valid response from tikwm.com for {url=}: {json_response}") return False @@ -67,7 +68,7 @@ class TiktokTikwmExtractor(Extractor): if created_at := api_data.pop("create_time", None): result.set_timestamp(datetime.fromtimestamp(created_at, tz=timezone.utc)) - if (author := api_data.pop("author", None)): + if author := api_data.pop("author", None): result.set("author", author) result.set("api_data", api_data) diff --git a/src/auto_archiver/modules/wacz_extractor_enricher/__manifest__.py b/src/auto_archiver/modules/wacz_extractor_enricher/__manifest__.py index 7916049..97e3bf6 100644 --- a/src/auto_archiver/modules/wacz_extractor_enricher/__manifest__.py +++ b/src/auto_archiver/modules/wacz_extractor_enricher/__manifest__.py @@ -14,9 +14,7 @@ "help": "browsertrix-profile (for profile generation see 
https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles).", }, "docker_commands": {"default": None, "help": "if a custom docker invocation is needed"}, - "timeout": {"default": 120, - "help": "timeout for WACZ generation in seconds", - "type": "int"}, + "timeout": {"default": 120, "help": "timeout for WACZ generation in seconds", "type": "int"}, "extract_media": { "default": False, "type": "bool", diff --git a/tests/conftest.py b/tests/conftest.py index ba1f652..9754b91 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -22,7 +22,9 @@ TESTS_TO_RUN_LAST = ["test_twitter_api_archiver"] @pytest.fixture def setup_module(request): - def _setup_module(module_name, config={}): + def _setup_module(module_name, config=None): + if config is None: + config = {} module_factory = ModuleFactory() if isinstance(module_name, type): diff --git a/tests/extractors/test_tiktok_tikwm_extractor.py b/tests/extractors/test_tiktok_tikwm_extractor.py index f675ac0..690d448 100644 --- a/tests/extractors/test_tiktok_tikwm_extractor.py +++ b/tests/extractors/test_tiktok_tikwm_extractor.py @@ -24,17 +24,20 @@ class TestTiktokTikwmExtractor(TestExtractorBase): mock_logger = mocker.patch("auto_archiver.modules.tiktok_tikwm_extractor.tiktok_tikwm_extractor.logger") return mock_get, mock_logger - @pytest.mark.parametrize("url,valid_url", [ - ("https://bellingcat.com", False), - ("https://youtube.com", False), - ("https://tiktok.co/", False), - ("https://tiktok.com/", False), - ("https://www.tiktok.com/", False), - ("https://api.cool.tiktok.com/", False), - (VALID_EXAMPLE_URL, True), - ("https://www.tiktok.com/@bbcnews/video/7478038212070411542", True), - ("https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375", True), - ]) + @pytest.mark.parametrize( + "url,valid_url", + [ + ("https://bellingcat.com", False), + ("https://youtube.com", False), + ("https://tiktok.co/", False), + ("https://tiktok.com/", False), + ("https://www.tiktok.com/", 
False), + ("https://api.cool.tiktok.com/", False), + (VALID_EXAMPLE_URL, True), + ("https://www.tiktok.com/@bbcnews/video/7478038212070411542", True), + ("https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375", True), + ], + ) def test_valid_urls(self, mocker, make_item, url, valid_url): mock_get, mock_logger = self.get_mockers(mocker) if valid_url: @@ -53,17 +56,20 @@ class TestTiktokTikwmExtractor(TestExtractorBase): mock_logger.error.assert_called_once() assert mock_logger.error.call_args[0][0].startswith("failed to parse JSON response") - mock_get.return_value.json.side_effect = Exception - with pytest.raises(Exception): + mock_get.return_value.json.side_effect = ValueError + with pytest.raises(ValueError): self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) mock_get.assert_called() assert mock_get.call_count == 2 assert mock_get.return_value.json.call_count == 2 - @pytest.mark.parametrize("response", [ - ({"msg": "failure"}), - ({"msg": "success"}), - ]) + @pytest.mark.parametrize( + "response", + [ + ({"msg": "failure"}), + ({"msg": "success"}), + ], + ) def test_unsuccessful_responses(self, mocker, make_item, response): mock_get, mock_logger = self.get_mockers(mocker) mock_get.return_value.status_code = 200 @@ -74,11 +80,14 @@ class TestTiktokTikwmExtractor(TestExtractorBase): mock_logger.error.assert_called_once() assert mock_logger.error.call_args[0][0].startswith("failed to get a valid response") - @pytest.mark.parametrize("response,has_vid", [ - ({"data": {"id": 123}}, False), - ({"data": {"wmplay": "url"}}, True), - ({"data": {"play": "url"}}, True), - ]) + @pytest.mark.parametrize( + "response,has_vid", + [ + ({"data": {"id": 123}}, False), + ({"data": {"wmplay": "url"}}, True), + ({"data": {"play": "url"}}, True), + ], + ) def test_correct_extraction(self, mocker, make_item, response, has_vid): mock_get, mock_logger = self.get_mockers(mocker) mock_get.return_value.status_code = 200 @@ -102,16 +111,19 @@ class 
TestTiktokTikwmExtractor(TestExtractorBase): def test_correct_data_extracted(self, mocker, make_item): mock_get, _ = self.get_mockers(mocker) mock_get.return_value.status_code = 200 - mock_get.return_value.json.return_value = {"msg": "success", "data": { - "wmplay": "url", - "origin_cover": "cover.jpg", - "title": "Title", - "id": 123, - "duration": 60, - "create_time": 1736301699, - "author": "Author", - "other": "data" - }} + mock_get.return_value.json.return_value = { + "msg": "success", + "data": { + "wmplay": "url", + "origin_cover": "cover.jpg", + "title": "Title", + "id": 123, + "duration": 60, + "create_time": 1736301699, + "author": "Author", + "other": "data", + }, + } result = self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) assert result.is_success() @@ -129,9 +141,12 @@ class TestTiktokTikwmExtractor(TestExtractorBase): result = self.extractor.download(make_item(url)) assert result.is_success() assert len(result.media) == 2 - assert result.get_title() == "The A23a iceberg is one of the world's oldest and it's so big you can see it from space. #Iceberg #A23a #Antarctica #Ice #ClimateChange #DavidAttenborough #Ocean #Sea #SouthGeorgia #BBCNews " + assert ( + result.get_title() + == "The A23a iceberg is one of the world's oldest and it's so big you can see it from space. 
#Iceberg #A23a #Antarctica #Ice #ClimateChange #DavidAttenborough #Ocean #Sea #SouthGeorgia #BBCNews " + ) assert result.get("author").get("unique_id") == "bbcnews" - assert result.get("api_data").get("id") == '7478038212070411542' + assert result.get("api_data").get("id") == "7478038212070411542" assert result.media[1].get("duration") == 59 assert result.get("timestamp") == datetime.fromtimestamp(1741122000, tz=timezone.utc) @@ -149,6 +164,6 @@ class TestTiktokTikwmExtractor(TestExtractorBase): assert len(result.media) == 2 assert result.get_title() == "Căng nhất lúc này #ggs68 #ggs68taiwan #taiwan #dailoan #tiktoknews" assert result.get("author").get("id") == "7197400619475649562" - assert result.get("api_data").get("id") == '7441821351142362375' + assert result.get("api_data").get("id") == "7441821351142362375" assert result.media[1].get("duration") == 34 assert result.get("timestamp") == datetime.fromtimestamp(1732684060, tz=timezone.utc) diff --git a/tests/storages/test_S3_storage.py b/tests/storages/test_S3_storage.py index 9e27b3f..87da776 100644 --- a/tests/storages/test_S3_storage.py +++ b/tests/storages/test_S3_storage.py @@ -8,6 +8,7 @@ class TestS3Storage: """ Test suite for S3Storage. 
""" + module_name: str = "s3_storage" storage: Type[S3Storage] config: dict = { @@ -32,10 +33,10 @@ class TestS3Storage: """Test that S3 client is initialized with correct parameters""" assert self.storage.s3 is not None - assert self.storage.s3.meta.region_name == 'test-region' + assert self.storage.s3.meta.region_name == "test-region" def test_get_cdn_url_generation(self): - """Test CDN URL formatting """ + """Test CDN URL formatting""" media = Media("test.txt") media._key = "path/to/file.txt" url = self.storage.get_cdn_url(media) @@ -46,14 +47,14 @@ class TestS3Storage: def test_uploadf_sets_acl_public(self, mocker): media = Media("test.txt") mock_file = mocker.MagicMock() - mock_s3_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj') - mocker.patch.object(self.storage, 'is_upload_needed', return_value=True) + mock_s3_upload = mocker.patch.object(self.storage.s3, "upload_fileobj") + mocker.patch.object(self.storage, "is_upload_needed", return_value=True) self.storage.uploadf(mock_file, media) mock_s3_upload.assert_called_once_with( mock_file, - Bucket='test-bucket', + Bucket="test-bucket", Key=media.key, - ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/plain'} + ExtraArgs={"ACL": "public-read", "ContentType": "text/plain"}, ) def test_upload_decision_logic(self, mocker): @@ -61,23 +62,29 @@ class TestS3Storage: media = Media("test.txt") assert self.storage.is_upload_needed(media) is True self.storage.random_no_duplicate = True - mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value='beepboop123beepboop123beepboop123') - mock_file_in_folder = mocker.patch.object(self.storage, 'file_in_folder', return_value='existing_key.txt') + mocker.patch( + "auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash", + return_value="beepboop123beepboop123beepboop123", + ) + mock_file_in_folder = mocker.patch.object(self.storage, "file_in_folder", return_value="existing_key.txt") assert 
self.storage.is_upload_needed(media) is False - assert media.key == 'existing_key.txt' - mock_file_in_folder.assert_called_with('no-dups/beepboop123beepboop123be') + assert media.key == "existing_key.txt" + mock_file_in_folder.assert_called_with("no-dups/beepboop123beepboop123be") def test_skips_upload_when_duplicate_exists(self, mocker): """Test that upload skips when file_in_folder finds existing object""" self.storage.random_no_duplicate = True - mocker.patch.object(S3Storage, 'file_in_folder', return_value="existing_folder/existing_file.txt") + mocker.patch.object(S3Storage, "file_in_folder", return_value="existing_folder/existing_file.txt") media = Media("test.txt") media._key = "original_path.txt" - mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value="beepboop123beepboop123beepboop123") + mocker.patch( + "auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash", + return_value="beepboop123beepboop123beepboop123", + ) assert self.storage.is_upload_needed(media) is False assert media.key == "existing_folder/existing_file.txt" assert media.get("previously archived") is True - mock_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj') + mock_upload = mocker.patch.object(self.storage.s3, "upload_fileobj") result = self.storage.uploadf(None, media) mock_upload.assert_not_called() assert result is True @@ -85,21 +92,18 @@ class TestS3Storage: def test_uploads_with_correct_parameters(self, mocker): media = Media("test.txt") media._key = "original_key.txt" - mocker.patch.object(S3Storage, 'is_upload_needed', return_value=True) - media.mimetype = 'image/png' + mocker.patch.object(S3Storage, "is_upload_needed", return_value=True) + media.mimetype = "image/png" mock_file = mocker.MagicMock() - mock_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj') + mock_upload = mocker.patch.object(self.storage.s3, "upload_fileobj") self.storage.uploadf(mock_file, media) mock_upload.assert_called_once_with( 
mock_file, - Bucket='test-bucket', - Key='original_key.txt', - ExtraArgs={ - 'ACL': 'public-read', - 'ContentType': 'image/png' - } + Bucket="test-bucket", + Key="original_key.txt", + ExtraArgs={"ACL": "public-read", "ContentType": "image/png"}, ) def test_file_in_folder_exists(self, mocker): - mocker.patch.object(self.storage.s3, 'list_objects', return_value={'Contents': [{'Key': 'path/to/file.txt'}]}) - assert self.storage.file_in_folder('path/to/') == 'path/to/file.txt' + mocker.patch.object(self.storage.s3, "list_objects", return_value={"Contents": [{"Key": "path/to/file.txt"}]}) + assert self.storage.file_in_folder("path/to/") == "path/to/file.txt" diff --git a/tests/storages/test_local_storage.py b/tests/storages/test_local_storage.py index c3581df..1230e3d 100644 --- a/tests/storages/test_local_storage.py +++ b/tests/storages/test_local_storage.py @@ -1,4 +1,3 @@ - import os from pathlib import Path @@ -8,6 +7,7 @@ from auto_archiver.core import Media, Metadata from auto_archiver.modules.local_storage import LocalStorage from auto_archiver.core.consts import SetupError + @pytest.fixture def local_storage(setup_module, tmp_path) -> LocalStorage: save_to = tmp_path / "local_archive" @@ -20,6 +20,7 @@ def local_storage(setup_module, tmp_path) -> LocalStorage: } return setup_module("local_storage", configs) + @pytest.fixture def sample_media(tmp_path) -> Media: """Fixture creating a Media object with temporary source file""" @@ -27,9 +28,11 @@ def sample_media(tmp_path) -> Media: src_file.write_text("test content") return Media(filename=str(src_file)) + def test_too_long_save_path(setup_module): with pytest.raises(SetupError): - setup_module("local_storage", {"save_to": "long"*100}) + setup_module("local_storage", {"save_to": "long" * 100}) + def test_get_cdn_url_relative(local_storage): local_storage.filename_generator = "random" @@ -38,6 +41,7 @@ def test_get_cdn_url_relative(local_storage): expected = os.path.join(local_storage.save_to, media.key) assert 
local_storage.get_cdn_url(media) == expected + def test_get_cdn_url_absolute(local_storage): local_storage.filename_generator = "random" @@ -47,14 +51,14 @@ def test_get_cdn_url_absolute(local_storage): expected = os.path.abspath(os.path.join(local_storage.save_to, media.key)) assert local_storage.get_cdn_url(media) == expected + def test_upload_file_contents_and_metadata(local_storage, sample_media): local_storage.store(sample_media, "https://example.com", Metadata()) dest = os.path.join(local_storage.save_to, sample_media.key) assert Path(sample_media.filename).read_text() == Path(dest).read_text() + def test_upload_nonexistent_source(local_storage): media = Media(_key="missing.txt", filename="nonexistent.txt") with pytest.raises(FileNotFoundError): local_storage.upload(media) - - diff --git a/tests/storages/test_storage_base.py b/tests/storages/test_storage_base.py index 62f2ddc..730304e 100644 --- a/tests/storages/test_storage_base.py +++ b/tests/storages/test_storage_base.py @@ -6,32 +6,28 @@ from auto_archiver.core.metadata import Metadata, Media from auto_archiver.core.storage import Storage from auto_archiver.core.module import ModuleFactory -class TestStorageBase(object): +class TestStorageBase(object): module_name: str = None config: dict = None @pytest.fixture(autouse=True) def setup_storage(self, setup_module): - assert ( - self.module_name is not None - ), "self.module_name must be set on the subclass" + assert self.module_name is not None, "self.module_name must be set on the subclass" assert self.config is not None, "self.config must be a dict set on the subclass" - self.storage: Type[Storage] = setup_module( - self.module_name, self.config - ) + self.storage: Type[Storage] = setup_module(self.module_name, self.config) class TestBaseStorage(Storage): - name = "test_storage" def get_cdn_url(self, media): return "cdn_url" - + def uploadf(self, file, key, **kwargs): return True + @pytest.fixture def dummy_file(tmp_path): # create dummy.txt file @@ 
-39,16 +35,18 @@ def dummy_file(tmp_path): dummy_file.write_text("test content") return str(dummy_file) + @pytest.fixture def storage_base(): def _storage_base(config): storage_base = TestBaseStorage() - storage_base.config_setup({TestBaseStorage.name : config}) + storage_base.config_setup({TestBaseStorage.name: config}) storage_base.module_factory = ModuleFactory() return storage_base - + return _storage_base + @pytest.mark.parametrize( "path_generator, filename_generator, url, expected_key", [ @@ -58,11 +56,11 @@ def storage_base(): ("url", "random", "https://example.com/file/", "folder/https-example-com-file/pretend-random.txt"), ("random", "static", "https://example.com/file/", "folder/pretend-random/6ae8a75555209fd6c44157c0.txt"), ("random", "random", "https://example.com/file/", "folder/pretend-random/pretend-random.txt"), - ], ) -def test_storage_name_generation(storage_base, path_generator, filename_generator, url, - expected_key, mocker, tmp_path, dummy_file): +def test_storage_name_generation( + storage_base, path_generator, filename_generator, url, expected_key, mocker, tmp_path, dummy_file +): mock_random = mocker.patch("auto_archiver.core.storage.random_str") mock_random.return_value = "pretend-random" @@ -89,10 +87,10 @@ def test_really_long_name(storage_base, dummy_file): } storage: Storage = storage_base(config) - url = f"https://example.com/{'file'*100}" + url = f"https://example.com/{'file' * 100}" media = Media(filename=dummy_file) storage.set_key(media, url, Metadata()) - assert media.key == f"https-example-com-{'file'*13}/6ae8a75555209fd6c44157c0.txt" + assert media.key == f"https-example-com-{'file' * 13}/6ae8a75555209fd6c44157c0.txt" def test_storage_loads_hash_enricher(storage_base, dummy_file):