Add documentation, pre-commit hook, more make commands and

2026-06-07 19:08:30 +03:00 · 2025-03-13 13:21:32 +00:00
parent 6e52a534e7
commit e76551ba22
21 changed files with 558 additions and 270 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,7 +1,10 @@
+# Run Ruff formatter on commits.
 repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.9.10
    hooks:
-      - id: ruff
-#        args: [ --fix ]
      - id: ruff-format
+
+      # Runs Ruff linting - just checks without fixing, but blocks commit if errors are found.
+#      - id: ruff
+#        args: ["--output-format=concise"]
--- a/36
+++ b/36
@@ -9,34 +9,54 @@ help:
 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
 	@echo "Additional Commands:"
 	@echo "  make test         - Run all tests in 'tests/' with pytest"
-	@echo "  make lint         - Run ruff linter and auto-fix issues"
+	@echo "  make ruff-check   - Run Ruff linting and formatting checks (safe)"
+	@echo "  make ruff-clean   - Auto-fix Ruff linting and formatting issues"
 	@echo "  make docs         - Generate documentation (same as 'make html')"
-	@echo "  make clean_docs   - Remove generated docs"
+	@echo "  make clean-docs   - Remove generated docs"
 	@echo "  make docker-run   - Run the Docker container"
+	@echo "  make show-docs    - Build and open the documentation in a browser"
+
+

 .PHONY: test
 test:
 	@echo "Running tests..."
 	@pytest tests --disable-warnings

-.PHONY: lint
-lint:
-	@echo "Linting with ruff..."
-	@ruff check --fix .
+
+# Read-only: reports lint and formatting problems without modifying any files.
+.PHONY: ruff-check
+ruff-check:
+	@echo "Checking code style with Ruff (safe)..."
+	@ruff check . && ruff format --check .
+
+.PHONY: ruff-clean
+ruff-clean:
+	@echo "Fixing lint and formatting issues with Ruff..."
+	@ruff check . --fix
+	@ruff format .
+

 .PHONY: docs
 docs:
 	@echo "Building documentation..."
 	@$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)"

-.PHONY: clean_docs
-clean_docs:
+
+.PHONY: clean-docs
+clean-docs:
 	@echo "Cleaning up generated documentation files..."
 	@$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
 	@rm -rf "$(SOURCEDIR)/autoapi/" "$(SOURCEDIR)/modules/autogen/"
 	@echo "Cleanup complete."


+# Builds the HTML docs first so the opened page always exists and is current.
+.PHONY: show-docs
+show-docs: docs
+	@echo "Opening documentation in browser..."
+	@python -m webbrowser "file://$(abspath $(BUILDDIR)/html/index.html)"
+
 # Run Docker with default settings
 .PHONY: docker-run
 docker-run:
--- a/docs/source/development/developer_guidelines.md
+++ b/docs/source/development/developer_guidelines.md
@@ -32,4 +32,5 @@ testing
 docs
 release
 settings_page
+style_guide
 ```
--- a/docs/source/development/style_guide.md
+++ b/docs/source/development/style_guide.md
@@ -0,0 +1,39 @@
+# Style Guide
+
+The project uses [ruff](https://docs.astral.sh/ruff/) for linting and formatting.
+Our style configurations are set in the `pyproject.toml` file.
+
+We have a pre-commit hook to run the formatter before you commit, but Ruff can also be [integrated with most editors](https://docs.astral.sh/ruff/editors/setup/) to run automatically.
+
+We recommend you also run the linter before pushing code.
+
+## Running the linter
+
+Common tasks are available as Makefile commands. On Windows you may need to install `make` first (see the note below), or you can run ruff directly.
+
+This outputs a report of any issues found:
+```shell
+make ruff-check
+```
+
+This command will attempt to fix any issues it can:
+
+⚠️ Warning: This can cause breaking changes. ⚠️
+
+Ensure you check any modifications by this before committing them.
+```shell
+make ruff-clean
+```
+
+**Note:** If you're on Windows you might not have `make` installed by default.
+This is included with [Git for Windows](https://gitforwindows.org/) or you can install make via [Chocolatey](https://chocolatey.org/):
+```shell
+choco install make
+```
+
+**Running directly with ruff**
+
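+Alternatively, you can run the commands directly with ruff: `ruff check .` reports issues, while `ruff check . --fix` followed by `ruff format .` applies fixes.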
+
+Our rules are quite lenient for general usage. If you want more rigorous checks, see the [ruff documentation](https://docs.astral.sh/ruff/configuration/) for how to enable additional rules.
+You can then run those checks to surface more nuanced issues for manual review.
--- a/poetry.lock
+++ b/poetry.lock
@@ -481,6 +481,18 @@ files = [
 [package.dependencies]
 pycparser = "*"

+[[package]]
+name = "cfgv"
+version = "3.4.0"
+description = "Validate configuration and produce human readable error messages."
+optional = false
+python-versions = ">=3.8"
+groups = ["dev"]
+files = [
+    {file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"},
+    {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"},
+]
+
 [[package]]
 name = "charset-normalizer"
 version = "3.4.1"
@@ -696,6 +708,18 @@ calendars = ["convertdate (>=2.2.1)", "hijridate"]
 fasttext = ["fasttext (>=0.9.1)", "numpy (>=1.19.3,<2)"]
 langdetect = ["langdetect (>=1.0.0)"]

+[[package]]
+name = "distlib"
+version = "0.3.9"
+description = "Distribution utilities"
+optional = false
+python-versions = "*"
+groups = ["dev"]
+files = [
+    {file = "distlib-0.3.9-py2.py3-none-any.whl", hash = "sha256:47f8c22fd27c27e25a65601af709b38e4f0a45ea4fc2e710f65755fa8caaaf87"},
+    {file = "distlib-0.3.9.tar.gz", hash = "sha256:a60f20dea646b8a33f3e7772f74dc0b2d0772d2837ee1342a00645c81edf9403"},
+]
+
 [[package]]
 name = "docutils"
 version = "0.21.2"
@@ -742,6 +766,23 @@ future = "*"
 [package.extras]
 dev = ["Sphinx (==2.1.0)", "future (==0.17.1)", "numpy (==1.16.4)", "pytest (==4.6.1)", "pytest-mock (==1.10.4)", "tox (==3.12.1)"]

+[[package]]
+name = "filelock"
+version = "3.17.0"
+description = "A platform independent file lock."
+optional = false
+python-versions = ">=3.9"
+groups = ["dev"]
+files = [
+    {file = "filelock-3.17.0-py3-none-any.whl", hash = "sha256:533dc2f7ba78dc2f0f531fc6c4940addf7b70a481e269a5a3b93be94ffbe8338"},
+    {file = "filelock-3.17.0.tar.gz", hash = "sha256:ee4e77401ef576ebb38cd7f13b9b28893194acc20a8e68e18730ba9c0e54660e"},
+]
+
+[package.extras]
+docs = ["furo (>=2024.8.6)", "sphinx (>=8.1.3)", "sphinx-autodoc-typehints (>=3)"]
+testing = ["covdefaults (>=2.3)", "coverage (>=7.6.10)", "diff-cover (>=9.2.1)", "pytest (>=8.3.4)", "pytest-asyncio (>=0.25.2)", "pytest-cov (>=6)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.28.1)"]
+typing = ["typing-extensions (>=4.12.2)"]
+
 [[package]]
 name = "future"
 version = "1.0.0"
@@ -919,6 +960,21 @@ files = [
 [package.dependencies]
 pyparsing = {version = ">=2.4.2,<3.0.0 || >3.0.0,<3.0.1 || >3.0.1,<3.0.2 || >3.0.2,<3.0.3 || >3.0.3,<4", markers = "python_version > \"3.0\""}

+[[package]]
+name = "identify"
+version = "2.6.9"
+description = "File identification library for Python"
+optional = false
+python-versions = ">=3.9"
+groups = ["dev"]
+files = [
+    {file = "identify-2.6.9-py2.py3-none-any.whl", hash = "sha256:c98b4322da415a8e5a70ff6e51fbc2d2932c015532d77e9f8537b4ba7813b150"},
+    {file = "identify-2.6.9.tar.gz", hash = "sha256:d40dfe3142a1421d8518e3d3985ef5ac42890683e32306ad614a29490abeb6bf"},
+]
+
+[package.extras]
+license = ["ukkonen"]
+
 [[package]]
 name = "idna"
 version = "3.10"
@@ -1260,6 +1316,18 @@ rtd = ["ipython", "sphinx (>=7)", "sphinx-autodoc2 (>=0.5.0,<0.6.0)", "sphinx-bo
 testing = ["beautifulsoup4", "coverage[toml]", "defusedxml", "pygments (<2.19)", "pytest (>=8,<9)", "pytest-cov", "pytest-param-files (>=0.6.0,<0.7.0)", "pytest-regressions", "sphinx-pytest"]
 testing-docutils = ["pygments", "pytest (>=8,<9)", "pytest-param-files (>=0.6.0,<0.7.0)"]

+[[package]]
+name = "nodeenv"
+version = "1.9.1"
+description = "Node.js virtual environment builder"
+optional = false
+python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
+groups = ["dev"]
+files = [
+    {file = "nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9"},
+    {file = "nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f"},
+]
+
 [[package]]
 name = "numpy"
 version = "2.1.3"
@@ -1513,6 +1581,23 @@ tests = ["check-manifest", "coverage (>=7.4.2)", "defusedxml", "markdown2", "ole
 typing = ["typing-extensions"]
 xmp = ["defusedxml"]

+[[package]]
+name = "platformdirs"
+version = "4.3.6"
+description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`."
+optional = false
+python-versions = ">=3.8"
+groups = ["dev"]
+files = [
+    {file = "platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb"},
+    {file = "platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907"},
+]
+
+[package.extras]
+docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4)"]
+test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"]
+type = ["mypy (>=1.11.2)"]
+
 [[package]]
 name = "pluggy"
 version = "1.5.0"
@@ -1529,6 +1614,25 @@ files = [
 dev = ["pre-commit", "tox"]
 testing = ["pytest", "pytest-benchmark"]

+[[package]]
+name = "pre-commit"
+version = "4.1.0"
+description = "A framework for managing and maintaining multi-language pre-commit hooks."
+optional = false
+python-versions = ">=3.9"
+groups = ["dev"]
+files = [
+    {file = "pre_commit-4.1.0-py2.py3-none-any.whl", hash = "sha256:d29e7cb346295bcc1cc75fc3e92e343495e3ea0196c9ec6ba53f49f10ab6ae7b"},
+    {file = "pre_commit-4.1.0.tar.gz", hash = "sha256:ae3f018575a588e30dfddfab9a05448bfbd6b73d78709617b5a2b853549716d4"},
+]
+
+[package.dependencies]
+cfgv = ">=2.0.0"
+identify = ">=1.0.0"
+nodeenv = ">=0.11.1"
+pyyaml = ">=5.1"
+virtualenv = ">=20.10.0"
+
 [[package]]
 name = "proto-plus"
 version = "1.26.0"
@@ -1902,7 +2006,7 @@ version = "6.0.2"
 description = "YAML parser and emitter for Python"
 optional = false
 python-versions = ">=3.8"
-groups = ["docs"]
+groups = ["dev", "docs"]
 files = [
    {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"},
    {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"},
@@ -2911,6 +3015,27 @@ typing-extensions = {version = ">=4.0", markers = "python_version < \"3.11\""}
 [package.extras]
 standard = ["colorama (>=0.4)", "httptools (>=0.6.3)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1)", "watchfiles (>=0.13)", "websockets (>=10.4)"]

+[[package]]
+name = "virtualenv"
+version = "20.29.3"
+description = "Virtual Python Environment builder"
+optional = false
+python-versions = ">=3.8"
+groups = ["dev"]
+files = [
+    {file = "virtualenv-20.29.3-py3-none-any.whl", hash = "sha256:3e3d00f5807e83b234dfb6122bf37cfadf4be216c53a49ac059d02414f819170"},
+    {file = "virtualenv-20.29.3.tar.gz", hash = "sha256:95e39403fcf3940ac45bc717597dba16110b74506131845d9b687d5e73d947ac"},
+]
+
+[package.dependencies]
+distlib = ">=0.3.7,<1"
+filelock = ">=3.12.2,<4"
+platformdirs = ">=3.9.1,<5"
+
+[package.extras]
+docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
+test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"]
+
 [[package]]
 name = "vk-api"
 version = "11.9.9"
@@ -3213,4 +3338,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"]
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.10,<3.13"
-content-hash = "0feae518c3a51717bd80e90eea3cd3ed53925af656f00b662c856bae38a742bb"
+content-hash = "fbd6cdff4eb38021115a8cd361df7c292733028822f92f45cb667971c4bce901"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -65,6 +65,7 @@ autopep8 = "^2.3.1"
 pytest-loguru = "^0.4.0"
 pytest-mock = "^3.14.0"
 ruff = "^0.9.10"
+pre-commit = "^4.1.0"

 [tool.poetry.group.docs.dependencies]
 sphinx = "^8.1.3"
@@ -96,23 +97,23 @@ markers = [
 #exclude = ["docs"]
 line-length = 120
 # Remove this for a more detailed lint report
-#output-format = "concise"
+output-format = "concise"


 [tool.ruff.lint]
-#add bugbear?
-# I : isort
-# UP : upgrade, e.g. use fstrings
-# ANN : annotations
-extend-select = ["B"]
+# Extend the rules to check for by adding them to this option:
+# See documentation for more details: https://docs.astral.sh/ruff/rules/
+#extend-select = ["B"]

 # Ignore unused imports as some are currently required for lazy loading
-# This can be removed for a `lint check` run which is manually reviewed
-ignore = ["F401"]
+# This can be removed for a `ruff check` run which is manually reviewed
+#ignore = ["F401"]

 [tool.ruff.lint.per-file-ignores]
 # Ignore import violations in __init__.py files
 "__init__.py" = ["F401", "F403"]
+# Ignore 'useless expression' in manifest files.
+"__manifest__.py" = ["B018"]

 [tool.ruff.format]
 docstring-code-format = false
--- a/src/auto_archiver/core/consts.py
+++ b/src/auto_archiver/core/consts.py
@@ -1,25 +1,19 @@
 class SetupError(ValueError):
    pass

-MODULE_TYPES = [
-    'feeder',
-    'extractor',
-    'enricher',
-    'database',
-    'storage',
-    'formatter'
-]
+
+MODULE_TYPES = ["feeder", "extractor", "enricher", "database", "storage", "formatter"]

 MANIFEST_FILE = "__manifest__.py"

 DEFAULT_MANIFEST = {
-    'name': '', # the display name of the module
-    'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
-    'type': [], # the type of the module, can be one or more of MODULE_TYPES
-    'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional software
-    'description': '', # a description of the module
-    'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
-    'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
-    'version': '1.0', # the version of the module
-    'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
+    "name": "",  # the display name of the module
+    "author": "Bellingcat",  # creator of the module, leave this as Bellingcat or set your own name!
+    "type": [],  # the type of the module, can be one or more of MODULE_TYPES
+    "requires_setup": True,  # whether or not this module requires additional setup such as setting API Keys or installing additional software
+    "description": "",  # a description of the module
+    "dependencies": {},  # external dependencies, e.g. python packages or binaries, in dictionary format
+    "entry_point": "",  # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
+    "version": "1.0",  # the version of the module
+    "configs": {},  # any configuration options this module has, these will be exposed to the user in the config file or via the command line
 }
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -1,6 +1,6 @@
-""" Orchestrates all archiving steps, including feeding items,
-    archiving them with specific archivers, enrichment, storage,
-    formatting, database operations and clean up.
+"""Orchestrates all archiving steps, including feeding items,
+archiving them with specific archivers, enrichment, storage,
+formatting, database operations and clean up.

 """

@@ -19,8 +19,17 @@ import requests

 from .metadata import Metadata, Media
 from auto_archiver.version import __version__
-from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, is_valid_config, \
-    DefaultValidatingParser, UniqueAppendAction, AuthenticationJsonParseAction, DEFAULT_CONFIG_FILE
+from .config import (
+    read_yaml,
+    store_yaml,
+    to_dot_notation,
+    merge_dicts,
+    is_valid_config,
+    DefaultValidatingParser,
+    UniqueAppendAction,
+    AuthenticationJsonParseAction,
+    DEFAULT_CONFIG_FILE,
+)
 from .module import ModuleFactory, LazyBaseModule
 from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
 from .consts import MODULE_TYPES, SetupError
@@ -30,8 +39,8 @@ if TYPE_CHECKING:
    from .base_module import BaseModule
    from .module import LazyBaseModule

-class ArchivingOrchestrator:

+class ArchivingOrchestrator:
    # instance variables
    module_factory: ModuleFactory
    setup_finished: bool
@@ -61,30 +70,63 @@ class ArchivingOrchestrator:
            epilog="Check the code at https://github.com/bellingcat/auto-archiver",
            formatter_class=RichHelpFormatter,
        )
-        parser.add_argument('--help', '-h', action='store_true', dest='help', help='show a full help message and exit')
-        parser.add_argument('--version', action='version', version=__version__)
-        parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE)
-        parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple')
+        parser.add_argument("--help", "-h", action="store_true", dest="help", help="show a full help message and exit")
+        parser.add_argument("--version", action="version", version=__version__)
+        parser.add_argument(
+            "--config",
+            action="store",
+            dest="config_file",
+            help="the filename of the YAML configuration file (defaults to 'config.yaml')",
+            default=DEFAULT_CONFIG_FILE,
+        )
+        parser.add_argument(
+            "--mode",
+            action="store",
+            dest="mode",
+            type=str,
+            choices=["simple", "full"],
+            help="the mode to run the archiver in",
+            default="simple",
+        )
        # override the default 'help' so we can inject all the configs and show those
-        parser.add_argument('-s', '--store', dest='store', default=False, help='Store the created config in the config file', action=argparse.BooleanOptionalAction)
-        parser.add_argument('--module_paths', dest='module_paths', nargs='+', default=[], help='additional paths to search for modules', action=UniqueAppendAction)
+        parser.add_argument(
+            "-s",
+            "--store",
+            dest="store",
+            default=False,
+            help="Store the created config in the config file",
+            action=argparse.BooleanOptionalAction,
+        )
+        parser.add_argument(
+            "--module_paths",
+            dest="module_paths",
+            nargs="+",
+            default=[],
+            help="additional paths to search for modules",
+            action=UniqueAppendAction,
+        )

        self.basic_parser = parser
        return parser
-    
+
    def check_steps(self, config):
        for module_type in MODULE_TYPES:
-            if not config['steps'].get(f"{module_type}s", []):
-                if module_type == 'feeder' or module_type == 'formatter' and config['steps'].get(f"{module_type}"):
-                    raise SetupError(f"It appears you have '{module_type}' set under 'steps' in your configuration file, but as of version 0.13.0 of Auto Archiver, you must use '{module_type}s'. Change this in your configuration file and try again. \
-Here's how that would look: \n\nsteps:\n  {module_type}s:\n  - [your_{module_type}_name_here]\n  {'extractors:...' if module_type == 'feeder' else '...'}\n")
-                if module_type == 'extractor' and config['steps'].get('archivers'):
-                    raise SetupError("As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. \
-Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_here]\n  enrichers:...\n")
-                raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)")
+            if not config["steps"].get(f"{module_type}s", []):
+                if module_type in ("feeder", "formatter") and config["steps"].get(module_type):  # legacy singular key
+                    raise SetupError(
+                        f"It appears you have '{module_type}' set under 'steps' in your configuration file, but as of version 0.13.0 of Auto Archiver, you must use '{module_type}s'. Change this in your configuration file and try again. \
+Here's how that would look: \n\nsteps:\n  {module_type}s:\n  - [your_{module_type}_name_here]\n  {'extractors:...' if module_type == 'feeder' else '...'}\n"
+                    )
+                if module_type == "extractor" and config["steps"].get("archivers"):
+                    raise SetupError(
+                        "As of version 0.13.0 of Auto Archiver, the 'archivers' step name has been changed to 'extractors'. Change this in your configuration file and try again. \
+Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_here]\n  enrichers:...\n"
+                    )
+                raise SetupError(
+                    f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
+                )

    def setup_complete_parser(self, basic_config: dict, yaml_config: dict, unused_args: list[str]) -> None:
-
        # modules parser to get the overridden 'steps' values
        modules_parser = argparse.ArgumentParser(
            add_help=False,
@@ -92,7 +134,9 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_
        self.add_modules_args(modules_parser)
        cli_modules, unused_args = modules_parser.parse_known_args(unused_args)
        for module_type in MODULE_TYPES:
-            yaml_config['steps'][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config['steps'].get(f"{module_type}s", [])
+            yaml_config["steps"][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config[
+                "steps"
+            ].get(f"{module_type}s", [])

        parser = DefaultValidatingParser(
            add_help=False,
@@ -115,30 +159,32 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_
            enabled_modules = []
            # first loads the modules from the config file, then from the command line
            for module_type in MODULE_TYPES:
-                enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))
+                enabled_modules.extend(yaml_config["steps"].get(f"{module_type}s", []))

            # clear out duplicates, but keep the order
            enabled_modules = list(dict.fromkeys(enabled_modules))
-            avail_modules = self.module_factory.available_modules(limit_to_modules=enabled_modules, suppress_warnings=True)
+            avail_modules = self.module_factory.available_modules(
+                limit_to_modules=enabled_modules, suppress_warnings=True
+            )
            self.add_individual_module_args(avail_modules, parser)
-        elif basic_config.mode == 'simple':
+        elif basic_config.mode == "simple":
            simple_modules = [module for module in self.module_factory.available_modules() if not module.requires_setup]
            self.add_individual_module_args(simple_modules, parser)

            # add them to the config
            for module in simple_modules:
                for module_type in module.type:
-                    yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
+                    yaml_config["steps"].setdefault(f"{module_type}s", []).append(module.name)
        else:
            # load all modules, they're not using the 'simple' mode
            all_modules = self.module_factory.available_modules()
            # add all the modules to the steps
            for module in all_modules:
                for module_type in module.type:
-                    yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
+                    yaml_config["steps"].setdefault(f"{module_type}s", []).append(module.name)

            self.add_individual_module_args(all_modules, parser)
-        
+
        parser.set_defaults(**to_dot_notation(yaml_config))

        # reload the parser with the new arguments, now that we have them
@@ -164,43 +210,76 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_
            store_yaml(config, basic_config.config_file)

        return config
-    
+
    def add_modules_args(self, parser: argparse.ArgumentParser = None):
        if not parser:
            parser = self.parser

        # Module loading from the command line
        for module_type in MODULE_TYPES:
-            parser.add_argument(f'--{module_type}s', dest=f'{module_type}s', nargs='+', help=f'the {module_type}s to use', default=[], action=UniqueAppendAction)
+            parser.add_argument(
+                f"--{module_type}s",
+                dest=f"{module_type}s",
+                nargs="+",
+                help=f"the {module_type}s to use",
+                default=[],
+                action=UniqueAppendAction,
+            )

    def add_additional_args(self, parser: argparse.ArgumentParser = None):
        if not parser:
            parser = self.parser

-        parser.add_argument('--authentication', dest='authentication', help='A dictionary of sites and their authentication methods \
+        parser.add_argument(
+            "--authentication",
+            dest="authentication",
+            help="A dictionary of sites and their authentication methods \
                                                                            (token, username etc.) that extractors can use to log into \
                                                                            a website. If passing this on the command line, use a JSON string. \
-                                                                            You may also pass a path to a valid JSON/YAML file which will be parsed.',
-                                                                            default={},
-                                                                            nargs="?",
-                                                                            action=AuthenticationJsonParseAction)
+                                                                            You may also pass a path to a valid JSON/YAML file which will be parsed.",
+            default={},
+            nargs="?",
+            action=AuthenticationJsonParseAction,
+        )

        # logging arguments
-        parser.add_argument('--logging.level', action='store', dest='logging.level', choices=['INFO', 'DEBUG', 'ERROR', 'WARNING'], help='the logging level to use', default='INFO', type=str.upper)
-        parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
-        parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
-
-    def add_individual_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
+        parser.add_argument(
+            "--logging.level",
+            action="store",
+            dest="logging.level",
+            choices=["INFO", "DEBUG", "ERROR", "WARNING"],
+            help="the logging level to use",
+            default="INFO",
+            type=str.upper,
+        )
+        parser.add_argument(
+            "--logging.file", action="store", dest="logging.file", help="the logging file to write to", default=None
+        )
+        parser.add_argument(
+            "--logging.rotation",
+            action="store",
+            dest="logging.rotation",
+            help="the logging rotation to use",
+            default=None,
+        )

+    def add_individual_module_args(
+        self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None
+    ) -> None:
        if not modules:
            modules = self.module_factory.available_modules()
-        
+
        for module in modules:
-            if module.name == 'cli_feeder':
+            if module.name == "cli_feeder":
                # special case. For the CLI feeder, allow passing URLs directly on the command line without setting --cli_feeder.urls=
-                parser.add_argument('urls', nargs='*', default=[], help='URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml')
+                parser.add_argument(
+                    "urls",
+                    nargs="*",
+                    default=[],
+                    help="URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
+                )
                continue
-                
+
            if not module.configs:
                # this module has no configs, don't show anything in the help
                # (TODO: do we want to show something about this module though, like a description?)
@@ -209,21 +288,21 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_
            group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...")

            for name, kwargs in module.configs.items():
-                if not kwargs.get('metavar', None):
+                if not kwargs.get("metavar", None):
                    # make a nicer metavar, metavar is what's used in the help, e.g. --cli_feeder.urls [METAVAR]
-                    kwargs['metavar'] = name.upper()
+                    kwargs["metavar"] = name.upper()

-                if kwargs.get('required', False):
+                if kwargs.get("required", False):
                    # required args shouldn't have a 'default' value, remove it
-                    kwargs.pop('default', None)
+                    kwargs.pop("default", None)

-                kwargs.pop('cli_set', None)
-                should_store = kwargs.pop('should_store', False)
-                kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}"
+                kwargs.pop("cli_set", None)
+                should_store = kwargs.pop("should_store", False)
+                kwargs["dest"] = f"{module.name}.{kwargs.pop('dest', name)}"
                try:
-                    kwargs['type'] = getattr(validators, kwargs.get('type', '__invalid__'))
+                    kwargs["type"] = getattr(validators, kwargs.get("type", "__invalid__"))
                except AttributeError:
-                    kwargs['type'] = __builtins__.get(kwargs.get('type'), str)
+                    kwargs["type"] = __builtins__.get(kwargs.get("type"), str)
                arg = group.add_argument(f"--{module.name}.{name}", **kwargs)
                arg.should_store = should_store

@@ -238,12 +317,11 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_
        self.basic_parser.exit()

    def setup_logging(self, config):
+        logging_config = config["logging"]

-        logging_config = config['logging']
-
-        if logging_config.get('enabled', True) is False:
+        if logging_config.get("enabled", True) is False:
            # disabled logging settings, they're set on a higher level
-            logger.disable('auto_archiver')
+            logger.disable("auto_archiver")
            return

        # setup loguru logging
@@ -253,38 +331,45 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_
            pass

        # add other logging info
-        if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
-            self.logger_id = logger.add(sys.stderr, level=logging_config['level'])
-            if log_file := logging_config['file']:
-                logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
+        if self.logger_id is None:  # note - need direct comparison to None since need to consider falsy value 0
+            self.logger_id = logger.add(sys.stderr, level=logging_config["level"])
+            if log_file := logging_config["file"]:
+                logger.add(log_file) if not logging_config["rotation"] else logger.add(
+                    log_file, rotation=logging_config["rotation"]
+                )

    def install_modules(self, modules_by_type):
        """
-        Traverses all modules in 'steps' and loads them into the orchestrator, storing them in the 
+        Traverses all modules in 'steps' and loads them into the orchestrator, storing them in the
        orchestrator's attributes (self.feeders, self.extractors etc.). If no modules of a certain type
        are loaded, the program will exit with an error message.
        """

        invalid_modules = []
        for module_type in MODULE_TYPES:
-
            step_items = []
            modules_to_load = modules_by_type[f"{module_type}s"]
            if not modules_to_load:
-                raise SetupError(f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)")
+                raise SetupError(
+                    f"No {module_type}s were configured. Make sure to set at least one {module_type} in your configuration file or on the command line (using --{module_type}s)"
+                )

            def check_steps_ok():
                if not len(step_items):
                    if len(modules_to_load):
-                        logger.error(f"Unable to load any {module_type}s. Tried the following, but none were available: {modules_to_load}")
-                    raise SetupError(f"NO {module_type.upper()}S LOADED. Please check your configuration and try again.")
-                
+                        logger.error(
+                            f"Unable to load any {module_type}s. Tried the following, but none were available: {modules_to_load}"
+                        )
+                    raise SetupError(
+                        f"NO {module_type.upper()}S LOADED. Please check your configuration and try again."
+                    )

-                if (module_type == 'feeder' or module_type == 'formatter') and len(step_items) > 1:
-                    raise SetupError(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
+                if (module_type == "feeder" or module_type == "formatter") and len(step_items) > 1:
+                    raise SetupError(
+                        f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}"
+                    )

            for module in modules_to_load:
-
                if module in invalid_modules:
                    continue

@@ -293,7 +378,7 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_
                    loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
                except (KeyboardInterrupt, Exception) as e:
                    logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
-                    if loaded_module and module_type == 'extractor':
+                    if loaded_module and module_type == "extractor":
                        loaded_module.cleanup()
                    raise e

@@ -308,11 +393,13 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_

    def load_config(self, config_file: str) -> dict:
        if not os.path.exists(config_file) and config_file != DEFAULT_CONFIG_FILE:
-            logger.error(f"The configuration file {config_file} was  not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
+            logger.error(
+                f"The configuration file {config_file} was  not found. Make sure the file exists and try again, or run without the --config file to use the default settings."
+            )
            raise FileNotFoundError(f"Configuration file {config_file} not found")

        return read_yaml(config_file)
-    
+
    def setup_config(self, args: list) -> dict:
        """
        Sets up the configuration file, merging the default config with the user's config
@@ -335,13 +422,13 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_
        yaml_config = self.load_config(basic_config.config_file)

        return self.setup_complete_parser(basic_config, yaml_config, unused_args)
-    
+
    def check_for_updates(self):
        response = requests.get("https://pypi.org/pypi/auto-archiver/json").json()
-        latest_version = response['info']['version']
+        latest_version = response["info"]["version"]
        # check version compared to current version
        if latest_version != __version__:
-            if os.environ.get('RUNNING_IN_DOCKER'):
+            if os.environ.get("RUNNING_IN_DOCKER"):
                update_cmd = "`docker pull bellingcat/auto-archiver:latest`"
            else:
                update_cmd = "`pip install --upgrade auto-archiver`"
@@ -351,33 +438,36 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_
            logger.warning(f"Make sure to update to the latest version using: {update_cmd}")
            logger.warning("")

-        
    def setup(self, args: list):
        """
        Function to configure all setup of the orchestrator: setup configs and load modules.
-        
+
        This method should only ever be called once
        """

        self.check_for_updates()

        if self.setup_finished:
-            logger.warning("The `setup_config()` function should only ever be run once. \
-                           If you need to re-run the setup, please re-instantiate a new instance of the orchestrator. \
-                           For code implementatations, you should call .setup_config() once then you may call .feed() \
-                           multiple times to archive multiple URLs.")
+            logger.warning(  # adjacent literals; backslash continuations put the indentation into the message
+                "The `setup()` function should only ever be run once. "
+                "If you need to re-run the setup, please re-instantiate a new instance of the orchestrator. "
+                "For code implementations, you should call .setup() once then you may call .feed() "
+                "multiple times to archive multiple URLs."
+            )
            return

        self.setup_basic_parser()
        self.config = self.setup_config(args)

        logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
-        self.install_modules(self.config['steps'])
+        self.install_modules(self.config["steps"])

        # log out the modules that were loaded
        for module_type in MODULE_TYPES:
-            logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
-        
+            logger.info(
+                f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s"))
+            )
+
        self.setup_finished = True

    def _command_line_run(self, args: list) -> Generator[Metadata]:
@@ -385,9 +475,9 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_
        This is the main entry point for the orchestrator, when run from the command line.

        :param args: list of arguments to pass to the orchestrator - these are the command line args
-        
+
        You should not call this method from code implementations.
-          
+
        This method sets up the configuration, loads the modules, and runs the feed.
        If you wish to make code invocations yourself, you should use the 'setup' and 'feed' methods separately.
        To test configurations, without loading any modules you can also first call 'setup_configs'
@@ -396,7 +486,7 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_
            self.setup(args)
            return self.feed()
        except Exception as e:
-            logger.error(e)
+            logger.exception(e)  # loguru has no exc_info kwarg; exception() logs ERROR + traceback
            exit(1)

    def cleanup(self) -> None:
@@ -405,7 +495,6 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_
            e.cleanup()

    def feed(self) -> Generator[Metadata]:
-        
        url_count = 0
        for feeder in self.feeders:
            for item in feeder:
@@ -436,7 +525,7 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_
            self.cleanup()
            exit()
        except Exception as e:
-            logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
+            logger.error(f"Got unexpected error on item {item}: {e}\n{traceback.format_exc()}")
            for d in self.databases:
                if isinstance(e, AssertionError):
                    d.failed(item, str(e))
@@ -451,13 +540,13 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_

    def archive(self, result: Metadata) -> Union[Metadata, None]:
        """
-            Runs the archiving process for a single URL
-            1. Each archiver can sanitize its own URLs
-            2. Check for cached results in Databases, and signal start to the databases
-            3. Call Archivers until one succeeds
-            4. Call Enrichers
-            5. Store all downloaded/generated media
-            6. Call selected Formatter and store formatted if needed
+        Runs the archiving process for a single URL
+        1. Each archiver can sanitize its own URLs
+        2. Check for cached results in Databases, and signal start to the databases
+        3. Call Archivers until one succeeds
+        4. Call Enrichers
+        5. Store all downloaded/generated media
+        6. Call selected Formatter and store formatted if needed
        """

        original_url = result.get_url().strip()
@@ -528,7 +617,6 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_
                logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}")

        return result
-    

    def setup_authentication(self, config: dict) -> dict:
        """
@@ -537,7 +625,7 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_
        Split up strings into multiple sites if they are comma separated
        """

-        authentication = config.get('authentication', {})
+        authentication = config.get("authentication", {})

        # extract out concatenated sites
        for key, val in copy(authentication).items():
@@ -546,8 +634,8 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_
                    site = site.strip()
                    authentication[site] = val
                del authentication[key]
-        
-        config['authentication'] = authentication
+
+        config["authentication"] = authentication
        return config

    # Helper Properties
--- a/src/auto_archiver/core/storage.py
+++ b/src/auto_archiver/core/storage.py
@@ -32,16 +32,16 @@ from auto_archiver.utils.misc import random_str
 from auto_archiver.core import Media, BaseModule, Metadata
 from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher

+
 class Storage(BaseModule):
-    
    """
    Base class for implementing storage modules in the media archiving framework.

    Subclasses must implement the `get_cdn_url` and `uploadf` methods to define their behavior.
    """

-    def store(self, media: Media, url: str, metadata: Metadata=None) -> None:
-        if media.is_stored(in_storage=self): 
+    def store(self, media: Media, url: str, metadata: Metadata = None) -> None:
+        if media.is_stored(in_storage=self):
            logger.debug(f"{media.key} already stored, skipping")
            return

@@ -73,18 +73,18 @@ class Storage(BaseModule):
        This method should not be called directly, but instead be called through the 'store' method,
        which sets up the media for storage.
        """
-        logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
-        with open(media.filename, 'rb') as f:
+        logger.debug(f"[{self.__class__.__name__}] storing file {media.filename} with key {media.key}")
+        with open(media.filename, "rb") as f:
            return self.uploadf(f, media, **kwargs)

    def set_key(self, media: Media, url: str, metadata: Metadata) -> None:
        """takes the media and optionally item info and generates a key"""
-        
+
        if media.key is not None and len(media.key) > 0:
            # media key is already set
            return

-        folder = metadata.get_context('folder', '')
+        folder = metadata.get_context("folder", "")
        filename, ext = os.path.splitext(media.filename)

        # Handle path_generator logic
@@ -104,12 +104,11 @@ class Storage(BaseModule):
            filename = random_str(24)
        elif filename_generator == "static":
            # load the hash_enricher module
-            he = self.module_factory.get_module("hash_enricher", self.config)
+            he: HashEnricher = self.module_factory.get_module("hash_enricher", self.config)
            hd = he.calculate_hash(media.filename)
            filename = hd[:24]
        else:
            raise ValueError(f"Invalid filename_generator: {filename_generator}")
-        
-        key = os.path.join(folder, path, f"{filename}{ext}")

-        media._key = key
+        key = os.path.join(folder, path, f"{filename}{ext}")
+        media._key = key
--- a/src/auto_archiver/modules/gsheet_feeder_db/manifest.py
+++ b/src/auto_archiver/modules/gsheet_feeder_db/manifest.py
@@ -12,9 +12,7 @@
            "default": None,
            "help": "the id of the sheet to archive (alternative to 'sheet' config)",
        },
-        "header": {"default": 1,
-                   "help": "index of the header row (starts at 1)",
-                   "type": "int"},
+        "header": {"default": 1, "help": "index of the header row (starts at 1)", "type": "int"},
        "service_account": {
            "default": "secrets/service_account.json",
            "help": "service account JSON file path. Learn how to create one: https://gspread.readthedocs.io/en/latest/oauth2.html",
--- a/src/auto_archiver/modules/local_storage/local_storage.py
+++ b/src/auto_archiver/modules/local_storage/local_storage.py
@@ -1,4 +1,3 @@
-
 import shutil
 from typing import IO
 import os
@@ -8,12 +7,13 @@ from auto_archiver.core import Media
 from auto_archiver.core import Storage
 from auto_archiver.core.consts import SetupError

+
 class LocalStorage(Storage):
-
-
    def setup(self) -> None:
        if len(self.save_to) > 200:
-            raise SetupError("Your save_to path is too long, this will cause issues saving files on your computer. Please use a shorter path.")
+            raise SetupError(
+                "Your save_to path is too long, this will cause issues saving files on your computer. Please use a shorter path."
+            )

    def get_cdn_url(self, media: Media) -> str:
        dest = media.key
@@ -25,18 +25,18 @@ class LocalStorage(Storage):
    def set_key(self, media, url, metadata):
        # clarify we want to save the file to the save_to folder

-        old_folder = metadata.get('folder', '')
-        metadata.set_context('folder', os.path.join(self.save_to, metadata.get('folder', '')))
+        old_folder = metadata.get_context("folder", "")  # same context slot that is restored below
+        metadata.set_context("folder", os.path.join(self.save_to, old_folder))
        super().set_key(media, url, metadata)
        # don't impact other storages that might want a different 'folder' set
-        metadata.set_context('folder', old_folder)
+        metadata.set_context("folder", old_folder)

    def upload(self, media: Media, **kwargs) -> bool:
        # override parent so that we can use shutil.copy2 and keep metadata
        dest = media.key

        os.makedirs(os.path.dirname(dest), exist_ok=True)
-        logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}')
+        logger.debug(f"[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}")

        res = shutil.copy2(media.filename, dest)
        logger.info(res)
@@ -44,4 +44,4 @@ class LocalStorage(Storage):

    # must be implemented even if unused
    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
-        pass
+        pass
--- a/src/auto_archiver/modules/s3_storage/s3_storage.py
+++ b/src/auto_archiver/modules/s3_storage/s3_storage.py
@@ -1,4 +1,3 @@
-
 from typing import IO

 import boto3
@@ -11,18 +10,20 @@ from auto_archiver.utils.misc import calculate_file_hash, random_str

 NO_DUPLICATES_FOLDER = "no-dups/"

-class S3Storage(Storage):

+class S3Storage(Storage):
    def setup(self) -> None:
        self.s3 = boto3.client(
-            's3',
+            "s3",
            region_name=self.region,
            endpoint_url=self.endpoint_url.format(region=self.region),
            aws_access_key_id=self.key,
-            aws_secret_access_key=self.secret
+            aws_secret_access_key=self.secret,
        )
        if self.random_no_duplicate:
-            logger.warning("random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`.")
+            logger.warning(
+                "random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`."
+            )

    def get_cdn_url(self, media: Media) -> str:
        return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
@@ -32,13 +33,13 @@ class S3Storage(Storage):
            return True

        extra_args = kwargs.get("extra_args", {})
-        if not self.private and 'ACL' not in extra_args:
-            extra_args['ACL'] = 'public-read'
+        if not self.private and "ACL" not in extra_args:
+            extra_args["ACL"] = "public-read"

-        if 'ContentType' not in extra_args:
+        if "ContentType" not in extra_args:
            try:
                if media.mimetype:
-                    extra_args['ContentType'] = media.mimetype
+                    extra_args["ContentType"] = media.mimetype
            except Exception as e:
                logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
        self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
@@ -50,21 +51,21 @@ class S3Storage(Storage):
            hd = calculate_file_hash(media.filename)
            path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])

-            if existing_key:=self.file_in_folder(path):
+            if existing_key := self.file_in_folder(path):
                media._key = existing_key
                media.set("previously archived", True)
                logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}")
                return False
-            
+
            _, ext = os.path.splitext(media.key)
            media._key = os.path.join(path, f"{random_str(24)}{ext}")
        return True

-    def file_in_folder(self, path:str) -> str:
+    def file_in_folder(self, path: str) -> str | bool:  # key of the first listed object, or False
        # checks if path exists and is not an empty folder
-        if not path.endswith('/'):
-            path = path + '/' 
-        resp = self.s3.list_objects(Bucket=self.bucket, Prefix=path, Delimiter='/', MaxKeys=1)
-        if 'Contents' in resp:
-            return resp['Contents'][0]['Key']
-        return False
+        if not path.endswith("/"):
+            path = path + "/"
+        resp = self.s3.list_objects(Bucket=self.bucket, Prefix=path, Delimiter="/", MaxKeys=1)
+        if "Contents" in resp:
+            return resp["Contents"][0]["Key"]
+        return False
--- a/src/auto_archiver/modules/tiktok_tikwm_extractor/__init__.py
+++ b/src/auto_archiver/modules/tiktok_tikwm_extractor/__init__.py
@@ -1 +1 @@
-from .tiktok_tikwm_extractor import TiktokTikwmExtractor
+from .tiktok_tikwm_extractor import TiktokTikwmExtractor
--- a/src/auto_archiver/modules/tiktok_tikwm_extractor/manifest.py
+++ b/src/auto_archiver/modules/tiktok_tikwm_extractor/manifest.py
@@ -2,10 +2,7 @@
    "name": "Tiktok Tikwm Extractor",
    "type": ["extractor"],
    "requires_setup": False,
-    "dependencies": {
-        "python": ["loguru", "requests"],
-        "bin": []
-    },
+    "dependencies": {"python": ["loguru", "requests"], "bin": []},
    "description": """
    Uses an unofficial TikTok video download platform's API to download videos: https://tikwm.com/
 	
@@ -19,5 +16,5 @@
    - If tikwm.com is down, this extractor will not work.
 	- If tikwm.com changes their API, this extractor may break.
 	- If no video is found, this extractor will consider the extraction failed.
-    """
+    """,
 }
--- a/src/auto_archiver/modules/tiktok_tikwm_extractor/tiktok_tikwm_extractor.py
+++ b/src/auto_archiver/modules/tiktok_tikwm_extractor/tiktok_tikwm_extractor.py
@@ -12,11 +12,12 @@ class TiktokTikwmExtractor(Extractor):
    """
    Extractor for TikTok that uses an unofficial API and can capture content that requires a login, like sensitive content.
    """
+
    TIKWM_ENDPOINT = "https://www.tikwm.com/api/?url={url}"

    def download(self, item: Metadata) -> bool | Metadata:
        url = item.get_url()
-        
+
        if not re.match(TikTokIE._VALID_URL, url):
            return False

@@ -33,7 +34,7 @@ class TiktokTikwmExtractor(Extractor):
            logger.error(f"failed to parse JSON response from tikwm.com for {url=}")
            return False

-        if not json_response.get('msg') == 'success' or not (api_data := json_response.get('data', {})):
+        if not json_response.get("msg") == "success" or not (api_data := json_response.get("data", {})):
            logger.error(f"failed to get a valid response from tikwm.com for {url=}: {json_response}")
            return False

@@ -67,7 +68,7 @@ class TiktokTikwmExtractor(Extractor):
        if created_at := api_data.pop("create_time", None):
            result.set_timestamp(datetime.fromtimestamp(created_at, tz=timezone.utc))

-        if (author := api_data.pop("author", None)):
+        if author := api_data.pop("author", None):
            result.set("author", author)

        result.set("api_data", api_data)
--- a/src/auto_archiver/modules/wacz_extractor_enricher/manifest.py
+++ b/src/auto_archiver/modules/wacz_extractor_enricher/manifest.py
@@ -14,9 +14,7 @@
            "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles).",
        },
        "docker_commands": {"default": None, "help": "if a custom docker invocation is needed"},
-        "timeout": {"default": 120,
-                    "help": "timeout for WACZ generation in seconds",
-                    "type": "int"},
+        "timeout": {"default": 120, "help": "timeout for WACZ generation in seconds", "type": "int"},
        "extract_media": {
            "default": False,
            "type": "bool",
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -22,7 +22,9 @@ TESTS_TO_RUN_LAST = ["test_twitter_api_archiver"]

@pytest.fixture
 def setup_module(request):
-    def _setup_module(module_name, config={}):
+    def _setup_module(module_name, config=None):
+        if config is None:
+            config = {}
        module_factory = ModuleFactory()

        if isinstance(module_name, type):
--- a/tests/extractors/test_tiktok_tikwm_extractor.py
+++ b/tests/extractors/test_tiktok_tikwm_extractor.py
@@ -24,17 +24,20 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
        mock_logger = mocker.patch("auto_archiver.modules.tiktok_tikwm_extractor.tiktok_tikwm_extractor.logger")
        return mock_get, mock_logger

-    @pytest.mark.parametrize("url,valid_url", [
-        ("https://bellingcat.com", False),
-        ("https://youtube.com", False),
-        ("https://tiktok.co/", False),
-        ("https://tiktok.com/", False),
-        ("https://www.tiktok.com/", False),
-        ("https://api.cool.tiktok.com/", False),
-        (VALID_EXAMPLE_URL, True),
-        ("https://www.tiktok.com/@bbcnews/video/7478038212070411542", True),
-        ("https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375", True),
-    ])
+    @pytest.mark.parametrize(
+        "url,valid_url",
+        [
+            ("https://bellingcat.com", False),
+            ("https://youtube.com", False),
+            ("https://tiktok.co/", False),
+            ("https://tiktok.com/", False),
+            ("https://www.tiktok.com/", False),
+            ("https://api.cool.tiktok.com/", False),
+            (VALID_EXAMPLE_URL, True),
+            ("https://www.tiktok.com/@bbcnews/video/7478038212070411542", True),
+            ("https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375", True),
+        ],
+    )
    def test_valid_urls(self, mocker, make_item, url, valid_url):
        mock_get, mock_logger = self.get_mockers(mocker)
        if valid_url:
@@ -53,17 +56,20 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
        mock_logger.error.assert_called_once()
        assert mock_logger.error.call_args[0][0].startswith("failed to parse JSON response")

-        mock_get.return_value.json.side_effect = Exception
-        with pytest.raises(Exception):
+        mock_get.return_value.json.side_effect = ValueError
+        with pytest.raises(ValueError):
            self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
        mock_get.assert_called()
        assert mock_get.call_count == 2
        assert mock_get.return_value.json.call_count == 2

-    @pytest.mark.parametrize("response", [
-        ({"msg": "failure"}),
-        ({"msg": "success"}),
-    ])
+    @pytest.mark.parametrize(
+        "response",
+        [
+            ({"msg": "failure"}),
+            ({"msg": "success"}),
+        ],
+    )
    def test_unsuccessful_responses(self, mocker, make_item, response):
        mock_get, mock_logger = self.get_mockers(mocker)
        mock_get.return_value.status_code = 200
@@ -74,11 +80,14 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
        mock_logger.error.assert_called_once()
        assert mock_logger.error.call_args[0][0].startswith("failed to get a valid response")

-    @pytest.mark.parametrize("response,has_vid", [
-        ({"data": {"id": 123}}, False),
-        ({"data": {"wmplay": "url"}}, True),
-        ({"data": {"play": "url"}}, True),
-    ])
+    @pytest.mark.parametrize(
+        "response,has_vid",
+        [
+            ({"data": {"id": 123}}, False),
+            ({"data": {"wmplay": "url"}}, True),
+            ({"data": {"play": "url"}}, True),
+        ],
+    )
    def test_correct_extraction(self, mocker, make_item, response, has_vid):
        mock_get, mock_logger = self.get_mockers(mocker)
        mock_get.return_value.status_code = 200
@@ -102,16 +111,19 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
    def test_correct_data_extracted(self, mocker, make_item):
        mock_get, _ = self.get_mockers(mocker)
        mock_get.return_value.status_code = 200
-        mock_get.return_value.json.return_value = {"msg": "success", "data": {
-            "wmplay": "url",
-            "origin_cover": "cover.jpg",
-            "title": "Title",
-            "id": 123,
-            "duration": 60,
-            "create_time": 1736301699,
-            "author": "Author",
-            "other": "data"
-        }}
+        mock_get.return_value.json.return_value = {
+            "msg": "success",
+            "data": {
+                "wmplay": "url",
+                "origin_cover": "cover.jpg",
+                "title": "Title",
+                "id": 123,
+                "duration": 60,
+                "create_time": 1736301699,
+                "author": "Author",
+                "other": "data",
+            },
+        }

        result = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
        assert result.is_success()
@@ -129,9 +141,12 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
        result = self.extractor.download(make_item(url))
        assert result.is_success()
        assert len(result.media) == 2
-        assert result.get_title() == "The A23a iceberg is one of the world's oldest and it's so big you can see it from space. #Iceberg  #A23a  #Antarctica  #Ice  #ClimateChange  #DavidAttenborough  #Ocean  #Sea  #SouthGeorgia  #BBCNews "
+        assert (
+            result.get_title()
+            == "The A23a iceberg is one of the world's oldest and it's so big you can see it from space. #Iceberg  #A23a  #Antarctica  #Ice  #ClimateChange  #DavidAttenborough  #Ocean  #Sea  #SouthGeorgia  #BBCNews "
+        )
        assert result.get("author").get("unique_id") == "bbcnews"
-        assert result.get("api_data").get("id") == '7478038212070411542'
+        assert result.get("api_data").get("id") == "7478038212070411542"
        assert result.media[1].get("duration") == 59
        assert result.get("timestamp") == datetime.fromtimestamp(1741122000, tz=timezone.utc)

@@ -149,6 +164,6 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
        assert len(result.media) == 2
        assert result.get_title() == "Căng nhất lúc này #ggs68 #ggs68taiwan #taiwan #dailoan #tiktoknews"
        assert result.get("author").get("id") == "7197400619475649562"
-        assert result.get("api_data").get("id") == '7441821351142362375'
+        assert result.get("api_data").get("id") == "7441821351142362375"
        assert result.media[1].get("duration") == 34
        assert result.get("timestamp") == datetime.fromtimestamp(1732684060, tz=timezone.utc)
--- a/tests/storages/test_S3_storage.py
+++ b/tests/storages/test_S3_storage.py
@@ -8,6 +8,7 @@ class TestS3Storage:
    """
    Test suite for S3Storage.
    """
+
    module_name: str = "s3_storage"
    storage: Type[S3Storage]
    config: dict = {
@@ -32,10 +33,10 @@ class TestS3Storage:
        """Test that S3 client is initialized with correct parameters"""

        assert self.storage.s3 is not None
-        assert self.storage.s3.meta.region_name == 'test-region'
+        assert self.storage.s3.meta.region_name == "test-region"

    def test_get_cdn_url_generation(self):
-        """Test CDN URL formatting """
+        """Test CDN URL formatting"""
        media = Media("test.txt")
        media._key = "path/to/file.txt"
        url = self.storage.get_cdn_url(media)
@@ -46,14 +47,14 @@ class TestS3Storage:
    def test_uploadf_sets_acl_public(self, mocker):
        media = Media("test.txt")
        mock_file = mocker.MagicMock()
-        mock_s3_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
-        mocker.patch.object(self.storage, 'is_upload_needed', return_value=True)
+        mock_s3_upload = mocker.patch.object(self.storage.s3, "upload_fileobj")
+        mocker.patch.object(self.storage, "is_upload_needed", return_value=True)
        self.storage.uploadf(mock_file, media)
        mock_s3_upload.assert_called_once_with(
            mock_file,
-            Bucket='test-bucket',
+            Bucket="test-bucket",
            Key=media.key,
-            ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/plain'}
+            ExtraArgs={"ACL": "public-read", "ContentType": "text/plain"},
        )

    def test_upload_decision_logic(self, mocker):
@@ -61,23 +62,29 @@ class TestS3Storage:
        media = Media("test.txt")
        assert self.storage.is_upload_needed(media) is True
        self.storage.random_no_duplicate = True
-        mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value='beepboop123beepboop123beepboop123')
-        mock_file_in_folder = mocker.patch.object(self.storage, 'file_in_folder', return_value='existing_key.txt')
+        mocker.patch(
+            "auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash",
+            return_value="beepboop123beepboop123beepboop123",
+        )
+        mock_file_in_folder = mocker.patch.object(self.storage, "file_in_folder", return_value="existing_key.txt")
        assert self.storage.is_upload_needed(media) is False
-        assert media.key == 'existing_key.txt'
-        mock_file_in_folder.assert_called_with('no-dups/beepboop123beepboop123be')
+        assert media.key == "existing_key.txt"
+        mock_file_in_folder.assert_called_with("no-dups/beepboop123beepboop123be")

    def test_skips_upload_when_duplicate_exists(self, mocker):
        """Test that upload skips when file_in_folder finds existing object"""
        self.storage.random_no_duplicate = True
-        mocker.patch.object(S3Storage, 'file_in_folder', return_value="existing_folder/existing_file.txt")
+        mocker.patch.object(S3Storage, "file_in_folder", return_value="existing_folder/existing_file.txt")
        media = Media("test.txt")
        media._key = "original_path.txt"
-        mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value="beepboop123beepboop123beepboop123")
+        mocker.patch(
+            "auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash",
+            return_value="beepboop123beepboop123beepboop123",
+        )
        assert self.storage.is_upload_needed(media) is False
        assert media.key == "existing_folder/existing_file.txt"
        assert media.get("previously archived") is True
-        mock_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
+        mock_upload = mocker.patch.object(self.storage.s3, "upload_fileobj")
        result = self.storage.uploadf(None, media)
        mock_upload.assert_not_called()
        assert result is True
@@ -85,21 +92,18 @@ class TestS3Storage:
    def test_uploads_with_correct_parameters(self, mocker):
        media = Media("test.txt")
        media._key = "original_key.txt"
-        mocker.patch.object(S3Storage, 'is_upload_needed', return_value=True)
-        media.mimetype = 'image/png'
+        mocker.patch.object(S3Storage, "is_upload_needed", return_value=True)
+        media.mimetype = "image/png"
        mock_file = mocker.MagicMock()
-        mock_upload = mocker.patch.object(self.storage.s3, 'upload_fileobj')
+        mock_upload = mocker.patch.object(self.storage.s3, "upload_fileobj")
        self.storage.uploadf(mock_file, media)
        mock_upload.assert_called_once_with(
            mock_file,
-            Bucket='test-bucket',
-            Key='original_key.txt',
-            ExtraArgs={
-                'ACL': 'public-read',
-                'ContentType': 'image/png'
-            }
+            Bucket="test-bucket",
+            Key="original_key.txt",
+            ExtraArgs={"ACL": "public-read", "ContentType": "image/png"},
        )

    def test_file_in_folder_exists(self, mocker):
-        mocker.patch.object(self.storage.s3, 'list_objects', return_value={'Contents': [{'Key': 'path/to/file.txt'}]})
-        assert self.storage.file_in_folder('path/to/') == 'path/to/file.txt'
+        mocker.patch.object(self.storage.s3, "list_objects", return_value={"Contents": [{"Key": "path/to/file.txt"}]})
+        assert self.storage.file_in_folder("path/to/") == "path/to/file.txt"
--- a/tests/storages/test_local_storage.py
+++ b/tests/storages/test_local_storage.py
@@ -1,4 +1,3 @@
-
 import os
 from pathlib import Path

@@ -8,6 +7,7 @@ from auto_archiver.core import Media, Metadata
 from auto_archiver.modules.local_storage import LocalStorage
 from auto_archiver.core.consts import SetupError

+
@pytest.fixture
 def local_storage(setup_module, tmp_path) -> LocalStorage:
    save_to = tmp_path / "local_archive"
@@ -20,6 +20,7 @@ def local_storage(setup_module, tmp_path) -> LocalStorage:
    }
    return setup_module("local_storage", configs)

+
@pytest.fixture
 def sample_media(tmp_path) -> Media:
    """Fixture creating a Media object with temporary source file"""
@@ -27,9 +28,11 @@ def sample_media(tmp_path) -> Media:
    src_file.write_text("test content")
    return Media(filename=str(src_file))

+
 def test_too_long_save_path(setup_module):
    with pytest.raises(SetupError):
-        setup_module("local_storage", {"save_to": "long"*100})
+        setup_module("local_storage", {"save_to": "long" * 100})
+

 def test_get_cdn_url_relative(local_storage):
    local_storage.filename_generator = "random"
@@ -38,6 +41,7 @@ def test_get_cdn_url_relative(local_storage):
    expected = os.path.join(local_storage.save_to, media.key)
    assert local_storage.get_cdn_url(media) == expected

+
 def test_get_cdn_url_absolute(local_storage):
    local_storage.filename_generator = "random"

@@ -47,14 +51,14 @@ def test_get_cdn_url_absolute(local_storage):
    expected = os.path.abspath(os.path.join(local_storage.save_to, media.key))
    assert local_storage.get_cdn_url(media) == expected

+
 def test_upload_file_contents_and_metadata(local_storage, sample_media):
    local_storage.store(sample_media, "https://example.com", Metadata())
    dest = os.path.join(local_storage.save_to, sample_media.key)
    assert Path(sample_media.filename).read_text() == Path(dest).read_text()

+
 def test_upload_nonexistent_source(local_storage):
    media = Media(_key="missing.txt", filename="nonexistent.txt")
    with pytest.raises(FileNotFoundError):
        local_storage.upload(media)
-
-
--- a/tests/storages/test_storage_base.py
+++ b/tests/storages/test_storage_base.py
@@ -6,32 +6,28 @@ from auto_archiver.core.metadata import Metadata, Media
 from auto_archiver.core.storage import Storage
 from auto_archiver.core.module import ModuleFactory

-class TestStorageBase(object):

+class TestStorageBase(object):
    module_name: str = None
    config: dict = None

    @pytest.fixture(autouse=True)
    def setup_storage(self, setup_module):
-        assert (
-            self.module_name is not None
-        ), "self.module_name must be set on the subclass"
+        assert self.module_name is not None, "self.module_name must be set on the subclass"
        assert self.config is not None, "self.config must be a dict set on the subclass"
-        self.storage: Type[Storage] = setup_module(
-            self.module_name, self.config
-        )
+        self.storage: Type[Storage] = setup_module(self.module_name, self.config)


 class TestBaseStorage(Storage):
-
    name = "test_storage"

    def get_cdn_url(self, media):
        return "cdn_url"
-    
+
    def uploadf(self, file, key, **kwargs):
        return True

+
@pytest.fixture
 def dummy_file(tmp_path):
    # create dummy.txt file
@@ -39,16 +35,18 @@ def dummy_file(tmp_path):
    dummy_file.write_text("test content")
    return str(dummy_file)

+
@pytest.fixture
 def storage_base():
    def _storage_base(config):
        storage_base = TestBaseStorage()
-        storage_base.config_setup({TestBaseStorage.name : config})
+        storage_base.config_setup({TestBaseStorage.name: config})
        storage_base.module_factory = ModuleFactory()
        return storage_base
-    
+
    return _storage_base

+
@pytest.mark.parametrize(
    "path_generator, filename_generator, url, expected_key",
    [
@@ -58,11 +56,11 @@ def storage_base():
        ("url", "random", "https://example.com/file/", "folder/https-example-com-file/pretend-random.txt"),
        ("random", "static", "https://example.com/file/", "folder/pretend-random/6ae8a75555209fd6c44157c0.txt"),
        ("random", "random", "https://example.com/file/", "folder/pretend-random/pretend-random.txt"),
-
    ],
 )
-def test_storage_name_generation(storage_base, path_generator, filename_generator, url, 
-                                 expected_key, mocker, tmp_path, dummy_file):
+def test_storage_name_generation(
+    storage_base, path_generator, filename_generator, url, expected_key, mocker, tmp_path, dummy_file
+):
    mock_random = mocker.patch("auto_archiver.core.storage.random_str")
    mock_random.return_value = "pretend-random"

@@ -89,10 +87,10 @@ def test_really_long_name(storage_base, dummy_file):
    }
    storage: Storage = storage_base(config)

-    url = f"https://example.com/{'file'*100}"
+    url = f"https://example.com/{'file' * 100}"
    media = Media(filename=dummy_file)
    storage.set_key(media, url, Metadata())
-    assert media.key == f"https-example-com-{'file'*13}/6ae8a75555209fd6c44157c0.txt"
+    assert media.key == f"https-example-com-{'file' * 13}/6ae8a75555209fd6c44157c0.txt"


 def test_storage_loads_hash_enricher(storage_base, dummy_file):