From e1a937333666862217ddda1e9baea869535d3377 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Mon, 27 Jan 2025 19:03:02 +0000 Subject: [PATCH] Refactoring for new config setup --- poetry.lock | 55 ++++++------- src/auto_archiver/base_processors/__init__.py | 6 -- src/auto_archiver/core/__init__.py | 9 ++- .../{base_processors => core}/database.py | 0 .../{base_processors => core}/enricher.py | 0 .../{base_processors => core}/extractor.py | 8 +- .../{base_processors => core}/feeder.py | 0 .../{base_processors => core}/formatter.py | 0 src/auto_archiver/core/module.py | 69 ++++++++-------- src/auto_archiver/core/orchestrator.py | 27 ++++--- .../{base_processors => core}/storage.py | 0 src/auto_archiver/modules/api_db/api_db.py | 3 +- src/auto_archiver/modules/atlos/atlos.py | 6 +- .../modules/atlos_db/atlos_db.py | 8 +- .../modules/atlos_feeder/atlos_feeder.py | 3 +- .../modules/cli_feeder/cli_feeder.py | 2 +- .../modules/console_db/console_db.py | 2 +- src/auto_archiver/modules/csv_db/csv_db.py | 2 +- .../modules/csv_feeder/csv_feeder.py | 4 +- .../modules/gdrive_storage/gdrive_storage.py | 3 +- .../modules/generic_extractor/__manifest__.py | 80 ++++++++++++++----- .../modules/generic_extractor/bluesky.py | 2 +- .../modules/generic_extractor/dropin.py | 2 +- .../generic_extractor/generic_extractor.py | 7 +- .../modules/generic_extractor/truth.py | 2 +- .../modules/generic_extractor/twitter.py | 2 +- .../modules/gsheet_db/gsheet_db.py | 7 +- .../modules/gsheet_feeder/gsheet_feeder.py | 3 +- .../modules/hash_enricher/__manifest__.py | 4 +- .../modules/hash_enricher/hash_enricher.py | 13 ++- .../modules/html_formatter/html_formatter.py | 28 ++++--- .../instagram_api_extractor.py | 4 +- .../instagram_extractor.py | 4 +- .../instagram_tbot_extractor.py | 9 +-- .../modules/local_storage/local_storage.py | 7 +- .../modules/meta_enricher/meta_enricher.py | 2 +- .../metadata_enricher/metadata_enricher.py | 2 +- .../modules/mute_formatter/mute_formatter.py | 3 +- .../pdq_hash_enricher/pdq_hash_enricher.py | 2 +- src/auto_archiver/modules/s3_storage/s3.py | 5 +- .../screenshot_enricher.py | 6 +- .../modules/ssl_enricher/ssl_enricher.py | 2 +- .../telegram_extractor/telegram_extractor.py | 2 +- .../telethon_extractor/telethon_extractor.py | 8 +- .../thumbnail_enricher/thumbnail_enricher.py | 4 +- .../timestamping_enricher.py | 8 +- .../twitter_api_extractor.py | 3 +- .../modules/vk_extractor/vk_extractor.py | 3 +- .../modules/wacz_enricher/wacz_enricher.py | 7 +- .../wayback_enricher/wayback_enricher.py | 9 +-- .../whisper_enricher/whisper_enricher.py | 10 +-- src/auto_archiver/utils/gsheet.py | 4 +- 52 files changed, 219 insertions(+), 242 deletions(-) delete mode 100644 src/auto_archiver/base_processors/__init__.py rename src/auto_archiver/{base_processors => core}/database.py (100%) rename src/auto_archiver/{base_processors => core}/enricher.py (100%) rename src/auto_archiver/{base_processors => core}/extractor.py (94%) rename src/auto_archiver/{base_processors => core}/feeder.py (100%) rename src/auto_archiver/{base_processors => core}/formatter.py (100%) rename src/auto_archiver/{base_processors => core}/storage.py (100%) diff --git a/poetry.lock b/poetry.lock index 128ede2..6d6ad8c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -64,14 +64,14 @@ typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""} [[package]] name = "attrs" -version = "24.3.0" +version = "25.1.0" description = "Classes Without Boilerplate" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "attrs-24.3.0-py3-none-any.whl", hash = "sha256:ac96cd038792094f438ad1f6ff80837353805ac950cd2aa0e0625ef19850c308"}, - {file = "attrs-24.3.0.tar.gz", hash = "sha256:8f5c07333d543103541ba7be0e2ce16eeee8130cb0b3f9238ab904ce1e85baff"}, + {file = "attrs-25.1.0-py3-none-any.whl", hash = "sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a"}, + {file = "attrs-25.1.0.tar.gz", hash = "sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e"}, ] [package.extras] @@ -152,18 +152,18 @@ lxml = ["lxml"] [[package]] name = "boto3" -version = "1.36.3" +version = "1.36.6" description = "The AWS SDK for Python" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "boto3-1.36.3-py3-none-any.whl", hash = "sha256:f9843a5d06f501d66ada06f5a5417f671823af2cf319e36ceefa1bafaaaaa953"}, - {file = "boto3-1.36.3.tar.gz", hash = "sha256:53a5307f6a3526ee2f8590e3c45efa504a3ea4532c1bfe4926c0c19bf188d141"}, + {file = "boto3-1.36.6-py3-none-any.whl", hash = "sha256:6d473f0f340d02b4e9ad5b8e68786a09728101a8b950231b89ebdaf72b6dca21"}, + {file = "boto3-1.36.6.tar.gz", hash = "sha256:b36feae061dc0793cf311468956a0a9e99215ce38bc99a1a4e55a5b105f16297"}, ] [package.dependencies] -botocore = ">=1.36.3,<1.37.0" +botocore = ">=1.36.6,<1.37.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.11.0,<0.12.0" @@ -172,14 +172,14 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.36.3" +version = "1.36.6" description = "Low-level, data-driven core of boto 3." optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "botocore-1.36.3-py3-none-any.whl", hash = "sha256:536ab828e6f90dbb000e3702ac45fd76642113ae2db1b7b1373ad24104e89255"}, - {file = "botocore-1.36.3.tar.gz", hash = "sha256:775b835e979da5c96548ed1a0b798101a145aec3cd46541d62e27dda5a94d7f8"}, + {file = "botocore-1.36.6-py3-none-any.whl", hash = "sha256:f77bbbb03fb420e260174650fb5c0cc142ec20a96967734eed2b0ef24334ef34"}, + {file = "botocore-1.36.6.tar.gz", hash = "sha256:4864c53d638da191a34daf3ede3ff1371a3719d952cc0c6bd24ce2836a38dd77"}, ] [package.dependencies] @@ -798,14 +798,14 @@ uritemplate = ">=3.0.1,<5" [[package]] name = "google-auth" -version = "2.37.0" +version = "2.38.0" description = "Google Authentication Library" optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "google_auth-2.37.0-py2.py3-none-any.whl", hash = "sha256:42664f18290a6be591be5329a96fe30184be1a1badb7292a7f686a9659de9ca0"}, - {file = "google_auth-2.37.0.tar.gz", hash = "sha256:0054623abf1f9c83492c63d3f47e77f0a544caa3d40b2d98e099a611c2dd5d00"}, + {file = "google_auth-2.38.0-py2.py3-none-any.whl", hash = "sha256:e7dae6694313f434a2727bf2906f27ad259bae090d7aa896590d86feec3d9d4a"}, + {file = "google_auth-2.38.0.tar.gz", hash = "sha256:8285113607d3b80a3f1543b75962447ba8a09fe85783432a784fdeef6ac094c4"}, ] [package.dependencies] @@ -958,13 +958,14 @@ files = [ [[package]] name = "instaloader" -version = "4.14" +version = "4.14.1" description = "Download pictures (or videos) along with their captions and other metadata from Instagram." optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "instaloader-4.14.tar.gz", hash = "sha256:754425eb17af44ce4bb6056e4eacd044a518d13b5efc11b9d80eb229bb96c652"}, + {file = "instaloader-4.14.1-py3-none-any.whl", hash = "sha256:43356f696231621ea5a93354f9a4578124fe131940ee9aa1e83c20f57e18f26d"}, + {file = "instaloader-4.14.1.tar.gz", hash = "sha256:a41a7372a18fb096b3ed545469479884de9cf768e12020c0e0e67c488d9d599c"}, ] [package.dependencies] @@ -1135,14 +1136,14 @@ files = [ [[package]] name = "marshmallow" -version = "3.25.1" +version = "3.26.0" description = "A lightweight library for converting complex datatypes to and from native Python datatypes." optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "marshmallow-3.25.1-py3-none-any.whl", hash = "sha256:ec5d00d873ce473b7f2ffcb7104286a376c354cab0c2fa12f5573dab03e87210"}, - {file = "marshmallow-3.25.1.tar.gz", hash = "sha256:f4debda3bb11153d81ac34b0d582bf23053055ee11e791b54b4b35493468040a"}, + {file = "marshmallow-3.26.0-py3-none-any.whl", hash = "sha256:1287bca04e6a5f4094822ac153c03da5e214a0a60bcd557b140f3e66991b8ca1"}, + {file = "marshmallow-3.26.0.tar.gz", hash = "sha256:eb36762a1cc76d7abf831e18a3a1b26d3d481bbc74581b8e532a3d3a8115e1cb"}, ] [package.dependencies] @@ -2087,14 +2088,14 @@ pyasn1 = ">=0.1.3" [[package]] name = "s3transfer" -version = "0.11.1" +version = "0.11.2" description = "An Amazon S3 Transfer Manager" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "s3transfer-0.11.1-py3-none-any.whl", hash = "sha256:8fa0aa48177be1f3425176dfe1ab85dcd3d962df603c3dbfc585e6bf857ef0ff"}, - {file = "s3transfer-0.11.1.tar.gz", hash = "sha256:3f25c900a367c8b7f7d8f9c34edc87e300bde424f779dc9f0a8ae4f9df9264f6"}, + {file = "s3transfer-0.11.2-py3-none-any.whl", hash = "sha256:be6ecb39fadd986ef1701097771f87e4d2f821f27f6071c872143884d2950fbc"}, + {file = "s3transfer-0.11.2.tar.gz", hash = "sha256:3b39185cb72f5acc77db1a58b6e25b977f28d20496b6e58d6813d75f464d632f"}, ] [package.dependencies] @@ -2105,14 +2106,14 @@ crt = ["botocore[crt] (>=1.36.0,<2.0a.0)"] [[package]] name = "selenium" -version = "4.28.0" +version = "4.28.1" description = "Official Python bindings for Selenium WebDriver" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "selenium-4.28.0-py3-none-any.whl", hash = "sha256:3d6a2e8e1b850a1078884ea19f4e011ecdc12263434d87a0b78769836fb82dd8"}, - {file = "selenium-4.28.0.tar.gz", hash = "sha256:a9fae6eef48d470a1b0c6e45185d96f0dafb025e8da4b346cc41e4da3ac54fa0"}, + {file = "selenium-4.28.1-py3-none-any.whl", hash = "sha256:4238847e45e24e4472cfcf3554427512c7aab9443396435b1623ef406fff1cc1"}, + {file = "selenium-4.28.1.tar.gz", hash = "sha256:0072d08670d7ec32db901bd0107695a330cecac9f196e3afb3fa8163026e022a"}, ] [package.dependencies] @@ -2421,14 +2422,14 @@ test = ["pytest"] [[package]] name = "starlette" -version = "0.45.2" +version = "0.45.3" description = "The little ASGI library that shines." optional = false python-versions = ">=3.9" groups = ["docs"] files = [ - {file = "starlette-0.45.2-py3-none-any.whl", hash = "sha256:4daec3356fb0cb1e723a5235e5beaf375d2259af27532958e2d79df549dad9da"}, - {file = "starlette-0.45.2.tar.gz", hash = "sha256:bba1831d15ae5212b22feab2f218bab6ed3cd0fc2dc1d4442443bb1ee52260e0"}, + {file = "starlette-0.45.3-py3-none-any.whl", hash = "sha256:dfb6d332576f136ec740296c7e8bb8c8a7125044e7c6da30744718880cdd059d"}, + {file = "starlette-0.45.3.tar.gz", hash = "sha256:2cbcba2a75806f8a41c722141486f37c28e30a0921c5f6fe4346cb0dcee1302f"}, ] [package.dependencies] diff --git a/src/auto_archiver/base_processors/__init__.py b/src/auto_archiver/base_processors/__init__.py deleted file mode 100644 index 4995457..0000000 --- a/src/auto_archiver/base_processors/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .database import Database -from .enricher import Enricher -from .feeder import Feeder -from .storage import Storage -from .extractor import Extractor -from .formatter import Formatter \ No newline at end of file diff --git a/src/auto_archiver/core/__init__.py b/src/auto_archiver/core/__init__.py index 10213b2..858bdfd 100644 --- a/src/auto_archiver/core/__init__.py +++ b/src/auto_archiver/core/__init__.py @@ -8,4 +8,11 @@ from .context import ArchivingContext # cannot import ArchivingOrchestrator/Config to avoid circular dep # from .orchestrator import ArchivingOrchestrator -# from .config import Config \ No newline at end of file +# from .config import Config + +from .database import Database +from .enricher import Enricher +from .feeder import Feeder +from .storage import Storage +from .extractor import Extractor +from .formatter import Formatter \ No newline at end of file diff --git a/src/auto_archiver/base_processors/database.py b/src/auto_archiver/core/database.py similarity index 100% rename from src/auto_archiver/base_processors/database.py rename to src/auto_archiver/core/database.py diff --git a/src/auto_archiver/base_processors/enricher.py b/src/auto_archiver/core/enricher.py similarity index 100% rename from src/auto_archiver/base_processors/enricher.py rename to src/auto_archiver/core/enricher.py diff --git a/src/auto_archiver/base_processors/extractor.py b/src/auto_archiver/core/extractor.py similarity index 94% rename from src/auto_archiver/base_processors/extractor.py rename to src/auto_archiver/core/extractor.py index 321b053..8d509ec 100644 --- a/src/auto_archiver/base_processors/extractor.py +++ b/src/auto_archiver/core/extractor.py @@ -15,20 +15,16 @@ import mimetypes, requests from loguru import logger from retrying import retry -from ..core import Metadata, ArchivingContext +from ..core import Metadata, ArchivingContext, BaseModule @dataclass -class Extractor: +class Extractor(BaseModule): """ Base class for implementing extractors in the media archiving framework. Subclasses must implement the `download` method to define platform-specific behavior. """ - def setup(self, *args, **kwargs) -> None: - # used when extractors need to login or do other one-time setup - pass - def cleanup(self) -> None: # called when extractors are done, or upon errors, cleanup any resources pass diff --git a/src/auto_archiver/base_processors/feeder.py b/src/auto_archiver/core/feeder.py similarity index 100% rename from src/auto_archiver/base_processors/feeder.py rename to src/auto_archiver/core/feeder.py diff --git a/src/auto_archiver/base_processors/formatter.py b/src/auto_archiver/core/formatter.py similarity index 100% rename from src/auto_archiver/base_processors/formatter.py rename to src/auto_archiver/core/formatter.py diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index 29f9769..3ef43e5 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -153,46 +153,47 @@ class LazyBaseModule: return manifest def load(self): - if self._instance: - return self._instance - # check external dependencies are installed - def check_deps(deps, check): - for dep in deps: - if not check(dep): - logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available. Have you installed the required dependencies for the '{self.name}' module? See the README for more information.") - exit(1) - - check_deps(self.dependencies.get('python', []), lambda dep: find_spec(dep)) - check_deps(self.dependencies.get('bin', []), lambda dep: shutil.which(dep)) - + if self._instance: + return self._instance - logger.debug(f"Loading module '{self.display_name}'...") + # check external dependencies are installed + def check_deps(deps, check): + for dep in deps: + if not check(dep): + logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available. Have you installed the required dependencies for the '{self.name}' module? See the README for more information.") + exit(1) - for qualname in [self.name, f'auto_archiver.modules.{self.name}']: - try: - # first import the whole module, to make sure it's working properly - __import__(qualname) - break - except ImportError: - pass + check_deps(self.dependencies.get('python', []), lambda dep: find_spec(dep)) + check_deps(self.dependencies.get('bin', []), lambda dep: shutil.which(dep)) - # then import the file for the entry point - file_name, class_name = self.entry_point.split('::') - sub_qualname = f'{qualname}.{file_name}' - __import__(f'{qualname}.{file_name}', fromlist=[self.entry_point]) - - # finally, get the class instance - instance = getattr(sys.modules[sub_qualname], class_name)() - if not getattr(instance, 'name', None): - instance.name = self.name - - if not getattr(instance, 'display_name', None): - instance.display_name = self.display_name + logger.debug(f"Loading module '{self.display_name}'...") - self._instance = instance - return instance + for qualname in [self.name, f'auto_archiver.modules.{self.name}']: + try: + # first import the whole module, to make sure it's working properly + __import__(qualname) + break + except ImportError: + pass + + # then import the file for the entry point + file_name, class_name = self.entry_point.split('::') + sub_qualname = f'{qualname}.{file_name}' + + __import__(f'{qualname}.{file_name}', fromlist=[self.entry_point]) + + # finally, get the class instance + instance = getattr(sys.modules[sub_qualname], class_name)() + if not getattr(instance, 'name', None): + instance.name = self.name + + if not getattr(instance, 'display_name', None): + instance.display_name = self.display_name + + self._instance = instance + return instance def __repr__(self): return f"Module<'{self.display_name}' ({self.name})>" \ No newline at end of file diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 967f652..4f155db 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -227,6 +227,10 @@ class ArchivingOrchestrator: continue if loaded_module: step_items.append(loaded_module) + # TODO temp solution + if module_type == "storage": + ArchivingContext.set("storages", step_items, keep_on_reset=True) + check_steps_ok() self.config['steps'][f"{module_type}s"] = step_items @@ -256,10 +260,7 @@ class ArchivingOrchestrator: exit() yaml_config = read_yaml(basic_config.config_file) - - self.setup_complete_parser(basic_config, yaml_config, unused_args) - self.install_modules() # log out the modules that were loaded @@ -301,7 +302,7 @@ class ArchivingOrchestrator: logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}') for d in self.config['steps']['databases']: if type(e) == AssertionError: d.failed(item, str(e)) - else: d.failed(item) + else: d.failed(item, reason="unexpected error") def archive(self, result: Metadata) -> Union[Metadata, None]: @@ -319,27 +320,27 @@ class ArchivingOrchestrator: # 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs url = original_url - for a in self.archivers: url = a.sanitize_url(url) + for a in self.config["steps"]["extractors"]: url = a.sanitize_url(url) result.set_url(url) if original_url != url: result.set("original_url", original_url) # 2 - notify start to DBs, propagate already archived if feature enabled in DBs cached_result = None - for d in self.databases: + for d in self.config["steps"]["databases"]: d.started(result) if (local_result := d.fetch(result)): cached_result = (cached_result or Metadata()).merge(local_result) if cached_result: logger.debug("Found previously archived entry") - for d in self.databases: + for d in self.config["steps"]["databases"]: try: d.done(cached_result, cached=True) except Exception as e: logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}") return cached_result - # 3 - call archivers until one succeeds - for a in self.archivers: - logger.info(f"Trying archiver {a.name} for {url}") + # 3 - call extractors until one succeeds + for a in self.config["steps"]["extractors"]: + logger.info(f"Trying extractor {a.name} for {url}") try: result.merge(a.download(result)) if result.is_success(): break @@ -347,7 +348,7 @@ class ArchivingOrchestrator: logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}") # 4 - call enrichers to work with archived content - for e in self.enrichers: + for e in self.config["steps"]["enrichers"]: try: e.enrich(result) except Exception as exc: logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}") @@ -356,7 +357,7 @@ class ArchivingOrchestrator: result.store() # 6 - format and store formatted if needed - if (final_media := self.formatter.format(result)): + if final_media := self.config["steps"]["formatters"][0].format(result): final_media.store(url=url, metadata=result) result.set_final_media(final_media) @@ -364,7 +365,7 @@ class ArchivingOrchestrator: result.status = "nothing archived" # signal completion to databases and archivers - for d in self.databases: + for d in self.config["steps"]["databases"]: try: d.done(result) except Exception as e: logger.error(f"ERROR database {d.name}: {e}: {traceback.format_exc()}") diff --git a/src/auto_archiver/base_processors/storage.py b/src/auto_archiver/core/storage.py similarity index 100% rename from src/auto_archiver/base_processors/storage.py rename to src/auto_archiver/core/storage.py diff --git a/src/auto_archiver/modules/api_db/api_db.py b/src/auto_archiver/modules/api_db/api_db.py index d2b43b7..a893aee 100644 --- a/src/auto_archiver/modules/api_db/api_db.py +++ b/src/auto_archiver/modules/api_db/api_db.py @@ -2,7 +2,7 @@ from typing import Union import requests, os from loguru import logger -from auto_archiver.base_processors import Database +from auto_archiver.core import Database from auto_archiver.core import Metadata @@ -10,7 +10,6 @@ class AAApiDb(Database): """ Connects to auto-archiver-api instance """ - name = "auto_archiver_api_db" def __init__(self, config: dict) -> None: # without this STEP.__init__ is not called diff --git a/src/auto_archiver/modules/atlos/atlos.py b/src/auto_archiver/modules/atlos/atlos.py index 6a175d3..abc8a1a 100644 --- a/src/auto_archiver/modules/atlos/atlos.py +++ b/src/auto_archiver/modules/atlos/atlos.py @@ -5,15 +5,11 @@ import requests import hashlib from auto_archiver.core import Media, Metadata -from auto_archiver.base_processors import Storage +from auto_archiver.core import Storage from auto_archiver.utils import get_atlos_config_options class AtlosStorage(Storage): - name = "atlos_storage" - - def __init__(self, config: dict) -> None: - super().__init__(config) def get_cdn_url(self, _media: Media) -> str: # It's not always possible to provide an exact URL, because it's diff --git a/src/auto_archiver/modules/atlos_db/atlos_db.py b/src/auto_archiver/modules/atlos_db/atlos_db.py index 2e24491..c45e215 100644 --- a/src/auto_archiver/modules/atlos_db/atlos_db.py +++ b/src/auto_archiver/modules/atlos_db/atlos_db.py @@ -6,7 +6,7 @@ from csv import DictWriter from dataclasses import asdict import requests -from auto_archiver.base_processors import Database +from auto_archiver.core import Database from auto_archiver.core import Metadata from auto_archiver.utils import get_atlos_config_options @@ -16,12 +16,6 @@ class AtlosDb(Database): Outputs results to Atlos """ - name = "atlos_db" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - def failed(self, item: Metadata, reason: str) -> None: """Update DB accordingly for failure""" # If the item has no Atlos ID, there's nothing for us to do diff --git a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py index 262f21b..9811a82 100644 --- a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py +++ b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py @@ -1,13 +1,12 @@ from loguru import logger import requests -from auto_archiver.base_processors import Feeder +from auto_archiver.core import Feeder from auto_archiver.core import Metadata, ArchivingContext from auto_archiver.utils import get_atlos_config_options class AtlosFeeder(Feeder): - name = "atlos_feeder" def __init__(self, config: dict) -> None: # without this STEP.__init__ is not called diff --git a/src/auto_archiver/modules/cli_feeder/cli_feeder.py b/src/auto_archiver/modules/cli_feeder/cli_feeder.py index 09c46d4..62cb659 100644 --- a/src/auto_archiver/modules/cli_feeder/cli_feeder.py +++ b/src/auto_archiver/modules/cli_feeder/cli_feeder.py @@ -1,6 +1,6 @@ from loguru import logger -from auto_archiver.base_processors import Feeder +from auto_archiver.core import Feeder from auto_archiver.core import Metadata, ArchivingContext diff --git a/src/auto_archiver/modules/console_db/console_db.py b/src/auto_archiver/modules/console_db/console_db.py index c581552..48609b0 100644 --- a/src/auto_archiver/modules/console_db/console_db.py +++ b/src/auto_archiver/modules/console_db/console_db.py @@ -1,6 +1,6 @@ from loguru import logger -from auto_archiver.base_processors import Database +from auto_archiver.core import Database from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/csv_db/csv_db.py b/src/auto_archiver/modules/csv_db/csv_db.py index 189b137..b5985e2 100644 --- a/src/auto_archiver/modules/csv_db/csv_db.py +++ b/src/auto_archiver/modules/csv_db/csv_db.py @@ -3,7 +3,7 @@ from loguru import logger from csv import DictWriter from dataclasses import asdict -from auto_archiver.base_processors import Database +from auto_archiver.core import Database from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/csv_feeder/csv_feeder.py b/src/auto_archiver/modules/csv_feeder/csv_feeder.py index 7bff16e..ad0a035 100644 --- a/src/auto_archiver/modules/csv_feeder/csv_feeder.py +++ b/src/auto_archiver/modules/csv_feeder/csv_feeder.py @@ -1,14 +1,12 @@ from loguru import logger import csv -from auto_archiver.base_processors import Feeder +from auto_archiver.core import Feeder from auto_archiver.core import Metadata, ArchivingContext from auto_archiver.utils import url_or_none class CSVFeeder(Feeder): - name = "csv_feeder" - def __iter__(self) -> Metadata: url_column = self.column or 0 for file in self.files: diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py index 4bcdb90..c2d326d 100644 --- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py +++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py @@ -10,11 +10,10 @@ from google.oauth2.credentials import Credentials from google.auth.transport.requests import Request from auto_archiver.core import Media -from auto_archiver.base_processors import Storage +from auto_archiver.core import Storage class GDriveStorage(Storage): - name = "gdrive_storage" def __init__(self, config: dict) -> None: super().__init__(config) diff --git a/src/auto_archiver/modules/generic_extractor/__manifest__.py b/src/auto_archiver/modules/generic_extractor/__manifest__.py index 73c264d..d5f363f 100644 --- a/src/auto_archiver/modules/generic_extractor/__manifest__.py +++ b/src/auto_archiver/modules/generic_extractor/__manifest__.py @@ -1,13 +1,13 @@ { - 'name': 'Generic Extractor', - 'version': '0.1.0', - 'author': 'Bellingcat', - 'type': ['extractor'], - 'requires_setup': False, - 'dependencies': { - 'python': ['yt_dlp', 'requests', 'loguru', 'slugify'], + "name": "Generic Extractor", + "version": "0.1.0", + "author": "Bellingcat", + "type": ["extractor"], + "requires_setup": False, + "dependencies": { + "python": ["yt_dlp", "requests", "loguru", "slugify"], }, - 'description': """ + "description": """ This is the generic extractor used by auto-archiver, which uses `yt-dlp` under the hood. This module is responsible for downloading and processing media content from platforms @@ -28,17 +28,53 @@ the broader archiving framework. custom dropins can be created to handle additional websites and passed to the archiver via the command line using the `--dropins` option (TODO!). """, - 'configs': { - "facebook_cookie": {"default": None, "help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'"}, - "subtitles": {"default": True, "help": "download subtitles if available"}, - "comments": {"default": False, "help": "download all comments if available, may lead to large metadata"}, - "livestreams": {"default": False, "help": "if set, will download live streams, otherwise will skip them; see --max-filesize for more control"}, - "live_from_start": {"default": False, "help": "if set, will download live streams from their earliest available moment, otherwise starts now."}, - "proxy": {"default": "", "help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port"}, - "end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."}, - 'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."}, - "max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."}, - "cookies_from_browser": {"default": None, 'type': 'str', "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"}, - "cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"}, - } -} \ No newline at end of file + "configs": { + "facebook_cookie": { + "default": None, + "help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'", + }, + "subtitles": {"default": True, "help": "download subtitles if available", "type": "bool"}, + "comments": { + "default": False, + "help": "download all comments if available, may lead to large metadata", + "type": "bool", + }, + "livestreams": { + "default": False, + "help": "if set, will download live streams, otherwise will skip them; see --max-filesize for more control", + "type": "bool", + }, + "live_from_start": { + "default": False, + "help": "if set, will download live streams from their earliest available moment, otherwise starts now.", + "type": "bool", + }, + "proxy": { + "default": "", + "help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port", + }, + "end_means_success": { + "default": True, + "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve.", + "type": "bool", + }, + "allow_playlist": { + "default": False, + "help": "If True will also download playlists, set to False if the expectation is to download a single video.", + "type": "bool", + }, + "max_downloads": { + "default": "inf", + "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.", + }, + "cookies_from_browser": { + "default": None, + "type": "str", + "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale", + }, + "cookie_file": { + "default": None, + "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp", + }, + }, +} diff --git a/src/auto_archiver/modules/generic_extractor/bluesky.py b/src/auto_archiver/modules/generic_extractor/bluesky.py index c75c373..1f92fd8 100644 --- a/src/auto_archiver/modules/generic_extractor/bluesky.py +++ b/src/auto_archiver/modules/generic_extractor/bluesky.py @@ -1,6 +1,6 @@ from loguru import logger -from auto_archiver.base_processors.extractor import Extractor +from auto_archiver.core.extractor import Extractor from auto_archiver.core.metadata import Metadata, Media from .dropin import GenericDropin, InfoExtractor diff --git a/src/auto_archiver/modules/generic_extractor/dropin.py b/src/auto_archiver/modules/generic_extractor/dropin.py index 99cd71b..c5749ff 100644 --- a/src/auto_archiver/modules/generic_extractor/dropin.py +++ b/src/auto_archiver/modules/generic_extractor/dropin.py @@ -1,6 +1,6 @@ from yt_dlp.extractor.common import InfoExtractor from auto_archiver.core.metadata import Metadata -from auto_archiver.base_processors.extractor import Extractor +from auto_archiver.core.extractor import Extractor class GenericDropin: """Base class for dropins for the generic extractor. diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 8ceaabc..57924d9 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -5,11 +5,10 @@ from yt_dlp.extractor.common import InfoExtractor from loguru import logger -from auto_archiver.base_processors.extractor import Extractor +from auto_archiver.core.extractor import Extractor from ...core import Metadata, Media, ArchivingContext class GenericExtractor(Extractor): - name = "youtubedl_archiver" #left as is for backwards compat _dropins = {} def suitable_extractors(self, url: str) -> list[str]: @@ -268,7 +267,7 @@ class GenericExtractor(Extractor): if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie: logger.debug('Using Facebook cookie') yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie - + ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads} if item.netloc in ['youtube.com', 'www.youtube.com']: @@ -285,6 +284,6 @@ class GenericExtractor(Extractor): result = self.download_for_extractor(info_extractor, url, ydl) if result: return result - + return False diff --git a/src/auto_archiver/modules/generic_extractor/truth.py b/src/auto_archiver/modules/generic_extractor/truth.py index f52a748..e65b4b1 100644 --- a/src/auto_archiver/modules/generic_extractor/truth.py +++ b/src/auto_archiver/modules/generic_extractor/truth.py @@ -2,7 +2,7 @@ from typing import Type from auto_archiver.utils import traverse_obj from auto_archiver.core.metadata import Metadata, Media -from auto_archiver.base_processors.extractor import Extractor +from auto_archiver.core.extractor import Extractor from yt_dlp.extractor.common import InfoExtractor from dateutil.parser import parse as parse_dt diff --git a/src/auto_archiver/modules/generic_extractor/twitter.py b/src/auto_archiver/modules/generic_extractor/twitter.py index 11399d4..83c1f4f 100644 --- a/src/auto_archiver/modules/generic_extractor/twitter.py +++ b/src/auto_archiver/modules/generic_extractor/twitter.py @@ -6,7 +6,7 @@ from slugify import slugify from auto_archiver.core.metadata import Metadata, Media from auto_archiver.utils import UrlUtil -from auto_archiver.base_processors.extractor import Extractor +from auto_archiver.core.extractor import Extractor from .dropin import GenericDropin, InfoExtractor diff --git a/src/auto_archiver/modules/gsheet_db/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py index 239bc06..e7e8e5c 100644 --- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py +++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py @@ -5,7 +5,7 @@ from urllib.parse import quote from loguru import logger -from auto_archiver.base_processors import Database +from auto_archiver.core import Database from auto_archiver.core import Metadata, Media, ArchivingContext from auto_archiver.modules.gsheet_feeder import GWorksheet @@ -15,11 +15,6 @@ class GsheetsDb(Database): NB: only works if GsheetFeeder is used. could be updated in the future to support non-GsheetFeeder metadata """ - name = "gsheet_db" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) def started(self, item: Metadata) -> None: logger.warning(f"STARTED {item}") diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py index b57174f..66dd014 100644 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py @@ -14,13 +14,12 @@ import gspread from loguru import logger from slugify import slugify -from auto_archiver.base_processors import Feeder +from auto_archiver.core import Feeder from auto_archiver.core import Metadata, ArchivingContext from . import GWorksheet class GsheetsFeeder(Feeder): - name = "gsheet_feeder" def __init__(self) -> None: """ diff --git a/src/auto_archiver/modules/hash_enricher/__manifest__.py b/src/auto_archiver/modules/hash_enricher/__manifest__.py index 6e3cde3..f306808 100644 --- a/src/auto_archiver/modules/hash_enricher/__manifest__.py +++ b/src/auto_archiver/modules/hash_enricher/__manifest__.py @@ -8,9 +8,9 @@ "configs": { "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]}, # TODO add non-negative requirement to match previous implementation? - "chunksize": {"default": 1.6e7, + "chunksize": {"default": 16000000, "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB", - 'type': 'positive_number', + 'type': 'int', }, }, "description": """ diff --git a/src/auto_archiver/modules/hash_enricher/hash_enricher.py b/src/auto_archiver/modules/hash_enricher/hash_enricher.py index 39ec75c..827b65f 100644 --- a/src/auto_archiver/modules/hash_enricher/hash_enricher.py +++ b/src/auto_archiver/modules/hash_enricher/hash_enricher.py @@ -10,7 +10,7 @@ making it suitable for handling large files efficiently. import hashlib from loguru import logger -from auto_archiver.base_processors import Enricher +from auto_archiver.core import Enricher from auto_archiver.core import Metadata, ArchivingContext @@ -19,6 +19,17 @@ class HashEnricher(Enricher): Calculates hashes for Media instances """ + def __init__(self, config: dict = None): + """ + Initialize the HashEnricher with a configuration dictionary. + """ + super().__init__() + # TODO set these from the manifest? + # Set default values + self.algorithm = config.get("algorithm", "SHA-256") if config else "SHA-256" + self.chunksize = config.get("chunksize", int(1.6e7)) if config else int(1.6e7) + + def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})") diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py index afa367b..e6e5e58 100644 --- a/src/auto_archiver/modules/html_formatter/html_formatter.py +++ b/src/auto_archiver/modules/html_formatter/html_formatter.py @@ -9,24 +9,30 @@ import base64 from auto_archiver.version import __version__ from auto_archiver.core import Metadata, Media, ArchivingContext -from auto_archiver.base_processors import Formatter +from auto_archiver.core import Formatter from auto_archiver.modules.hash_enricher import HashEnricher from auto_archiver.utils.misc import random_str @dataclass class HtmlFormatter(Formatter): + environment: Environment = None + template: any = None - # TODO: fix setting up template with new config method - # def __init__(self, config: dict) -> None: - # # without this STEP.__init__ is not called - # super().__init__(config) - # self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")), autoescape=True) - # # JinjaHelper class static methods are added as filters - # self.environment.filters.update({ - # k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod) - # }) - # self.template = self.environment.get_template("html_template.html") + def setup(self, config: dict) -> None: + """Sets up the Jinja2 environment and loads the template.""" + super().setup(config) # Ensure the base class logic is executed + template_dir = os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/") + self.environment = Environment(loader=FileSystemLoader(template_dir), autoescape=True) + + # JinjaHelper class static methods are added as filters + self.environment.filters.update({ + k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod) + }) + + # Load a specific template or default to "html_template.html" + template_name = self.config.get("template_name", "html_template.html") + self.template = self.environment.get_template(template_name) def format(self, item: Metadata) -> Media: url = item.get_url() diff --git a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py index c1271fc..3d7f9e5 100644 --- a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py +++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py @@ -16,7 +16,7 @@ from loguru import logger from retrying import retry from tqdm import tqdm -from auto_archiver.base_processors import Extractor +from auto_archiver.core import Extractor from auto_archiver.core import Media from auto_archiver.core import Metadata @@ -28,8 +28,6 @@ class InstagramAPIExtractor(Extractor): # TODO: improvement collect aggregates of locations[0].location and mentions for all posts """ - name = "instagram_api_extractor" - global_pattern = re.compile( r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?" ) diff --git a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py index 2b9bece..1a246fb 100644 --- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py +++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py @@ -7,7 +7,7 @@ import re, os, shutil, traceback import instaloader # https://instaloader.github.io/as-module.html from loguru import logger -from auto_archiver.base_processors import Extractor +from auto_archiver.core import Extractor from auto_archiver.core import Metadata from auto_archiver.core import Media @@ -15,8 +15,6 @@ class InstagramExtractor(Extractor): """ Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...) """ - name = "instagram_extractor" - # NB: post regex should be tested before profile # https://regex101.com/r/MGPquX/1 post_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(?:p|reel)\/(\w+)") diff --git a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py index 36c8a06..60fa397 100644 --- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py @@ -15,7 +15,7 @@ from sqlite3 import OperationalError from loguru import logger from telethon.sync import TelegramClient -from auto_archiver.base_processors import Extractor +from auto_archiver.core import Extractor from auto_archiver.core import Metadata, Media, ArchivingContext from auto_archiver.utils import random_str @@ -26,13 +26,6 @@ class InstagramTbotExtractor(Extractor): https://github.com/adw0rd/instagrapi https://t.me/instagram_load_bot """ - name = "instagram_tbot_extractor" - - def __init__(self, config: dict) -> None: - super().__init__(config) - self.assert_valid_string("api_id") - self.assert_valid_string("api_hash") - self.timeout = int(self.timeout) def setup(self) -> None: """ diff --git a/src/auto_archiver/modules/local_storage/local_storage.py b/src/auto_archiver/modules/local_storage/local_storage.py index 5d65414..4c44e9c 100644 --- a/src/auto_archiver/modules/local_storage/local_storage.py +++ b/src/auto_archiver/modules/local_storage/local_storage.py @@ -5,17 +5,12 @@ import os from loguru import logger from auto_archiver.core import Media -from auto_archiver.base_processors import Storage +from auto_archiver.core import Storage class LocalStorage(Storage): name = "local_storage" - def __init__(self) -> None: - super().__init__() - # TODO: fix up passing config values to 'steps' - # os.makedirs(self.save_to, exist_ok=True) - def get_cdn_url(self, media: Media) -> str: # TODO: is this viable with Storage.configs on path/filename? dest = os.path.join(self.save_to, media.key) diff --git a/src/auto_archiver/modules/meta_enricher/meta_enricher.py b/src/auto_archiver/modules/meta_enricher/meta_enricher.py index fa86818..03fb01e 100644 --- a/src/auto_archiver/modules/meta_enricher/meta_enricher.py +++ b/src/auto_archiver/modules/meta_enricher/meta_enricher.py @@ -2,7 +2,7 @@ import datetime import os from loguru import logger -from auto_archiver.base_processors import Enricher +from auto_archiver.core import Enricher from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py index 20a278f..c052d0a 100644 --- a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py +++ b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py @@ -2,7 +2,7 @@ import subprocess import traceback from loguru import logger -from auto_archiver.base_processors import Enricher +from auto_archiver.core import Enricher from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/mute_formatter/mute_formatter.py b/src/auto_archiver/modules/mute_formatter/mute_formatter.py index addb454..1c7cca2 100644 --- a/src/auto_archiver/modules/mute_formatter/mute_formatter.py +++ b/src/auto_archiver/modules/mute_formatter/mute_formatter.py @@ -2,11 +2,10 @@ from __future__ import annotations from dataclasses import dataclass from auto_archiver.core import Metadata, Media -from auto_archiver.base_processors import Formatter +from auto_archiver.core import Formatter @dataclass class MuteFormatter(Formatter): - name = "mute_formatter" def format(self, item: Metadata) -> Media: return None diff --git a/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py index 65b0e59..e812e8b 100644 --- a/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py +++ b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py @@ -16,7 +16,7 @@ import numpy as np from PIL import Image, UnidentifiedImageError from loguru import logger -from auto_archiver.base_processors import Enricher +from auto_archiver.core import Enricher from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/s3_storage/s3.py b/src/auto_archiver/modules/s3_storage/s3.py index a637259..10d5f61 100644 --- a/src/auto_archiver/modules/s3_storage/s3.py +++ b/src/auto_archiver/modules/s3_storage/s3.py @@ -4,14 +4,13 @@ import boto3, os from auto_archiver.utils.misc import random_str from auto_archiver.core import Media -from auto_archiver.base_processors import Storage -# TODO +from auto_archiver.core import Storage + from auto_archiver.modules.hash_enricher import HashEnricher from loguru import logger NO_DUPLICATES_FOLDER = "no-dups/" class S3Storage(Storage): - name = "s3_storage" def __init__(self, config: dict) -> None: super().__init__(config) diff --git a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py index 0140875..be775ce 100644 --- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py +++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py @@ -5,15 +5,11 @@ import base64 from selenium.common.exceptions import TimeoutException -from auto_archiver.base_processors import Enricher +from auto_archiver.core import Enricher from auto_archiver.utils import Webdriver, UrlUtil, random_str from auto_archiver.core import Media, Metadata, ArchivingContext class ScreenshotEnricher(Enricher): - name = "screenshot_enricher" - - def __init__(self, config: dict) -> None: - super().__init__(config) def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() diff --git a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py index d15ee95..52237ee 100644 --- a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py +++ b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py @@ -3,7 +3,7 @@ from slugify import slugify from urllib.parse import urlparse from loguru import logger -from auto_archiver.base_processors import Enricher +from auto_archiver.core import Enricher from auto_archiver.core import Metadata, ArchivingContext, Media diff --git a/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py index aa7e46f..d612e24 100644 --- a/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py +++ b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py @@ -2,7 +2,7 @@ import requests, re, html from bs4 import BeautifulSoup from loguru import logger -from auto_archiver.base_processors import Extractor +from auto_archiver.core import Extractor from auto_archiver.core import Metadata, Media diff --git a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py index 8b49a10..f378e7e 100644 --- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py @@ -8,21 +8,15 @@ from loguru import logger from tqdm import tqdm import re, time, json, os -from auto_archiver.base_processors import Extractor +from auto_archiver.core import Extractor from auto_archiver.core import Metadata, Media, ArchivingContext from auto_archiver.utils import random_str class TelethonArchiver(Extractor): - name = "telethon_extractor" link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)") invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)") - def __init__(self, config: dict) -> None: - super().__init__(config) - self.assert_valid_string("api_id") - self.assert_valid_string("api_hash") - def setup(self) -> None: """ diff --git a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py index 4a5a1db..b27243b 100644 --- a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py +++ b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py @@ -9,7 +9,7 @@ and identify important moments without watching the entire video. import ffmpeg, os from loguru import logger -from auto_archiver.base_processors import Enricher +from auto_archiver.core import Enricher from auto_archiver.core import Media, Metadata, ArchivingContext from auto_archiver.utils.misc import random_str @@ -42,7 +42,7 @@ class ThumbnailEnricher(Enricher): logger.error(f"error getting duration of video {m.filename}: {e}") return - num_thumbs = int(min(max(1, duration * self.thumbnails_per_second), self.max_thumbnails)) + num_thumbs = int(min(max(1, duration * self.thumbnails_per_minute), self.max_thumbnails)) timestamps = [duration / (num_thumbs + 1) * i for i in range(1, num_thumbs + 1)] thumbnails_media = [] diff --git a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py index c90d42c..a7a0aee 100644 --- a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py +++ b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py @@ -8,9 +8,9 @@ from certvalidator import CertificateValidator, ValidationContext from asn1crypto import pem import certifi -from auto_archiver.base_processors import Enricher +from auto_archiver.core import Enricher from auto_archiver.core import Metadata, ArchivingContext, Media -from auto_archiver.base_processors import Extractor +from auto_archiver.core import Extractor class TimestampingEnricher(Enricher): @@ -21,10 +21,6 @@ class TimestampingEnricher(Enricher): See https://gist.github.com/Manouchehri/fd754e402d98430243455713efada710 for list of timestamp authorities. """ - name = "timestamping_enricher" - - def __init__(self, config: dict) -> None: - super().__init__(config) def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() diff --git a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py index ea669b4..6a4930a 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py +++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py @@ -8,11 +8,10 @@ from loguru import logger from pytwitter import Api from slugify import slugify -from auto_archiver.base_processors import Extractor +from auto_archiver.core import Extractor from auto_archiver.core import Metadata,Media class TwitterApiExtractor(Extractor): - name = "twitter_api_extractor" link_pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") def __init__(self, config: dict) -> None: diff --git a/src/auto_archiver/modules/vk_extractor/vk_extractor.py b/src/auto_archiver/modules/vk_extractor/vk_extractor.py index eb4c171..1bce167 100644 --- a/src/auto_archiver/modules/vk_extractor/vk_extractor.py +++ b/src/auto_archiver/modules/vk_extractor/vk_extractor.py @@ -2,7 +2,7 @@ from loguru import logger from vk_url_scraper import VkScraper from auto_archiver.utils.misc import dump_payload -from auto_archiver.base_processors import Extractor +from auto_archiver.core import Extractor from auto_archiver.core import Metadata, Media, ArchivingContext @@ -11,7 +11,6 @@ class VkExtractor(Extractor): VK videos are handled by YTDownloader, this archiver gets posts text and images. Currently only works for /wall posts """ - name = "vk_extractor" def __init__(self, config: dict) -> None: super().__init__(config) diff --git a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py index 9ba43ae..1eb7398 100644 --- a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py +++ b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py @@ -6,7 +6,7 @@ from loguru import logger from warcio.archiveiterator import ArchiveIterator from auto_archiver.core import Media, Metadata, ArchivingContext -from auto_archiver.base_processors import Extractor, Enricher +from auto_archiver.core import Extractor, Enricher from auto_archiver.utils import UrlUtil, random_str @@ -17,11 +17,6 @@ class WaczExtractorEnricher(Enricher, Extractor): it can become quite powerful for archiving private content. When used as an archiver it will extract the media from the .WACZ archive so it can be enriched. """ - name = "wacz_archiver_enricher" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) def setup(self) -> None: self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER') diff --git a/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py b/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py index 6942727..0e25440 100644 --- a/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py +++ b/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py @@ -2,7 +2,7 @@ import json from loguru import logger import time, requests -from auto_archiver.base_processors import Extractor, Enricher +from auto_archiver.core import Extractor, Enricher from auto_archiver.utils import UrlUtil from auto_archiver.core import Metadata @@ -12,13 +12,6 @@ class WaybackExtractorEnricher(Enricher, Extractor): The Wayback machine will rate-limit IP heavy usage. """ - name = "wayback_archiver_enricher" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - assert type(self.secret) == str and len(self.secret) > 0, "please provide a value for the wayback_enricher API key" - assert type(self.secret) == str and len(self.secret) > 0, "please provide a value for the wayback_enricher API secret" def download(self, item: Metadata) -> Metadata: # this new Metadata object is required to avoid duplication diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index d14c537..09eb3db 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -2,7 +2,7 @@ import traceback import requests, time from loguru import logger -from auto_archiver.base_processors import Enricher +from auto_archiver.core import Enricher from auto_archiver.core import Metadata, Media, ArchivingContext from auto_archiver.modules.s3_storage import S3Storage @@ -13,14 +13,6 @@ class WhisperEnricher(Enricher): whisper API repository: https://github.com/bellingcat/whisperbox-transcribe/ Only works if an S3 compatible storage is used """ - name = "whisper_enricher" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - assert type(self.api_endpoint) == str and len(self.api_endpoint) > 0, "please provide a value for the whisper_enricher api_endpoint" - assert type(self.api_key) == str and len(self.api_key) > 0, "please provide a value for the whisper_enricher api_key" - self.timeout = int(self.timeout) def enrich(self, to_enrich: Metadata) -> None: if not self._get_s3_storage(): diff --git a/src/auto_archiver/utils/gsheet.py b/src/auto_archiver/utils/gsheet.py index 485344f..7a8862f 100644 --- a/src/auto_archiver/utils/gsheet.py +++ b/src/auto_archiver/utils/gsheet.py @@ -1,9 +1,9 @@ import json, gspread -from ..core import Step +from ..core import BaseModule -class Gsheets(Step): +class Gsheets(BaseModule): name = "gsheets" def __init__(self, config: dict) -> None: