diff --git a/Dockerfile b/Dockerfile index 0ecc7f3..8272c73 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM webrecorder/browsertrix-crawler:1.0.4 AS base +FROM webrecorder/browsertrix-crawler:1.4.2 AS base ENV RUNNING_IN_DOCKER=1 \ LANG=C.UTF-8 \ @@ -22,28 +22,30 @@ RUN add-apt-repository ppa:mozillateam/ppa && \ # Poetry and runtime -FROM base AS runtime +FROM base AS poetry-env ENV POETRY_NO_INTERACTION=1 \ POETRY_VIRTUALENVS_IN_PROJECT=1 \ POETRY_VIRTUALENVS_CREATE=1 -RUN pip install --upgrade pip && \ - pip install "poetry>=2.0.0,<3.0.0" +# Create a virtual environment for poetry and install it (pip cache disabled to keep the layer small) +RUN python3 -m venv /poetry-venv && \ + /poetry-venv/bin/python -m pip install --no-cache-dir --upgrade pip && \ + /poetry-venv/bin/python -m pip install --no-cache-dir "poetry>=2.0.0,<3.0.0" WORKDIR /app COPY pyproject.toml poetry.lock README.md ./ # Copy dependency files and install dependencies (excluding the package itself) -RUN poetry install --only main --no-root --no-cache +RUN /poetry-venv/bin/poetry install --only main --no-root --no-cache # Copy code: This is needed for poetry to install the package itself, # but the environment should be cached from the previous step if toml and lock files haven't changed COPY ./src/ . -RUN poetry install --only main --no-cache +RUN /poetry-venv/bin/poetry install --only main --no-cache # Update PATH to include virtual environment binaries @@ -55,4 +57,3 @@ ENTRYPOINT ["python3", "-m", "auto_archiver"] # should be executed with 2 volumes (3 if local_storage is used) # docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa pipenv run python3 -m auto_archiver --config secrets/orchestration.yaml - diff --git a/src/auto_archiver/archivers/__init__.py b/src/auto_archiver/archivers/__init__.py deleted file mode 100644 index 54515ec..0000000 --- a/src/auto_archiver/archivers/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -""" -Archivers are responsible for retrieving the content from various external platforms. 
-They act as specialized modules, each tailored to interact with a specific platform, -service, or data source. The archivers collectively enable the tool to comprehensively -collect and preserve a variety of content types, such as posts, images, videos and metadata. - -""" -from .archiver import Archiver diff --git a/src/auto_archiver/base_modules/__init__.py b/src/auto_archiver/base_modules/__init__.py new file mode 100644 index 0000000..4995457 --- /dev/null +++ b/src/auto_archiver/base_modules/__init__.py @@ -0,0 +1,6 @@ +from .database import Database +from .enricher import Enricher +from .feeder import Feeder +from .storage import Storage +from .extractor import Extractor +from .formatter import Formatter \ No newline at end of file diff --git a/src/auto_archiver/databases/database.py b/src/auto_archiver/base_modules/database.py similarity index 96% rename from src/auto_archiver/databases/database.py rename to src/auto_archiver/base_modules/database.py index 30cba7e..28f0061 100644 --- a/src/auto_archiver/databases/database.py +++ b/src/auto_archiver/base_modules/database.py @@ -3,13 +3,13 @@ from dataclasses import dataclass from abc import abstractmethod, ABC from typing import Union -from ..core import Metadata, Step +from auto_archiver.core import Metadata, Step @dataclass class Database(Step, ABC): - name = "database" + name = "database" def __init__(self, config: dict) -> None: # without this STEP.__init__ is not called super().__init__(config) diff --git a/src/auto_archiver/base_modules/enricher.py b/src/auto_archiver/base_modules/enricher.py new file mode 100644 index 0000000..d26eedf --- /dev/null +++ b/src/auto_archiver/base_modules/enricher.py @@ -0,0 +1,31 @@ +""" +Enrichers are modular components that enhance archived content by adding +context, metadata, or additional processing. + +These add additional information to the context, such as screenshots, hashes, and metadata. 
+They are designed to work within the archiving pipeline, operating on `Metadata` objects after +the archiving step and before storage or formatting. + +Enrichers are optional but highly useful for making the archived data more powerful. +""" +from __future__ import annotations +from dataclasses import dataclass +from abc import abstractmethod, ABC +from auto_archiver.core import Metadata, Step + +@dataclass +class Enricher(Step, ABC): + """Base classes and utilities for enrichers in the Auto-Archiver system.""" + name = "enricher" + + def __init__(self, config: dict) -> None: + # without this STEP.__init__ is not called + super().__init__(config) + + + # only for typing... + def init(name: str, config: dict) -> Enricher: + return Step.init(name, config, Enricher) + + @abstractmethod + def enrich(self, to_enrich: Metadata) -> None: pass diff --git a/src/auto_archiver/archivers/archiver.py b/src/auto_archiver/base_modules/extractor.py similarity index 81% rename from src/auto_archiver/archivers/archiver.py rename to src/auto_archiver/base_modules/extractor.py index b5f3f40..c772325 100644 --- a/src/auto_archiver/archivers/archiver.py +++ b/src/auto_archiver/base_modules/extractor.py @@ -1,7 +1,7 @@ -""" The `archiver` module defines the base functionality for implementing archivers in the media archiving framework. - This class provides common utility methods and a standard interface for archivers. +""" The `extractor` module defines the base functionality for implementing extractors in the media archiving framework. + This class provides common utility methods and a standard interface for extractors. - Factory method to initialize an archiver instance based on its name. + Factory method to initialize an extractor instance based on its name. 
""" @@ -15,32 +15,32 @@ import mimetypes, requests from loguru import logger from retrying import retry -from ..core import Metadata, Step, ArchivingContext +from ..core import Metadata, ArchivingContext @dataclass -class Archiver: +class Extractor: """ - Base class for implementing archivers in the media archiving framework. + Base class for implementing extractors in the media archiving framework. Subclasses must implement the `download` method to define platform-specific behavior. """ def setup(self) -> None: - # used when archivers need to login or do other one-time setup + # used when extractors need to login or do other one-time setup pass def cleanup(self) -> None: - # called when archivers are done, or upon errors, cleanup any resources + # called when extractors are done, or upon errors, cleanup any resources pass def sanitize_url(self, url: str) -> str: # used to clean unnecessary URL parameters OR unfurl redirect links return url - + def suitable(self, url: str) -> bool: """ - Returns True if this archiver can handle the given URL - + Returns True if this extractor can handle the given URL + Should be overridden by subclasses """ return True @@ -84,10 +84,10 @@ class Archiver: for chunk in d.iter_content(chunk_size=8192): f.write(chunk) return to_filename - + except requests.RequestException as e: logger.warning(f"Failed to fetch the Media URL: {e}") @abstractmethod def download(self, item: Metadata) -> Metadata: - pass + pass \ No newline at end of file diff --git a/src/auto_archiver/feeders/feeder.py b/src/auto_archiver/base_modules/feeder.py similarity index 86% rename from src/auto_archiver/feeders/feeder.py rename to src/auto_archiver/base_modules/feeder.py index 4aa263f..7fbd6b1 100644 --- a/src/auto_archiver/feeders/feeder.py +++ b/src/auto_archiver/base_modules/feeder.py @@ -1,8 +1,8 @@ from __future__ import annotations from dataclasses import dataclass from abc import abstractmethod -from ..core import Metadata -from ..core import Step +from 
auto_archiver.core import Metadata +from auto_archiver.core import Step @dataclass diff --git a/src/auto_archiver/formatters/formatter.py b/src/auto_archiver/base_modules/formatter.py similarity index 90% rename from src/auto_archiver/formatters/formatter.py rename to src/auto_archiver/base_modules/formatter.py index b10477e..4c59af8 100644 --- a/src/auto_archiver/formatters/formatter.py +++ b/src/auto_archiver/base_modules/formatter.py @@ -1,7 +1,7 @@ from __future__ import annotations from dataclasses import dataclass from abc import abstractmethod -from ..core import Metadata, Media, Step +from auto_archiver.core import Metadata, Media, Step @dataclass diff --git a/src/auto_archiver/storages/storage.py b/src/auto_archiver/base_modules/storage.py similarity index 94% rename from src/auto_archiver/storages/storage.py rename to src/auto_archiver/base_modules/storage.py index c9b55e0..147da1f 100644 --- a/src/auto_archiver/storages/storage.py +++ b/src/auto_archiver/base_modules/storage.py @@ -4,10 +4,10 @@ from dataclasses import dataclass from typing import IO, Optional import os -from ..utils.misc import random_str +from auto_archiver.utils.misc import random_str -from ..core import Media, Step, ArchivingContext, Metadata -from ..enrichers import HashEnricher +from auto_archiver.core import Media, Step, ArchivingContext, Metadata +from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher from loguru import logger from slugify import slugify diff --git a/src/auto_archiver/core/__init__.py b/src/auto_archiver/core/__init__.py index ad3f989..b78df83 100644 --- a/src/auto_archiver/core/__init__.py +++ b/src/auto_archiver/core/__init__.py @@ -8,9 +8,4 @@ from .context import ArchivingContext # cannot import ArchivingOrchestrator/Config to avoid circular dep # from .orchestrator import ArchivingOrchestrator -# from .config import Config - -from .media import Media -from .step import Step -from .context import ArchivingContext -from .metadata import 
Metadata +# from .config import Config \ No newline at end of file diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index f5d9fae..3ec0b38 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -15,7 +15,7 @@ from .loader import MODULE_TYPES # configurable_parents = [ # Feeder, # Enricher, -# Archiver, +# Extractor, # Database, # Storage, # Formatter @@ -23,7 +23,7 @@ from .loader import MODULE_TYPES # ] # feeder: Feeder # formatter: Formatter -# archivers: List[Archiver] = field(default_factory=[]) +# extractors: List[Extractor] = field(default_factory=[]) # enrichers: List[Enricher] = field(default_factory=[]) # storages: List[Storage] = field(default_factory=[]) # databases: List[Database] = field(default_factory=[]) diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 1b4fee0..38edafe 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -33,7 +33,7 @@ class ArchivingOrchestrator: # self.feeder: Feeder = config.feeder # self.formatter: Formatter = config.formatter # self.enrichers: List[Enricher] = config.enrichers - # self.archivers: List[Archiver] = config.archivers + # self.extractors: List[Extractor] = config.extractors # self.databases: List[Database] = config.databases # self.storages: List[Storage] = config.storages # ArchivingContext.set("storages", self.storages, keep_on_reset=True) @@ -80,7 +80,7 @@ class ArchivingOrchestrator: for module_type in MODULE_TYPES: enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", [])) - # add in any extra modules that have been passed on the command line for 'feeders', 'enrichers', 'archivers', 'databases', 'storages', 'formatter' + # add in any extra modules that have been passed on the command line for 'feeders', 'enrichers', 'extractors', 'databases', 'storages', 'formatter' for module_type in MODULE_TYPES: if modules := getattr(basic_config, 
f"{module_type}s", []): enabled_modules.extend(modules) @@ -98,7 +98,7 @@ class ArchivingOrchestrator: self.add_module_args(available_modules(with_manifest=True), parser) - breakpoint() + # breakpoint() parser.set_defaults(**to_dot_notation(yaml_config)) # reload the parser with the new arguments, now that we have them @@ -165,7 +165,8 @@ class ArchivingOrchestrator: for module_type in MODULE_TYPES: if module_type == 'enricher': - breakpoint() + pass + # breakpoint() step_items = [] modules_to_load = self.config['steps'][f"{module_type}s"] @@ -228,7 +229,7 @@ class ArchivingOrchestrator: def cleanup(self)->None: logger.info("Cleaning up") for e in self.config['steps']['extractors']: - breakpoint() + # breakpoint() e.cleanup() def feed(self) -> Generator[Metadata]: diff --git a/src/auto_archiver/databases/__init__.py b/src/auto_archiver/databases/__init__.py deleted file mode 100644 index 3a8d787..0000000 --- a/src/auto_archiver/databases/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -""" Databases are used to store the outputs from running the Autp Archiver. - - -""" -from .database import Database diff --git a/src/auto_archiver/enrichers/__init__.py b/src/auto_archiver/enrichers/__init__.py deleted file mode 100644 index 67cb0e5..0000000 --- a/src/auto_archiver/enrichers/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -""" -Enrichers are modular components that enhance archived content by adding -context, metadata, or additional processing. - -These add additional information to the context, such as screenshots, hashes, and metadata. -They are designed to work within the archiving pipeline, operating on `Metadata` objects after -the archiving step and before storage or formatting. - -Enrichers are optional but highly useful for making the archived data more powerful. 
- - -""" diff --git a/src/auto_archiver/enrichers/enricher.py b/src/auto_archiver/enrichers/enricher.py deleted file mode 100644 index f195f23..0000000 --- a/src/auto_archiver/enrichers/enricher.py +++ /dev/null @@ -1,22 +0,0 @@ -""" Base classes and utilities for enrichers in the Auto-Archiver system. -""" -from __future__ import annotations -from dataclasses import dataclass -from abc import abstractmethod, ABC -from ..core import Metadata, Step - -@dataclass -class Enricher(Step, ABC): - name = "enricher" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - - - # only for typing... - def init(name: str, config: dict) -> Enricher: - return Step.init(name, config, Enricher) - - @abstractmethod - def enrich(self, to_enrich: Metadata) -> None: pass diff --git a/src/auto_archiver/feeders/__init__.py b/src/auto_archiver/feeders/__init__.py deleted file mode 100644 index 3eb33d7..0000000 --- a/src/auto_archiver/feeders/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -""" Feeders handle the input of media into the Auto Archiver. - -""" diff --git a/src/auto_archiver/formatters/__init__.py b/src/auto_archiver/formatters/__init__.py deleted file mode 100644 index 1a9dcd0..0000000 --- a/src/auto_archiver/formatters/__init__.py +++ /dev/null @@ -1 +0,0 @@ -""" Formatters for the output of the content. 
""" diff --git a/src/auto_archiver/modules/api_db/api_db.py b/src/auto_archiver/modules/api_db/api_db.py index fa1ae75..44373c6 100644 --- a/src/auto_archiver/modules/api_db/api_db.py +++ b/src/auto_archiver/modules/api_db/api_db.py @@ -2,7 +2,7 @@ from typing import Union import requests, os from loguru import logger -from auto_archiver.databases import Database +from auto_archiver.base_modules import Database from auto_archiver.core import Metadata diff --git a/src/auto_archiver/formatters/templates/__init__.py b/src/auto_archiver/modules/atlos/__init__.py similarity index 100% rename from src/auto_archiver/formatters/templates/__init__.py rename to src/auto_archiver/modules/atlos/__init__.py diff --git a/src/auto_archiver/modules/atlos/__manifest__.py b/src/auto_archiver/modules/atlos/__manifest__.py new file mode 100644 index 0000000..cc357e3 --- /dev/null +++ b/src/auto_archiver/modules/atlos/__manifest__.py @@ -0,0 +1,38 @@ +{ + "name": "atlos_storage", + "type": ["storage"], + "requires_setup": True, + "external_dependencies": { + "python": ["loguru", "requests"], + "bin": [""] + }, + "configs": { + # TODO: get base storage configs + # TODO also? get_atlos_config_options() + + "api_token": { + "default": None, + "help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/", + "cli_set": lambda cli_val, _: cli_val + }, + "atlos_url": { + "default": "https://platform.atlos.org", + "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", + "cli_set": lambda cli_val, _: cli_val + }, + }, + "description": """ + AtlosStorage: A storage module for saving media files to the Atlos platform. + + ### Features + - Uploads media files to Atlos using Atlos-specific APIs. + - Automatically calculates SHA-256 hashes of media files for integrity verification. + - Skips uploads for files that already exist on Atlos with the same hash. 
+ - Supports attaching metadata, such as `atlos_id`, to the uploaded files. + - Provides CDN-like URLs for accessing uploaded media. + + ### Notes + - Requires Atlos API configuration, including `atlos_url` and `api_token`. + - Files are linked to an `atlos_id` in the metadata, ensuring proper association with Atlos source materials. + """ +} diff --git a/src/auto_archiver/storages/atlos.py b/src/auto_archiver/modules/atlos/atlos.py similarity index 94% rename from src/auto_archiver/storages/atlos.py rename to src/auto_archiver/modules/atlos/atlos.py index 3b13aa0..28b7cb1 100644 --- a/src/auto_archiver/storages/atlos.py +++ b/src/auto_archiver/modules/atlos/atlos.py @@ -4,9 +4,9 @@ from loguru import logger import requests import hashlib -from ..core import Media, Metadata -from ..storages import Storage -from ..utils import get_atlos_config_options +from auto_archiver.core import Media, Metadata +from auto_archiver.base_modules import Storage +from auto_archiver.utils import get_atlos_config_options class AtlosStorage(Storage): diff --git a/src/auto_archiver/modules/atlos_db/atlos_db.py b/src/auto_archiver/modules/atlos_db/atlos_db.py index 376ba32..cbf1c89 100644 --- a/src/auto_archiver/modules/atlos_db/atlos_db.py +++ b/src/auto_archiver/modules/atlos_db/atlos_db.py @@ -1,11 +1,12 @@ import os + from typing import Union from loguru import logger from csv import DictWriter from dataclasses import asdict import requests -from auto_archiver.databases import Database +from auto_archiver.base_modules import Database from auto_archiver.core import Metadata from auto_archiver.utils import get_atlos_config_options diff --git a/src/auto_archiver/modules/atlos_db/base_configs.py b/src/auto_archiver/modules/atlos_db/base_configs.py new file mode 100644 index 0000000..c47c711 --- /dev/null +++ b/src/auto_archiver/modules/atlos_db/base_configs.py @@ -0,0 +1,13 @@ +def get_atlos_config_options(): + return { + "api_token": { + "default": None, + "help": "An Atlos API token. 
For more information, see https://docs.atlos.org/technical/api/", + "cli_set": lambda cli_val, _: cli_val + }, + "atlos_url": { + "default": "https://platform.atlos.org", + "help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.", + "cli_set": lambda cli_val, _: cli_val + }, + } \ No newline at end of file diff --git a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py index d344139..0810b73 100644 --- a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py +++ b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py @@ -1,7 +1,7 @@ from loguru import logger import requests -from auto_archiver.feeders import Feeder +from auto_archiver.base_modules import Feeder from auto_archiver.core import Metadata, ArchivingContext from auto_archiver.utils import get_atlos_config_options diff --git a/src/auto_archiver/modules/cli_feeder/cli_feeder.py b/src/auto_archiver/modules/cli_feeder/cli_feeder.py index 1376379..e826533 100644 --- a/src/auto_archiver/modules/cli_feeder/cli_feeder.py +++ b/src/auto_archiver/modules/cli_feeder/cli_feeder.py @@ -1,6 +1,6 @@ from loguru import logger -from auto_archiver.feeders import Feeder +from auto_archiver.base_modules import Feeder from auto_archiver.core import Metadata, ArchivingContext diff --git a/src/auto_archiver/modules/console_db/console_db.py b/src/auto_archiver/modules/console_db/console_db.py index 357c696..a0d43b7 100644 --- a/src/auto_archiver/modules/console_db/console_db.py +++ b/src/auto_archiver/modules/console_db/console_db.py @@ -1,6 +1,6 @@ from loguru import logger -from auto_archiver.databases import Database +from auto_archiver.base_modules import Database from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/csv_db/csv_db.py b/src/auto_archiver/modules/csv_db/csv_db.py index 642e889..6e5d873 100644 --- a/src/auto_archiver/modules/csv_db/csv_db.py +++ 
b/src/auto_archiver/modules/csv_db/csv_db.py @@ -3,7 +3,7 @@ from loguru import logger from csv import DictWriter from dataclasses import asdict -from auto_archiver.databases import Database +from auto_archiver.base_modules import Database from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/csv_feeder/csv_feeder.py b/src/auto_archiver/modules/csv_feeder/csv_feeder.py index b665bd9..4cf2f11 100644 --- a/src/auto_archiver/modules/csv_feeder/csv_feeder.py +++ b/src/auto_archiver/modules/csv_feeder/csv_feeder.py @@ -1,7 +1,7 @@ from loguru import logger import csv -from auto_archiver.feeders import Feeder +from auto_archiver.base_modules import Feeder from auto_archiver.core import Metadata, ArchivingContext from auto_archiver.utils import url_or_none diff --git a/src/auto_archiver/modules/gsheet_db/__init__.py b/src/auto_archiver/modules/gdrive_storage/__init__.py similarity index 100% rename from src/auto_archiver/modules/gsheet_db/__init__.py rename to src/auto_archiver/modules/gdrive_storage/__init__.py diff --git a/src/auto_archiver/modules/gdrive_storage/__manifest__.py b/src/auto_archiver/modules/gdrive_storage/__manifest__.py new file mode 100644 index 0000000..cc598e2 --- /dev/null +++ b/src/auto_archiver/modules/gdrive_storage/__manifest__.py @@ -0,0 +1,34 @@ +m = { + "name": "Google Drive Storage", + "type": ["storage"], + "requires_setup": True, + "external_dependencies": { + "python": [ + "loguru", + "google-api-python-client", + "google-auth", + "google-auth-oauthlib", + "google-auth-httplib2" + ], + }, + "configs": { + # TODO: get base storage configs + "root_folder_id": {"default": None, "help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"}, + "oauth_token": {"default": None, "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. 
NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."}, + "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account."}, + }, + "description": """ + GDriveStorage: A storage module for saving archived content to Google Drive. + + ### Features + - Saves media files to Google Drive, organizing them into folders based on the provided path structure. + - Supports OAuth token-based authentication or service account credentials for API access. + - Automatically creates folders in Google Drive if they don't exist. + - Retrieves CDN URLs for stored files, enabling easy sharing and access. + + ### Notes + - Requires setup with either a Google OAuth token or a service account JSON file. + - Files are uploaded to the specified `root_folder_id` and organized by the `media.key` structure. + - Automatically handles Google Drive API token refreshes for long-running jobs. + """ +} diff --git a/src/auto_archiver/storages/gd.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py similarity index 99% rename from src/auto_archiver/storages/gd.py rename to src/auto_archiver/modules/gdrive_storage/gdrive_storage.py index 61c5b21..2e4ca48 100644 --- a/src/auto_archiver/storages/gd.py +++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py @@ -9,8 +9,8 @@ from google.oauth2 import service_account from google.oauth2.credentials import Credentials from google.auth.transport.requests import Request -from ..core import Media -from . 
import Storage +from auto_archiver.core import Media +from auto_archiver.base_modules import Storage class GDriveStorage(Storage): diff --git a/src/auto_archiver/modules/generic_extractor/bluesky.py b/src/auto_archiver/modules/generic_extractor/bluesky.py index 7aa9c39..d4051aa 100644 --- a/src/auto_archiver/modules/generic_extractor/bluesky.py +++ b/src/auto_archiver/modules/generic_extractor/bluesky.py @@ -1,17 +1,12 @@ -import os -import mimetypes - -import requests from loguru import logger -from auto_archiver.core.context import ArchivingContext -from auto_archiver.archivers.archiver import Archiver +from auto_archiver.base_modules.extractor import Extractor from auto_archiver.core.metadata import Metadata, Media from .dropin import GenericDropin, InfoExtractor class Bluesky(GenericDropin): - def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata: + def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata: result = Metadata() result.set_url(url) result.set_title(post["record"]["text"]) @@ -42,7 +37,7 @@ class Bluesky(GenericDropin): - def _download_bsky_embeds(self, post: dict, archiver: Archiver) -> list[Media]: + def _download_bsky_embeds(self, post: dict, archiver: Extractor) -> list[Media]: """ Iterates over image(s) or video in a Bluesky post and downloads them """ diff --git a/src/auto_archiver/modules/generic_extractor/dropin.py b/src/auto_archiver/modules/generic_extractor/dropin.py index 37f3faf..9de63d2 100644 --- a/src/auto_archiver/modules/generic_extractor/dropin.py +++ b/src/auto_archiver/modules/generic_extractor/dropin.py @@ -1,6 +1,6 @@ from yt_dlp.extractor.common import InfoExtractor from auto_archiver.core.metadata import Metadata -from auto_archiver.archivers.archiver import Archiver +from auto_archiver.base_modules.extractor import Extractor class GenericDropin: """Base class for dropins for the generic extractor. 
@@ -30,7 +30,7 @@ class GenericDropin: raise NotImplementedError("This method should be implemented in the subclass") - def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata: + def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata: """ This method should create a Metadata object from the post data. """ diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 27fe157..8e4b2c4 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -5,10 +5,10 @@ from yt_dlp.extractor.common import InfoExtractor from loguru import logger -from auto_archiver.archivers.archiver import Archiver +from auto_archiver.base_modules.extractor import Extractor from ...core import Metadata, Media, ArchivingContext -class GenericExtractor(Archiver): +class GenericExtractor(Extractor): name = "youtubedl_archiver" #left as is for backwards compat _dropins = {} diff --git a/src/auto_archiver/modules/generic_extractor/truth.py b/src/auto_archiver/modules/generic_extractor/truth.py index bf19dce..e713c90 100644 --- a/src/auto_archiver/modules/generic_extractor/truth.py +++ b/src/auto_archiver/modules/generic_extractor/truth.py @@ -2,7 +2,7 @@ from typing import Type from auto_archiver.utils import traverse_obj from auto_archiver.core.metadata import Metadata, Media -from auto_archiver.archivers.archiver import Archiver +from auto_archiver.base_modules.extractor import Extractor from yt_dlp.extractor.common import InfoExtractor from dateutil.parser import parse as parse_dt @@ -19,7 +19,7 @@ class Truth(GenericDropin): def skip_ytdlp_download(self, url, ie_instance: Type[InfoExtractor]) -> bool: return True - def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> 
Metadata: + def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata: """ Creates metadata from a truth social post diff --git a/src/auto_archiver/modules/generic_extractor/twitter.py b/src/auto_archiver/modules/generic_extractor/twitter.py index ce6c28d..6cd22b1 100644 --- a/src/auto_archiver/modules/generic_extractor/twitter.py +++ b/src/auto_archiver/modules/generic_extractor/twitter.py @@ -6,7 +6,7 @@ from slugify import slugify from auto_archiver.core.metadata import Metadata, Media from auto_archiver.utils import UrlUtil -from auto_archiver.archivers.archiver import Archiver +from auto_archiver.base_modules.extractor import Extractor from .dropin import GenericDropin, InfoExtractor @@ -32,7 +32,7 @@ class Twitter(GenericDropin): twid = ie_instance._match_valid_url(url).group('id') return ie_instance._extract_status(twid=twid) - def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata: + def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata: result = Metadata() try: if not tweet.get("user") or not tweet.get("created_at"): diff --git a/src/auto_archiver/modules/gsheet_db/__manifest__.py b/src/auto_archiver/modules/gsheet_db/__manifest__.py deleted file mode 100644 index f4db93b..0000000 --- a/src/auto_archiver/modules/gsheet_db/__manifest__.py +++ /dev/null @@ -1,21 +0,0 @@ -# TODO merge with feeder manifest? -{ - "name": "gsheet_db", - "type": ["database"], - "requires_setup": True, - "external_dependencies": {"python": [" loguru"], - }, - "description": """ -Handles integration with Google Sheets for tracking archival tasks. - -### Features -- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used. -- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns. 
-- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet. -- Skips redundant updates for empty or invalid data fields. - -### Notes -- Currently works only with metadata provided by GsheetFeeder. -- Requires configuration of a linked Google Sheet and appropriate API credentials. -""", -} diff --git a/src/auto_archiver/modules/gsheet_feeder/__init__.py b/src/auto_archiver/modules/gsheet_processor/__init__.py similarity index 100% rename from src/auto_archiver/modules/gsheet_feeder/__init__.py rename to src/auto_archiver/modules/gsheet_processor/__init__.py diff --git a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py b/src/auto_archiver/modules/gsheet_processor/__manifest__.py similarity index 65% rename from src/auto_archiver/modules/gsheet_feeder/__manifest__.py rename to src/auto_archiver/modules/gsheet_processor/__manifest__.py index 2af090c..8a554fe 100644 --- a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py +++ b/src/auto_archiver/modules/gsheet_processor/__manifest__.py @@ -1,5 +1,5 @@ { - "name": "Google Sheets Feeder", + "name": "Google Sheets Processor", "type": ["feeder"], "requires_setup": True, "external_dependencies": { @@ -22,7 +22,12 @@ } }, "description": """ - GsheetsFeeder: A Google Sheets-based feeder for the Auto Archiver. + Google Sheets Module. + + Handles feeding from a Google Sheet as well as an optional write back to the sheet. + + ## GsheetsFeeder + A Google Sheets-based feeder for the Auto Archiver. This reads data from Google Sheets and filters rows based on user-defined rules. The filtered rows are processed into `Metadata` objects. @@ -36,5 +41,18 @@ ### Notes - Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/gsheets_service_account.json`. - Create the sheet using the template provided in the docs. + + ## GsheetsDatabase: + Handles integration with Google Sheets for tracking archival tasks. 
+ +### Features +- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used. +- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns. +- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet. +- Skips redundant updates for empty or invalid data fields. + +### Notes +- Currently works only with metadata provided by GsheetFeeder. +- Requires configuration of a linked Google Sheet and appropriate API credentials. """ } diff --git a/src/auto_archiver/modules/gsheet_db/gsheet_db.py b/src/auto_archiver/modules/gsheet_processor/gsheet_db.py similarity index 98% rename from src/auto_archiver/modules/gsheet_db/gsheet_db.py rename to src/auto_archiver/modules/gsheet_processor/gsheet_db.py index 8e17966..cf46473 100644 --- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py +++ b/src/auto_archiver/modules/gsheet_processor/gsheet_db.py @@ -1,10 +1,11 @@ from typing import Union, Tuple + import datetime from urllib.parse import quote from loguru import logger -from auto_archiver.databases import Database +from auto_archiver.base_modules import Database from auto_archiver.core import Metadata, Media, ArchivingContext from auto_archiver.utils import GWorksheet diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_processor/gsheet_feeder.py similarity index 98% rename from src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py rename to src/auto_archiver/modules/gsheet_processor/gsheet_feeder.py index 5c73bf6..4df9042 100644 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_processor/gsheet_feeder.py @@ -13,8 +13,7 @@ import gspread, os from loguru import logger from slugify import slugify -# from . 
import Enricher -from auto_archiver.feeders import Feeder +from auto_archiver.base_modules import Feeder from auto_archiver.core import Metadata, ArchivingContext from auto_archiver.utils import Gsheets, GWorksheet diff --git a/src/auto_archiver/modules/hash_enricher/__init__.py b/src/auto_archiver/modules/hash_enricher/__init__.py index e69de29..e7faff7 100644 --- a/src/auto_archiver/modules/hash_enricher/__init__.py +++ b/src/auto_archiver/modules/hash_enricher/__init__.py @@ -0,0 +1 @@ +from hash_enricher import HashEnricher \ No newline at end of file diff --git a/src/auto_archiver/modules/hash_enricher/__manifest__.py b/src/auto_archiver/modules/hash_enricher/__manifest__.py index 311ed6f..eef1963 100644 --- a/src/auto_archiver/modules/hash_enricher/__manifest__.py +++ b/src/auto_archiver/modules/hash_enricher/__manifest__.py @@ -7,7 +7,7 @@ }, "configs": { "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]}, - "chunksize": {"default": int(1.6e7), "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"}, + "chunksize": {"default": 1.6e7, "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"}, }, "description": """ Generates cryptographic hashes for media files to ensure data integrity and authenticity. diff --git a/src/auto_archiver/modules/hash_enricher/hash_enricher.py b/src/auto_archiver/modules/hash_enricher/hash_enricher.py index 355413a..c8eacb1 100644 --- a/src/auto_archiver/modules/hash_enricher/hash_enricher.py +++ b/src/auto_archiver/modules/hash_enricher/hash_enricher.py @@ -10,7 +10,7 @@ making it suitable for handling large files efficiently. 
import hashlib from loguru import logger -from auto_archiver.enrichers import Enricher +from auto_archiver.base_modules import Enricher from auto_archiver.core import Metadata, ArchivingContext @@ -40,7 +40,11 @@ class HashEnricher(Enricher): else: self.chunksize = self.configs()["chunksize"]["default"] - self.chunksize = int(self.chunksize) + try: + self.chunksize = int(self.chunksize) + except ValueError: + raise ValueError(f"Invalid chunksize value: {self.chunksize}. Must be an integer.") + assert self.chunksize >= -1, "read length must be non-negative or -1" ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True) diff --git a/src/auto_archiver/modules/instagram_api_archiver/__init__.py b/src/auto_archiver/modules/html_formatter/__init__.py similarity index 100% rename from src/auto_archiver/modules/instagram_api_archiver/__init__.py rename to src/auto_archiver/modules/html_formatter/__init__.py diff --git a/src/auto_archiver/modules/html_formatter/__manifest__.py b/src/auto_archiver/modules/html_formatter/__manifest__.py new file mode 100644 index 0000000..55ca5da --- /dev/null +++ b/src/auto_archiver/modules/html_formatter/__manifest__.py @@ -0,0 +1,13 @@ +m = { + "name": "HTML Formatter", + "type": ["formatter"], + "requires_setup": False, + "external_dependencies": { + "python": ["loguru", "jinja2"], + "bin": [""] + }, + "configs": { + "detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"} + }, + "description": """ """, +} diff --git a/src/auto_archiver/formatters/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py similarity index 84% rename from src/auto_archiver/formatters/html_formatter.py rename to src/auto_archiver/modules/html_formatter/html_formatter.py index 5d95474..cc8a4da 100644 --- a/src/auto_archiver/formatters/html_formatter.py +++ b/src/auto_archiver/modules/html_formatter/html_formatter.py @@ -7,11 +7,11 @@ 
from loguru import logger import json import base64 -from ..version import __version__ -from ..core import Metadata, Media, ArchivingContext -from . import Formatter -from ..enrichers import HashEnricher -from ..utils.misc import random_str +from auto_archiver.version import __version__ +from auto_archiver.core import Metadata, Media, ArchivingContext +from auto_archiver.base_modules import Formatter +from auto_archiver.modules.hash_enricher import HashEnricher +from auto_archiver.utils.misc import random_str @dataclass @@ -28,11 +28,11 @@ class HtmlFormatter(Formatter): }) self.template = self.environment.get_template("html_template.html") - @staticmethod - def configs() -> dict: - return { - "detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"} - } + # @staticmethod + # def configs() -> dict: + # return { + # "detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"} + # } def format(self, item: Metadata) -> Media: url = item.get_url() diff --git a/src/auto_archiver/modules/instagram_archiver/__init__.py b/src/auto_archiver/modules/html_formatter/templates/__init__.py similarity index 100% rename from src/auto_archiver/modules/instagram_archiver/__init__.py rename to src/auto_archiver/modules/html_formatter/templates/__init__.py diff --git a/src/auto_archiver/formatters/templates/html_template.html b/src/auto_archiver/modules/html_formatter/templates/html_template.html similarity index 100% rename from src/auto_archiver/formatters/templates/html_template.html rename to src/auto_archiver/modules/html_formatter/templates/html_template.html diff --git a/src/auto_archiver/formatters/templates/macros.html b/src/auto_archiver/modules/html_formatter/templates/macros.html similarity index 100% rename from src/auto_archiver/formatters/templates/macros.html rename to 
src/auto_archiver/modules/html_formatter/templates/macros.html diff --git a/src/auto_archiver/modules/instagram_tbot_archiver/__init__.py b/src/auto_archiver/modules/instagram_api_extractor/__init__.py similarity index 100% rename from src/auto_archiver/modules/instagram_tbot_archiver/__init__.py rename to src/auto_archiver/modules/instagram_api_extractor/__init__.py diff --git a/src/auto_archiver/modules/instagram_api_archiver/__manifest__.py b/src/auto_archiver/modules/instagram_api_extractor/__manifest__.py similarity index 95% rename from src/auto_archiver/modules/instagram_api_archiver/__manifest__.py rename to src/auto_archiver/modules/instagram_api_extractor/__manifest__.py index b2225fa..cdaf635 100644 --- a/src/auto_archiver/modules/instagram_api_archiver/__manifest__.py +++ b/src/auto_archiver/modules/instagram_api_extractor/__manifest__.py @@ -1,7 +1,6 @@ { - "name": "Instagram API Archiver", + "name": "Instagram API Extractor", "type": ["extractor"], - "entry_point": "instagram_api_archiver:InstagramApiArchiver", "external_dependencies": {"python": ["requests", "loguru", diff --git a/src/auto_archiver/modules/instagram_api_archiver/instagram_api_archiver.py b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_archiver.py similarity index 98% rename from src/auto_archiver/modules/instagram_api_archiver/instagram_api_archiver.py rename to src/auto_archiver/modules/instagram_api_extractor/instagram_api_archiver.py index dc3f1ec..5206b41 100644 --- a/src/auto_archiver/modules/instagram_api_archiver/instagram_api_archiver.py +++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_archiver.py @@ -1,5 +1,5 @@ """ -The `instagram_api_archiver` module provides tools for archiving various types of Instagram content +The `instagram_api_extractor` module provides tools for archiving various types of Instagram content using the [Instagrapi API](https://github.com/subzeroid/instagrapi). 
Connects to an Instagrapi API deployment and allows for downloading Instagram user profiles, @@ -16,19 +16,19 @@ from loguru import logger from retrying import retry from tqdm import tqdm -from auto_archiver.archivers import Archiver +from auto_archiver.base_modules import Extractor from auto_archiver.core import Media from auto_archiver.core import Metadata -class InstagramAPIArchiver(Archiver): +class InstagramAPIExtractor(Extractor): """ Uses an https://github.com/subzeroid/instagrapi API deployment to fetch instagram posts data # TODO: improvement collect aggregates of locations[0].location and mentions for all posts """ - name = "instagram_api_archiver" + name = "instagram_api_extractor" global_pattern = re.compile( r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?" diff --git a/src/auto_archiver/modules/telegram_archiver/__init__.py b/src/auto_archiver/modules/instagram_extractor/__init__.py similarity index 100% rename from src/auto_archiver/modules/telegram_archiver/__init__.py rename to src/auto_archiver/modules/instagram_extractor/__init__.py diff --git a/src/auto_archiver/modules/instagram_archiver/__manifest__.py b/src/auto_archiver/modules/instagram_extractor/__manifest__.py similarity index 93% rename from src/auto_archiver/modules/instagram_archiver/__manifest__.py rename to src/auto_archiver/modules/instagram_extractor/__manifest__.py index 44cd7bb..f1857c2 100644 --- a/src/auto_archiver/modules/instagram_archiver/__manifest__.py +++ b/src/auto_archiver/modules/instagram_extractor/__manifest__.py @@ -1,7 +1,6 @@ { - "name": "Instagram Archiver", + "name": "Instagram Extractor", "type": ["extractor"], - "entry_point": "instagram_archiver:InstagramArchiver", "external_dependencies": { "python": [ "instaloader", diff --git a/src/auto_archiver/modules/instagram_archiver/instagram_archiver.py b/src/auto_archiver/modules/instagram_extractor/instagram_archiver.py similarity index 96% rename from 
src/auto_archiver/modules/instagram_archiver/instagram_archiver.py rename to src/auto_archiver/modules/instagram_extractor/instagram_archiver.py index 7daf291..c6bde62 100644 --- a/src/auto_archiver/modules/instagram_archiver/instagram_archiver.py +++ b/src/auto_archiver/modules/instagram_extractor/instagram_archiver.py @@ -7,15 +7,15 @@ import re, os, shutil, traceback import instaloader # https://instaloader.github.io/as-module.html from loguru import logger -from auto_archiver.archivers import Archiver +from auto_archiver.base_modules import Extractor from auto_archiver.core import Metadata from auto_archiver.core import Media -class InstagramArchiver(Archiver): +class InstagramExtractor(Extractor): """ Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...) """ - name = "instagram_archiver" + name = "instagram_extractor" # NB: post regex should be tested before profile # https://regex101.com/r/MGPquX/1 @@ -67,7 +67,7 @@ class InstagramArchiver(Archiver): elif len(profile_matches): result = self.download_profile(url, profile_matches[0]) except Exception as e: - logger.error(f"Failed to download with instagram archiver due to: {e}, make sure your account credentials are valid.") + logger.error(f"Failed to download with instagram extractor due to: {e}, make sure your account credentials are valid.") finally: shutil.rmtree(self.download_folder, ignore_errors=True) return result diff --git a/src/auto_archiver/modules/telethon_archiver/__init__.py b/src/auto_archiver/modules/instagram_tbot_extractor/__init__.py similarity index 100% rename from src/auto_archiver/modules/telethon_archiver/__init__.py rename to src/auto_archiver/modules/instagram_tbot_extractor/__init__.py diff --git a/src/auto_archiver/modules/instagram_tbot_archiver/__manifest__.py b/src/auto_archiver/modules/instagram_tbot_extractor/__manifest__.py similarity index 82% rename from 
src/auto_archiver/modules/instagram_tbot_archiver/__manifest__.py rename to src/auto_archiver/modules/instagram_tbot_extractor/__manifest__.py index 6e934b0..95d6808 100644 --- a/src/auto_archiver/modules/instagram_tbot_archiver/__manifest__.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/__manifest__.py @@ -1,7 +1,6 @@ { - "name": "Instagram Telegram Bot Archiver", + "name": "Instagram Telegram Bot Extractor", "type": ["extractor"], - "entry_point": "instagram_tbot_archiver:InstagramTbotArchiver", "external_dependencies": {"python": ["loguru", "telethon",], }, @@ -13,7 +12,7 @@ "timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."}, }, "description": """ -The `InstagramTbotArchiver` module uses a Telegram bot (`instagram_load_bot`) to fetch and archive Instagram content, +The `InstagramTbotExtractor` module uses a Telegram bot (`instagram_load_bot`) to fetch and archive Instagram content, such as posts and stories. It leverages the Telethon library to interact with the Telegram API, sending Instagram URLs to the bot and downloading the resulting media and metadata. The downloaded content is stored as `Media` objects and returned as part of a `Metadata` object. @@ -26,7 +25,7 @@ returned as part of a `Metadata` object. ### Setup -To use the `InstagramTbotArchiver`, you need to provide the following configuration settings: +To use the `InstagramTbotExtractor`, you need to provide the following configuration settings: - **API ID and Hash**: Telegram API credentials obtained from [my.telegram.org/apps](https://my.telegram.org/apps). - **Session File**: Optional path to store the Telegram session file for future use. 
diff --git a/src/auto_archiver/modules/instagram_tbot_archiver/instagram_tbot_archiver.py b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_archiver.py similarity index 92% rename from src/auto_archiver/modules/instagram_tbot_archiver/instagram_tbot_archiver.py rename to src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_archiver.py index 3423010..5c3ad24 100644 --- a/src/auto_archiver/modules/instagram_tbot_archiver/instagram_tbot_archiver.py +++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_archiver.py @@ -1,5 +1,5 @@ """ -InstagramTbotArchiver Module +InstagramTbotExtractor Module This module provides functionality to archive Instagram content (posts, stories, etc.) using a Telegram bot (`instagram_load_bot`). It interacts with the Telegram API via the Telethon library to send Instagram URLs to the bot, which retrieves the @@ -15,18 +15,18 @@ from sqlite3 import OperationalError from loguru import logger from telethon.sync import TelegramClient -from auto_archiver.archivers import Archiver +from auto_archiver.base_modules import Extractor from auto_archiver.core import Metadata, Media, ArchivingContext from auto_archiver.utils import random_str -class InstagramTbotArchiver(Archiver): +class InstagramTbotExtractor(Extractor): """ calls a telegram bot to fetch instagram posts/stories... and gets available media from it https://github.com/adw0rd/instagrapi https://t.me/instagram_load_bot """ - name = "instagram_tbot_archiver" + name = "instagram_tbot_extractor" def __init__(self, config: dict) -> None: super().__init__(config) @@ -49,7 +49,7 @@ class InstagramTbotArchiver(Archiver): try: self.client = TelegramClient(self.session_file, self.api_id, self.api_hash) except OperationalError as e: - logger.error(f"Unable to access the {self.session_file} session, please make sure you don't use the same session file here and in telethon_archiver. 
if you do then disable at least one of the archivers for the 1st time you setup telethon session: {e}") + logger.error(f"Unable to access the {self.session_file} session, please make sure you don't use the same session file here and in telethon_extractor. if you do then disable at least one of the archivers for the 1st time you setup telethon session: {e}") with self.client.start(): logger.success(f"SETUP {self.name} login works.") diff --git a/src/auto_archiver/modules/twitter_api_archiver/__init__.py b/src/auto_archiver/modules/local/__init__.py similarity index 100% rename from src/auto_archiver/modules/twitter_api_archiver/__init__.py rename to src/auto_archiver/modules/local/__init__.py diff --git a/src/auto_archiver/modules/local/__manifest__.py b/src/auto_archiver/modules/local/__manifest__.py new file mode 100644 index 0000000..5220555 --- /dev/null +++ b/src/auto_archiver/modules/local/__manifest__.py @@ -0,0 +1,26 @@ +m = { + "name": "Local Storage", + "type": ["storage"], + "requires_setup": False, + "external_dependencies": { + "python": ["loguru"], + }, + "configs": { + # TODO: get base storage configs + "save_to": {"default": "./archived", "help": "folder where to save archived content"}, + "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"}, + }, + "description": """ + LocalStorage: A storage module for saving archived content locally on the filesystem. + + ### Features + - Saves archived media files to a specified folder on the local filesystem. + - Maintains file metadata during storage using `shutil.copy2`. + - Supports both absolute and relative paths for stored files, configurable via `save_absolute`. + - Automatically creates directories as needed for storing files. + + ### Notes + - Default storage folder is `./archived`, but this can be changed via the `save_to` configuration. 
+ - The `save_absolute` option can reveal the file structure in output formats; use with caution. + """ +} diff --git a/src/auto_archiver/storages/local.py b/src/auto_archiver/modules/local/local.py similarity index 94% rename from src/auto_archiver/storages/local.py rename to src/auto_archiver/modules/local/local.py index aa08e49..ef0966d 100644 --- a/src/auto_archiver/storages/local.py +++ b/src/auto_archiver/modules/local/local.py @@ -4,8 +4,8 @@ from typing import IO import os from loguru import logger -from ..core import Media -from ..storages import Storage +from auto_archiver.core import Media +from auto_archiver.base_modules import Storage class LocalStorage(Storage): diff --git a/src/auto_archiver/modules/meta_enricher/meta_enricher.py b/src/auto_archiver/modules/meta_enricher/meta_enricher.py index ab0e73d..52d8eb2 100644 --- a/src/auto_archiver/modules/meta_enricher/meta_enricher.py +++ b/src/auto_archiver/modules/meta_enricher/meta_enricher.py @@ -2,7 +2,7 @@ import datetime import os from loguru import logger -from auto_archiver.enrichers import Enricher +from auto_archiver.base_modules import Enricher from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py index 5887d16..b729d36 100644 --- a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py +++ b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py @@ -2,7 +2,7 @@ import subprocess import traceback from loguru import logger -from auto_archiver.enrichers import Enricher +from auto_archiver.base_modules import Enricher from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/vk_archiver/__init__.py b/src/auto_archiver/modules/mute_formatter/__init__.py similarity index 100% rename from src/auto_archiver/modules/vk_archiver/__init__.py rename to src/auto_archiver/modules/mute_formatter/__init__.py diff --git 
a/src/auto_archiver/modules/mute_formatter/__manifest__.py b/src/auto_archiver/modules/mute_formatter/__manifest__.py new file mode 100644 index 0000000..af3f83a --- /dev/null +++ b/src/auto_archiver/modules/mute_formatter/__manifest__.py @@ -0,0 +1,9 @@ +m = { + "name": "Mute Formatter", + "type": ["formatter"], + "requires_setup": False, + "external_dependencies": { + }, + "description": """ Default formatter. + """, +} diff --git a/src/auto_archiver/formatters/mute_formatter.py b/src/auto_archiver/modules/mute_formatter/mute_formatter.py similarity index 100% rename from src/auto_archiver/formatters/mute_formatter.py rename to src/auto_archiver/modules/mute_formatter/mute_formatter.py diff --git a/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py index e3e9d10..dc70465 100644 --- a/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py +++ b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py @@ -16,7 +16,7 @@ import numpy as np from PIL import Image, UnidentifiedImageError from loguru import logger -from auto_archiver.enrichers import Enricher +from auto_archiver.base_modules import Enricher from auto_archiver.core import Metadata diff --git a/src/auto_archiver/modules/s3/__init__.py b/src/auto_archiver/modules/s3/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/s3/__manifest__.py b/src/auto_archiver/modules/s3/__manifest__.py new file mode 100644 index 0000000..239e0fe --- /dev/null +++ b/src/auto_archiver/modules/s3/__manifest__.py @@ -0,0 +1,40 @@ +m = { + "name": "S3 Storage", + "type": ["storage"], + "requires_setup": True, + "external_dependencies": { + "python": ["boto3", "loguru"], + }, + "configs": { + # TODO: get base storage configs + "bucket": {"default": None, "help": "S3 bucket name"}, + "region": {"default": None, "help": "S3 region name"}, + "key": {"default": None, "help": "S3 API key"}, + 
"secret": {"default": None, "help": "S3 API secret"}, + "random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"}, + "endpoint_url": { + "default": 'https://{region}.digitaloceanspaces.com', + "help": "S3 bucket endpoint, {region} are inserted at runtime" + }, + "cdn_url": { + "default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}', + "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime" + }, + "private": {"default": False, "help": "if true S3 files will not be readable online"}, + }, + "description": """ + S3Storage: A storage module for saving media files to an S3-compatible object storage. + + ### Features + - Uploads media files to an S3 bucket with customizable configurations. + - Supports `random_no_duplicate` mode to avoid duplicate uploads by checking existing files based on SHA-256 hashes. + - Automatically generates unique paths for files when duplicates are found. + - Configurable endpoint and CDN URL for different S3-compatible providers. + - Supports both private and public file storage, with public files being readable online. + + ### Notes + - Requires S3 credentials (API key and secret) and a bucket name to function. + - The `random_no_duplicate` option ensures no duplicate uploads by leveraging hash-based folder structures. + - Uses `boto3` for interaction with the S3 API. 
+ """ +} diff --git a/src/auto_archiver/storages/s3.py b/src/auto_archiver/modules/s3/s3.py similarity index 95% rename from src/auto_archiver/storages/s3.py rename to src/auto_archiver/modules/s3/s3.py index 5139068..02b0613 100644 --- a/src/auto_archiver/storages/s3.py +++ b/src/auto_archiver/modules/s3/s3.py @@ -2,10 +2,11 @@ from typing import IO import boto3, os -from ..utils.misc import random_str -from ..core import Media -from ..storages import Storage -from ..enrichers import HashEnricher +from auto_archiver.utils.misc import random_str +from auto_archiver.core import Media +from auto_archiver.base_modules import Storage +# TODO +from auto_archiver.modules.hash_enricher import HashEnricher from loguru import logger NO_DUPLICATES_FOLDER = "no-dups/" diff --git a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py index dd1d38a..f99c100 100644 --- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py +++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py @@ -5,7 +5,7 @@ import base64 from selenium.common.exceptions import TimeoutException -from auto_archiver.enrichers import Enricher +from auto_archiver.base_modules import Enricher from auto_archiver.utils import Webdriver, UrlUtil, random_str from auto_archiver.core import Media, Metadata, ArchivingContext diff --git a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py index 0474d8f..aba1d33 100644 --- a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py +++ b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py @@ -3,7 +3,7 @@ from slugify import slugify from urllib.parse import urlparse from loguru import logger -from auto_archiver.enrichers import Enricher +from auto_archiver.base_modules import Enricher from auto_archiver.core import Metadata, ArchivingContext, Media diff --git 
a/src/auto_archiver/modules/telegram_extractor/__init__.py b/src/auto_archiver/modules/telegram_extractor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/telegram_archiver/__manifest__.py b/src/auto_archiver/modules/telegram_extractor/__manifest__.py similarity index 78% rename from src/auto_archiver/modules/telegram_archiver/__manifest__.py rename to src/auto_archiver/modules/telegram_extractor/__manifest__.py index f3950b5..86b5e0f 100644 --- a/src/auto_archiver/modules/telegram_archiver/__manifest__.py +++ b/src/auto_archiver/modules/telegram_extractor/__manifest__.py @@ -1,7 +1,6 @@ { - "name": "Telegram Archiver", + "name": "Telegram Extractor", "type": ["extractor"], - "entry_point": "telegram_archiver:TelegramArchiver", "requires_setup": False, "external_dependencies": { "python": [ @@ -11,7 +10,7 @@ ], }, "description": """ - The `TelegramArchiver` retrieves publicly available media content from Telegram message links without requiring login credentials. + The `TelegramExtractor` retrieves publicly available media content from Telegram message links without requiring login credentials. It processes URLs to fetch images and videos embedded in Telegram messages, ensuring a structured output using `Metadata` and `Media` objects. Recommended for scenarios where login-based archiving is not viable, although `telethon_archiver` is advised for more comprehensive functionality. 
diff --git a/src/auto_archiver/modules/telegram_archiver/telegram_archiver.py b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py similarity index 91% rename from src/auto_archiver/modules/telegram_archiver/telegram_archiver.py rename to src/auto_archiver/modules/telegram_extractor/telegram_extractor.py index c5e5ef0..047d424 100644 --- a/src/auto_archiver/modules/telegram_archiver/telegram_archiver.py +++ b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py @@ -2,16 +2,16 @@ import requests, re, html from bs4 import BeautifulSoup from loguru import logger -from auto_archiver.archivers import Archiver +from auto_archiver.base_modules import Extractor from auto_archiver.core import Metadata, Media -class TelegramArchiver(Archiver): +class TelegramExtractor(Extractor): """ - Archiver for telegram that does not require login, but the telethon_archiver is much more advised, + Extractor for telegram that does not require login, but the telethon_extractor is much more advised, will only return if at least one image or one video is found """ - name = "telegram_archiver" + name = "telegram_extractor" def __init__(self, config: dict) -> None: super().__init__(config) diff --git a/src/auto_archiver/modules/telethon_extractor/__init__.py b/src/auto_archiver/modules/telethon_extractor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/telethon_archiver/__manifest__.py b/src/auto_archiver/modules/telethon_extractor/__manifest__.py similarity index 90% rename from src/auto_archiver/modules/telethon_archiver/__manifest__.py rename to src/auto_archiver/modules/telethon_extractor/__manifest__.py index d44acf3..6f09ea6 100644 --- a/src/auto_archiver/modules/telethon_archiver/__manifest__.py +++ b/src/auto_archiver/modules/telethon_extractor/__manifest__.py @@ -1,8 +1,7 @@ # TODO rm dependency on json { - "name": "telethon_archiver", + "name": "telethon_extractor", "type": ["extractor"], - "entry_point": 
"telethon_archiver:TelethonArchiver", "requires_setup": True, "external_dependencies": { "python": ["telethon", @@ -25,7 +24,7 @@ } }, "description": """ -The `TelethonArchiver` uses the Telethon library to archive posts and media from Telegram channels and groups. +The `TelethonExtractor` uses the Telethon library to archive posts and media from Telegram channels and groups. It supports private and public channels, downloading grouped posts with media, and can join channels using invite links if provided in the configuration. @@ -37,7 +36,7 @@ if provided in the configuration. - Outputs structured metadata and media using `Metadata` and `Media` objects. ### Setup -To use the `TelethonArchiver`, you must configure the following: +To use the `TelethonExtractor`, you must configure the following: - **API ID and API Hash**: Obtain these from [my.telegram.org](https://my.telegram.org/apps). - **Session File**: Optional, but records login sessions for future use (default: `secrets/anon.session`). - **Bot Token**: Optional, allows access to additional content (e.g., large videos) but limits private channel archiving. 
diff --git a/src/auto_archiver/modules/telethon_archiver/telethon_archiver.py b/src/auto_archiver/modules/telethon_extractor/telethon_archiver.py similarity index 98% rename from src/auto_archiver/modules/telethon_archiver/telethon_archiver.py rename to src/auto_archiver/modules/telethon_extractor/telethon_archiver.py index fc89c9e..811a280 100644 --- a/src/auto_archiver/modules/telethon_archiver/telethon_archiver.py +++ b/src/auto_archiver/modules/telethon_extractor/telethon_archiver.py @@ -8,13 +8,13 @@ from loguru import logger from tqdm import tqdm import re, time, json, os -from auto_archiver.archivers import Archiver +from auto_archiver.base_modules import Extractor from auto_archiver.core import Metadata, Media, ArchivingContext from auto_archiver.utils import random_str -class TelethonArchiver(Archiver): - name = "telethon_archiver" +class TelethonArchiver(Extractor): + name = "telethon_extractor" link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)") invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)") diff --git a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py index 3edd40c..a16d84a 100644 --- a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py +++ b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py @@ -9,7 +9,7 @@ and identify important moments without watching the entire video. 
import ffmpeg, os from loguru import logger -from auto_archiver.enrichers import Enricher +from auto_archiver.base_modules import Enricher from auto_archiver.core import Media, Metadata, ArchivingContext from auto_archiver.utils.misc import random_str diff --git a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py index a9cf753..473f880 100644 --- a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py +++ b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py @@ -8,9 +8,9 @@ from certvalidator import CertificateValidator, ValidationContext from asn1crypto import pem import certifi -from auto_archiver.enrichers import Enricher +from auto_archiver.base_modules import Enricher from auto_archiver.core import Metadata, ArchivingContext, Media -from auto_archiver.archivers import Archiver +from auto_archiver.base_modules import Extractor class TimestampingEnricher(Enricher): diff --git a/src/auto_archiver/modules/twitter_api_extractor/__init__.py b/src/auto_archiver/modules/twitter_api_extractor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py b/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py similarity index 89% rename from src/auto_archiver/modules/twitter_api_archiver/__manifest__.py rename to src/auto_archiver/modules/twitter_api_extractor/__manifest__.py index 5dc7364..ae1b0ff 100644 --- a/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py +++ b/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py @@ -1,7 +1,6 @@ { - "name": "Twitter API Archiver", + "name": "Twitter API Extractor", "type": ["extractor"], - "entry_point": "twitter_api_archiver:TwitterApiArchiver", "requires_setup": True, "external_dependencies": { "python": ["requests", @@ -20,7 +19,7 @@ "access_secret": {"default": None, "help": "twitter API 
access_secret"}, }, "description": """ - The `TwitterApiArchiver` fetches tweets and associated media using the Twitter API. + The `TwitterApiExtractor` fetches tweets and associated media using the Twitter API. It supports multiple API configurations for extended rate limits and reliable access. Features include URL expansion, media downloads (e.g., images, videos), and structured output via `Metadata` and `Media` objects. Requires Twitter API credentials such as bearer tokens @@ -34,7 +33,7 @@ - Outputs structured metadata and media using `Metadata` and `Media` objects. ### Setup - To use the `TwitterApiArchiver`, you must provide valid Twitter API credentials via configuration: + To use the `TwitterApiExtractor`, you must provide valid Twitter API credentials via configuration: - **Bearer Token(s)**: A single token or a list for rate-limited API access. - **Consumer Key and Secret**: Required for user-authenticated API access. - **Access Token and Secret**: Complements the consumer key for enhanced API capabilities. 
diff --git a/src/auto_archiver/modules/twitter_api_archiver/twitter_api_archiver.py b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_archiver.py similarity index 97% rename from src/auto_archiver/modules/twitter_api_archiver/twitter_api_archiver.py rename to src/auto_archiver/modules/twitter_api_extractor/twitter_api_archiver.py index 9c931ef..c5d03e0 100644 --- a/src/auto_archiver/modules/twitter_api_archiver/twitter_api_archiver.py +++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_archiver.py @@ -8,11 +8,11 @@ from loguru import logger from pytwitter import Api from slugify import slugify -from auto_archiver.archivers import Archiver +from auto_archiver.base_modules import Extractor from auto_archiver.core import Metadata,Media -class TwitterApiArchiver(Archiver): - name = "twitter_api_archiver" +class TwitterApiExtractor(Extractor): + name = "twitter_api_extractor" link_pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") def __init__(self, config: dict) -> None: diff --git a/src/auto_archiver/modules/vk_extractor/__init__.py b/src/auto_archiver/modules/vk_extractor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/auto_archiver/modules/vk_archiver/__manifest__.py b/src/auto_archiver/modules/vk_extractor/__manifest__.py similarity index 90% rename from src/auto_archiver/modules/vk_archiver/__manifest__.py rename to src/auto_archiver/modules/vk_extractor/__manifest__.py index 69bf162..bdcaf99 100644 --- a/src/auto_archiver/modules/vk_archiver/__manifest__.py +++ b/src/auto_archiver/modules/vk_extractor/__manifest__.py @@ -1,7 +1,6 @@ { - "name": "VKontakte Archiver", + "name": "VKontakte Extractor", "type": ["extractor"], - "entry_point": "vk_archiver:VKArchiver", "requires_setup": True, "depends": ["core", "utils"], "external_dependencies": { @@ -14,7 +13,7 @@ "session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"}, }, "description": """ -The 
`VkArchiver` fetches posts, text, and images from VK (VKontakte) social media pages. +The `VkExtractor` fetches posts, text, and images from VK (VKontakte) social media pages. This archiver is specialized for `/wall` posts and uses the `VkScraper` library to extract and download content. Note that VK videos are handled separately by the `YTDownloader`. diff --git a/src/auto_archiver/modules/vk_archiver/vk_archiver.py b/src/auto_archiver/modules/vk_extractor/vk_archiver.py similarity index 93% rename from src/auto_archiver/modules/vk_archiver/vk_archiver.py rename to src/auto_archiver/modules/vk_extractor/vk_archiver.py index 7ba7a68..2474769 100644 --- a/src/auto_archiver/modules/vk_archiver/vk_archiver.py +++ b/src/auto_archiver/modules/vk_extractor/vk_archiver.py @@ -2,16 +2,16 @@ from loguru import logger from vk_url_scraper import VkScraper from auto_archiver.utils.misc import dump_payload -from auto_archiver.archivers import Archiver +from auto_archiver.base_modules import Extractor from auto_archiver.core import Metadata, Media, ArchivingContext -class VkArchiver(Archiver): +class VkExtractor(Extractor): """" VK videos are handled by YTDownloader, this archiver gets posts text and images. 
Currently only works for /wall posts """ - name = "vk_archiver" + name = "vk_extractor" def __init__(self, config: dict) -> None: super().__init__(config) diff --git a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py index 124382b..3eb2b17 100644 --- a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py +++ b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py @@ -6,12 +6,11 @@ from loguru import logger from warcio.archiveiterator import ArchiveIterator from auto_archiver.core import Media, Metadata, ArchivingContext -from auto_archiver.enrichers import Enricher -from auto_archiver.archivers import Archiver +from auto_archiver.base_modules import Extractor, Enricher from auto_archiver.utils import UrlUtil, random_str -class WaczArchiverEnricher(Enricher, Archiver): +class WaczExtractorEnricher(Enricher, Extractor): """ Uses https://github.com/webrecorder/browsertrix-crawler to generate a .WACZ archive of the URL If used with [profiles](https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles) diff --git a/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py b/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py index 8ddec82..bcd2450 100644 --- a/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py +++ b/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py @@ -2,12 +2,11 @@ import json from loguru import logger import time, requests -from auto_archiver.enrichers import Enricher -from auto_archiver.archivers import Archiver +from auto_archiver.base_modules import Extractor, Enricher from auto_archiver.utils import UrlUtil from auto_archiver.core import Metadata -class WaybackArchiverEnricher(Enricher, Archiver): +class WaybackExtractorEnricher(Enricher, Extractor): """ Submits the current URL to the webarchive and returns a job_id or completed archive. 
diff --git a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py index f6294f3..a00ba25 100644 --- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py +++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py @@ -2,9 +2,9 @@ import traceback import requests, time from loguru import logger -from auto_archiver.enrichers import Enricher +from auto_archiver.base_modules import Enricher from auto_archiver.core import Metadata, Media, ArchivingContext -from auto_archiver.storages import S3Storage +from auto_archiver.modules import S3Storage class WhisperEnricher(Enricher): diff --git a/src/auto_archiver/storages/__init__.py b/src/auto_archiver/storages/__init__.py deleted file mode 100644 index 0765833..0000000 --- a/src/auto_archiver/storages/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -""" This module contains the storage classes for the auto-archiver. - -""" \ No newline at end of file diff --git a/tests/archivers/test_archiver_base.py b/tests/archivers/test_archiver_base.py index d793706..721812a 100644 --- a/tests/archivers/test_archiver_base.py +++ b/tests/archivers/test_archiver_base.py @@ -1,9 +1,7 @@ import pytest -from auto_archiver.core import Metadata -from auto_archiver.core import Step from auto_archiver.core.metadata import Metadata -from auto_archiver.archivers.archiver import Archiver +from auto_archiver.base_modules.extractor import Extractor class TestArchiverBase(object): archiver_class: str = None @@ -13,7 +11,7 @@ class TestArchiverBase(object): def setup_archiver(self): assert self.archiver_class is not None, "self.archiver_class must be set on the subclass" assert self.config is not None, "self.config must be a dict set on the subclass" - self.archiver: Archiver = self.archiver_class({self.archiver_class.name: self.config}) + self.archiver: Extractor = self.archiver_class({self.archiver_class.name: self.config}) def assertValidResponseMetadata(self, 
test_response: Metadata, title: str, timestamp: str, status: str = ""): assert test_response is not False diff --git a/tests/formatters/test_html_formatter.py b/tests/formatters/test_html_formatter.py index 3540062..2719033 100644 --- a/tests/formatters/test_html_formatter.py +++ b/tests/formatters/test_html_formatter.py @@ -1,5 +1,4 @@ -from auto_archiver.core.context import ArchivingContext -from auto_archiver.formatters.html_formatter import HtmlFormatter +from auto_archiver.modules.html_formatter import HtmlFormatter from auto_archiver.core import Metadata, Media