Refactoring for new config setup

2026-06-12 21:28:29 +03:00 · 2025-01-27 19:03:02 +00:00
parent e3074013d0
commit e1a9373336
52 changed files with 219 additions and 242 deletions
--- a/src/auto_archiver/modules/api_db/api_db.py
+++ b/src/auto_archiver/modules/api_db/api_db.py
@@ -2,7 +2,7 @@ from typing import Union
 import requests, os
 from loguru import logger

-from auto_archiver.base_processors import Database
+from auto_archiver.core import Database
 from auto_archiver.core import Metadata


@@ -10,7 +10,6 @@ class AAApiDb(Database):
    """
        Connects to auto-archiver-api instance
    """
-    name = "auto_archiver_api_db"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
--- a/src/auto_archiver/modules/atlos/atlos.py
+++ b/src/auto_archiver/modules/atlos/atlos.py
@@ -5,15 +5,11 @@ import requests
 import hashlib

 from auto_archiver.core import Media, Metadata
-from auto_archiver.base_processors import Storage
+from auto_archiver.core import Storage
 from auto_archiver.utils import get_atlos_config_options


 class AtlosStorage(Storage):
-    name = "atlos_storage"
-
-    def __init__(self, config: dict) -> None:
-        super().__init__(config)

    def get_cdn_url(self, _media: Media) -> str:
        # It's not always possible to provide an exact URL, because it's
--- a/src/auto_archiver/modules/atlos_db/atlos_db.py
+++ b/src/auto_archiver/modules/atlos_db/atlos_db.py
@@ -6,7 +6,7 @@ from csv import DictWriter
 from dataclasses import asdict
 import requests

-from auto_archiver.base_processors import Database
+from auto_archiver.core import Database
 from auto_archiver.core import Metadata
 from auto_archiver.utils import get_atlos_config_options

@@ -16,12 +16,6 @@ class AtlosDb(Database):
    Outputs results to Atlos
    """

-    name = "atlos_db"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-
    def failed(self, item: Metadata, reason: str) -> None:
        """Update DB accordingly for failure"""
        # If the item has no Atlos ID, there's nothing for us to do
--- a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py
+++ b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py
@@ -1,13 +1,12 @@
 from loguru import logger
 import requests

-from auto_archiver.base_processors import Feeder
+from auto_archiver.core import Feeder
 from auto_archiver.core import Metadata, ArchivingContext
 from auto_archiver.utils import get_atlos_config_options


 class AtlosFeeder(Feeder):
-    name = "atlos_feeder"

    def __init__(self, config: dict) -> None:
        # without this STEP.__init__ is not called
--- a/src/auto_archiver/modules/cli_feeder/cli_feeder.py
+++ b/src/auto_archiver/modules/cli_feeder/cli_feeder.py
@@ -1,6 +1,6 @@
 from loguru import logger

-from auto_archiver.base_processors import Feeder
+from auto_archiver.core import Feeder
 from auto_archiver.core import Metadata, ArchivingContext


--- a/src/auto_archiver/modules/console_db/console_db.py
+++ b/src/auto_archiver/modules/console_db/console_db.py
@@ -1,6 +1,6 @@
 from loguru import logger

-from auto_archiver.base_processors import Database
+from auto_archiver.core import Database
 from auto_archiver.core import Metadata


--- a/src/auto_archiver/modules/csv_db/csv_db.py
+++ b/src/auto_archiver/modules/csv_db/csv_db.py
@@ -3,7 +3,7 @@ from loguru import logger
 from csv import DictWriter
 from dataclasses import asdict

-from auto_archiver.base_processors import Database
+from auto_archiver.core import Database
 from auto_archiver.core import Metadata


--- a/src/auto_archiver/modules/csv_feeder/csv_feeder.py
+++ b/src/auto_archiver/modules/csv_feeder/csv_feeder.py
@@ -1,14 +1,12 @@
 from loguru import logger
 import csv

-from auto_archiver.base_processors import Feeder
+from auto_archiver.core import Feeder
 from auto_archiver.core import Metadata, ArchivingContext
 from auto_archiver.utils import url_or_none

 class CSVFeeder(Feeder):

-    name = "csv_feeder"
-
    def __iter__(self) -> Metadata:
        url_column = self.column or 0
        for file in self.files:
--- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
+++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
@@ -10,11 +10,10 @@ from google.oauth2.credentials import Credentials
 from google.auth.transport.requests import Request

 from auto_archiver.core import Media
-from auto_archiver.base_processors import Storage
+from auto_archiver.core import Storage


 class GDriveStorage(Storage):
-    name = "gdrive_storage"

    def __init__(self, config: dict) -> None:
        super().__init__(config)
--- a/src/auto_archiver/modules/generic_extractor/manifest.py
+++ b/src/auto_archiver/modules/generic_extractor/manifest.py
@@ -1,13 +1,13 @@
 {
-    'name': 'Generic Extractor',
-    'version': '0.1.0',
-    'author': 'Bellingcat',
-    'type': ['extractor'],
-    'requires_setup': False,
-    'dependencies': {
-        'python': ['yt_dlp', 'requests', 'loguru', 'slugify'],
+    "name": "Generic Extractor",
+    "version": "0.1.0",
+    "author": "Bellingcat",
+    "type": ["extractor"],
+    "requires_setup": False,
+    "dependencies": {
+        "python": ["yt_dlp", "requests", "loguru", "slugify"],
    },
-    'description': """
+    "description": """
 This is the generic extractor used by auto-archiver, which uses `yt-dlp` under the hood.

 This module is responsible for downloading and processing media content from platforms
@@ -28,17 +28,53 @@ the broader archiving framework.
 custom dropins can be created to handle additional websites and passed to the archiver
 via the command line using the `--dropins` option (TODO!).
 """,
-    'configs': {
-            "facebook_cookie": {"default": None, "help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'"},
-            "subtitles": {"default": True, "help": "download subtitles if available"},
-            "comments": {"default": False, "help": "download all comments if available, may lead to large metadata"},
-            "livestreams": {"default": False, "help": "if set, will download live streams, otherwise will skip them; see --max-filesize for more control"},
-            "live_from_start": {"default": False, "help": "if set, will download live streams from their earliest available moment, otherwise starts now."},
-            "proxy": {"default": "", "help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port"},
-            "end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."},
-            'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."},
-            "max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
-            "cookies_from_browser": {"default": None, 'type': 'str', "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"},
-            "cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"},
-        }
-}
+    "configs": {
+        "facebook_cookie": {
+            "default": None,
+            "help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'",
+        },
+        "subtitles": {"default": True, "help": "download subtitles if available", "type": "bool"},
+        "comments": {
+            "default": False,
+            "help": "download all comments if available, may lead to large metadata",
+            "type": "bool",
+        },
+        "livestreams": {
+            "default": False,
+            "help": "if set, will download live streams, otherwise will skip them; see --max-filesize for more control",
+            "type": "bool",
+        },
+        "live_from_start": {
+            "default": False,
+            "help": "if set, will download live streams from their earliest available moment, otherwise starts now.",
+            "type": "bool",
+        },
+        "proxy": {
+            "default": "",
+            "help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port",
+        },
+        "end_means_success": {
+            "default": True,
+            "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve.",
+            "type": "bool",
+        },
+        "allow_playlist": {
+            "default": False,
+            "help": "If True will also download playlists, set to False if the expectation is to download a single video.",
+            "type": "bool",
+        },
+        "max_downloads": {
+            "default": "inf",
+            "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.",
+        },
+        "cookies_from_browser": {
+            "default": None,
+            "type": "str",
+            "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale",
+        },
+        "cookie_file": {
+            "default": None,
+            "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp",
+        },
+    },
+}
--- a/src/auto_archiver/modules/generic_extractor/bluesky.py
+++ b/src/auto_archiver/modules/generic_extractor/bluesky.py
@@ -1,6 +1,6 @@
 from loguru import logger

-from auto_archiver.base_processors.extractor import Extractor
+from auto_archiver.core.extractor import Extractor
 from auto_archiver.core.metadata import Metadata, Media
 from .dropin import GenericDropin, InfoExtractor

--- a/src/auto_archiver/modules/generic_extractor/dropin.py
+++ b/src/auto_archiver/modules/generic_extractor/dropin.py
@@ -1,6 +1,6 @@
 from yt_dlp.extractor.common import InfoExtractor
 from auto_archiver.core.metadata import Metadata
-from auto_archiver.base_processors.extractor import Extractor
+from auto_archiver.core.extractor import Extractor

 class GenericDropin:
    """Base class for dropins for the generic extractor.
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -5,11 +5,10 @@ from yt_dlp.extractor.common import InfoExtractor

 from loguru import logger

-from auto_archiver.base_processors.extractor import Extractor
+from auto_archiver.core.extractor import Extractor
 from ...core import Metadata, Media, ArchivingContext

 class GenericExtractor(Extractor):
-    name = "youtubedl_archiver" #left as is for backwards compat
    _dropins = {}

    def suitable_extractors(self, url: str) -> list[str]:
@@ -268,7 +267,7 @@ class GenericExtractor(Extractor):
        if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:
            logger.debug('Using Facebook cookie')
            yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
-        
+
        ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}

        if item.netloc in ['youtube.com', 'www.youtube.com']:
@@ -285,6 +284,6 @@ class GenericExtractor(Extractor):
            result = self.download_for_extractor(info_extractor, url, ydl)
            if result:
                return result
-       
+

        return False
--- a/src/auto_archiver/modules/generic_extractor/truth.py
+++ b/src/auto_archiver/modules/generic_extractor/truth.py
@@ -2,7 +2,7 @@ from typing import Type

 from auto_archiver.utils import traverse_obj
 from auto_archiver.core.metadata import Metadata, Media
-from auto_archiver.base_processors.extractor import Extractor
+from auto_archiver.core.extractor import Extractor
 from yt_dlp.extractor.common import InfoExtractor

 from dateutil.parser import parse as parse_dt
--- a/src/auto_archiver/modules/generic_extractor/twitter.py
+++ b/src/auto_archiver/modules/generic_extractor/twitter.py
@@ -6,7 +6,7 @@ from slugify import slugify

 from auto_archiver.core.metadata import Metadata, Media
 from auto_archiver.utils import UrlUtil
-from auto_archiver.base_processors.extractor import Extractor
+from auto_archiver.core.extractor import Extractor

 from .dropin import GenericDropin, InfoExtractor

--- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py
+++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py
@@ -5,7 +5,7 @@ from urllib.parse import quote

 from loguru import logger

-from auto_archiver.base_processors import Database
+from auto_archiver.core import Database
 from auto_archiver.core import Metadata, Media, ArchivingContext
 from auto_archiver.modules.gsheet_feeder import GWorksheet

@@ -15,11 +15,6 @@ class GsheetsDb(Database):
        NB: only works if GsheetFeeder is used. 
        could be updated in the future to support non-GsheetFeeder metadata 
    """
-    name = "gsheet_db"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)

    def started(self, item: Metadata) -> None:
        logger.warning(f"STARTED {item}")
--- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
+++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
@@ -14,13 +14,12 @@ import gspread
 from loguru import logger
 from slugify import slugify

-from auto_archiver.base_processors import Feeder
+from auto_archiver.core import Feeder
 from auto_archiver.core import Metadata, ArchivingContext
 from . import GWorksheet


 class GsheetsFeeder(Feeder):
-    name = "gsheet_feeder"

    def __init__(self) -> None:
        """
--- a/src/auto_archiver/modules/hash_enricher/manifest.py
+++ b/src/auto_archiver/modules/hash_enricher/manifest.py
@@ -8,9 +8,9 @@
    "configs": {
            "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
            # TODO add non-negative requirement to match previous implementation?
-            "chunksize": {"default": 1.6e7,
+            "chunksize": {"default": 16000000,
                          "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB",
-                          'type': 'positive_number',
+                          'type': 'int',
                          },
        },
    "description": """
--- a/src/auto_archiver/modules/hash_enricher/hash_enricher.py
+++ b/src/auto_archiver/modules/hash_enricher/hash_enricher.py
@@ -10,7 +10,7 @@ making it suitable for handling large files efficiently.
 import hashlib
 from loguru import logger

-from auto_archiver.base_processors import Enricher
+from auto_archiver.core import Enricher
 from auto_archiver.core import Metadata, ArchivingContext


@@ -19,6 +19,17 @@ class HashEnricher(Enricher):
    Calculates hashes for Media instances
    """

+    def __init__(self, config: dict = None):
+        """
+        Build the enricher from an optional config dict; a missing or empty
+        config, or a missing key, falls back to SHA-256 and 16,000,000 bytes.
+        """
+        super().__init__()
+        # TODO set these from the manifest?
+        self.algorithm = (config or {}).get("algorithm", "SHA-256")
+        self.chunksize = (config or {}).get("chunksize", int(1.6e7))
+
+
    def enrich(self, to_enrich: Metadata) -> None:
        url = to_enrich.get_url()
        logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")
--- a/src/auto_archiver/modules/html_formatter/html_formatter.py
+++ b/src/auto_archiver/modules/html_formatter/html_formatter.py
@@ -9,24 +9,30 @@ import base64

 from auto_archiver.version import __version__
 from auto_archiver.core import Metadata, Media, ArchivingContext
-from auto_archiver.base_processors import Formatter
+from auto_archiver.core import Formatter
 from auto_archiver.modules.hash_enricher import HashEnricher
 from auto_archiver.utils.misc import random_str


@dataclass
 class HtmlFormatter(Formatter):
+    environment: Environment = None  # Jinja2 environment; stays None until setup() assigns it
+    template: any = None  # template loaded by setup(); NOTE(review): `any` is the builtin function, not typing.Any

-    # TODO: fix setting up template with new config method
-    # def __init__(self, config: dict) -> None:
-    #     # without this STEP.__init__ is not called
-    #     super().__init__(config)
-    #     self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")), autoescape=True)
-    #     # JinjaHelper class static methods are added as filters
-    #     self.environment.filters.update({
-    #         k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod)
-    #     })
-    #     self.template = self.environment.get_template("html_template.html")
+    def setup(self, config: dict) -> None:
+        """Run the base-class setup, then build the Jinja2 environment and load the template."""
+        super().setup(config)
+
+        # Templates are read from the "templates/" folder next to this module.
+        module_dir = pathlib.Path(__file__).parent.resolve()
+        loader = FileSystemLoader(os.path.join(module_dir, "templates/"))
+        self.environment = Environment(loader=loader, autoescape=True)
+
+        # Register every static method of JinjaHelpers as a template filter.
+        helpers = {k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod)}
+        self.environment.filters.update(helpers)
+        # self.config may name another template via "template_name"; default is "html_template.html".
+        self.template = self.environment.get_template(self.config.get("template_name", "html_template.html"))

    def format(self, item: Metadata) -> Media:
        url = item.get_url()
--- a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py
+++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py
@@ -16,7 +16,7 @@ from loguru import logger
 from retrying import retry
 from tqdm import tqdm

-from auto_archiver.base_processors import Extractor
+from auto_archiver.core import Extractor
 from auto_archiver.core import Media
 from auto_archiver.core import Metadata

@@ -28,8 +28,6 @@ class InstagramAPIExtractor(Extractor):
    # TODO: improvement collect aggregates of locations[0].location and mentions for all posts
    """

-    name = "instagram_api_extractor"
-
    global_pattern = re.compile(
        r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?"
    )
--- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
+++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
@@ -7,7 +7,7 @@ import re, os, shutil, traceback
 import instaloader  # https://instaloader.github.io/as-module.html
 from loguru import logger

-from auto_archiver.base_processors import Extractor
+from auto_archiver.core import Extractor
 from auto_archiver.core import Metadata
 from auto_archiver.core import Media

@@ -15,8 +15,6 @@ class InstagramExtractor(Extractor):
    """
    Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
    """
-    name = "instagram_extractor"
-
    # NB: post regex should be tested before profile
    # https://regex101.com/r/MGPquX/1
    post_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(?:p|reel)\/(\w+)")
--- a/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
+++ b/src/auto_archiver/modules/instagram_tbot_extractor/instagram_tbot_extractor.py
@@ -15,7 +15,7 @@ from sqlite3 import OperationalError
 from loguru import logger
 from telethon.sync import TelegramClient

-from auto_archiver.base_processors import Extractor
+from auto_archiver.core import Extractor
 from auto_archiver.core import Metadata, Media, ArchivingContext
 from auto_archiver.utils import random_str

@@ -26,13 +26,6 @@ class InstagramTbotExtractor(Extractor):
    https://github.com/adw0rd/instagrapi
    https://t.me/instagram_load_bot
    """
-    name = "instagram_tbot_extractor"
-
-    def __init__(self, config: dict) -> None:
-        super().__init__(config)
-        self.assert_valid_string("api_id")
-        self.assert_valid_string("api_hash")
-        self.timeout = int(self.timeout)

    def setup(self) -> None:
        """
--- a/src/auto_archiver/modules/local_storage/local_storage.py
+++ b/src/auto_archiver/modules/local_storage/local_storage.py
@@ -5,17 +5,12 @@ import os
 from loguru import logger

 from auto_archiver.core import Media
-from auto_archiver.base_processors import Storage
+from auto_archiver.core import Storage


 class LocalStorage(Storage):
    name = "local_storage"

-    def __init__(self) -> None:
-        super().__init__()
-        # TODO: fix up passing config values to 'steps'
-        # os.makedirs(self.save_to, exist_ok=True)
-
    def get_cdn_url(self, media: Media) -> str:
        # TODO: is this viable with Storage.configs on path/filename?
        dest = os.path.join(self.save_to, media.key)
--- a/src/auto_archiver/modules/meta_enricher/meta_enricher.py
+++ b/src/auto_archiver/modules/meta_enricher/meta_enricher.py
@@ -2,7 +2,7 @@ import datetime
 import os
 from loguru import logger

-from auto_archiver.base_processors import Enricher
+from auto_archiver.core import Enricher
 from auto_archiver.core import Metadata


--- a/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py
+++ b/src/auto_archiver/modules/metadata_enricher/metadata_enricher.py
@@ -2,7 +2,7 @@ import subprocess
 import traceback
 from loguru import logger

-from auto_archiver.base_processors import Enricher
+from auto_archiver.core import Enricher
 from auto_archiver.core import Metadata


--- a/src/auto_archiver/modules/mute_formatter/mute_formatter.py
+++ b/src/auto_archiver/modules/mute_formatter/mute_formatter.py
@@ -2,11 +2,10 @@ from __future__ import annotations
 from dataclasses import dataclass

 from auto_archiver.core import Metadata, Media
-from auto_archiver.base_processors import Formatter
+from auto_archiver.core import Formatter


@dataclass
 class MuteFormatter(Formatter):
-    name = "mute_formatter"

    def format(self, item: Metadata) -> Media: return None
--- a/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py
+++ b/src/auto_archiver/modules/pdq_hash_enricher/pdq_hash_enricher.py
@@ -16,7 +16,7 @@ import numpy as np
 from PIL import Image, UnidentifiedImageError
 from loguru import logger

-from auto_archiver.base_processors import Enricher
+from auto_archiver.core import Enricher
 from auto_archiver.core import Metadata


--- a/src/auto_archiver/modules/s3_storage/s3.py
+++ b/src/auto_archiver/modules/s3_storage/s3.py
@@ -4,14 +4,13 @@ import boto3, os

 from auto_archiver.utils.misc import random_str
 from auto_archiver.core import Media
-from auto_archiver.base_processors import Storage
-# TODO
+from auto_archiver.core import Storage
+
 from auto_archiver.modules.hash_enricher import HashEnricher
 from loguru import logger

 NO_DUPLICATES_FOLDER = "no-dups/"
 class S3Storage(Storage):
-    name = "s3_storage"

    def __init__(self, config: dict) -> None:
        super().__init__(config)
--- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
+++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
@@ -5,15 +5,11 @@ import base64
 from selenium.common.exceptions import TimeoutException


-from auto_archiver.base_processors import Enricher
+from auto_archiver.core import Enricher
 from auto_archiver.utils import Webdriver, UrlUtil, random_str
 from auto_archiver.core import Media, Metadata, ArchivingContext

 class ScreenshotEnricher(Enricher):
-    name = "screenshot_enricher"
-
-    def __init__(self, config: dict) -> None:
-        super().__init__(config)

    def enrich(self, to_enrich: Metadata) -> None:
        url = to_enrich.get_url()
--- a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py
+++ b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py
@@ -3,7 +3,7 @@ from slugify import slugify
 from urllib.parse import urlparse
 from loguru import logger

-from auto_archiver.base_processors import Enricher
+from auto_archiver.core import Enricher
 from auto_archiver.core import Metadata, ArchivingContext, Media


--- a/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py
+++ b/src/auto_archiver/modules/telegram_extractor/telegram_extractor.py
@@ -2,7 +2,7 @@ import requests, re, html
 from bs4 import BeautifulSoup
 from loguru import logger

-from auto_archiver.base_processors import Extractor
+from auto_archiver.core import Extractor
 from auto_archiver.core import Metadata, Media


--- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
+++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
@@ -8,21 +8,15 @@ from loguru import logger
 from tqdm import tqdm
 import re, time, json, os

-from auto_archiver.base_processors import Extractor
+from auto_archiver.core import Extractor
 from auto_archiver.core import Metadata, Media, ArchivingContext
 from auto_archiver.utils import random_str


 class TelethonArchiver(Extractor):
-    name = "telethon_extractor"
    link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
    invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")

-    def __init__(self, config: dict) -> None:
-        super().__init__(config)
-        self.assert_valid_string("api_id")
-        self.assert_valid_string("api_hash")
-

    def setup(self) -> None:
        """
--- a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py
+++ b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py
@@ -9,7 +9,7 @@ and identify important moments without watching the entire video.
 import ffmpeg, os
 from loguru import logger

-from auto_archiver.base_processors import Enricher
+from auto_archiver.core import Enricher
 from auto_archiver.core import Media, Metadata, ArchivingContext
 from auto_archiver.utils.misc import random_str

@@ -42,7 +42,7 @@ class ThumbnailEnricher(Enricher):
                        logger.error(f"error getting duration of video {m.filename}: {e}")
                        return

-                num_thumbs = int(min(max(1, duration * self.thumbnails_per_second), self.max_thumbnails))
+                num_thumbs = int(min(max(1, duration / 60 * self.thumbnails_per_minute), self.max_thumbnails))  # per-minute rate: assumes duration is in seconds — confirm
                timestamps = [duration / (num_thumbs + 1) * i for i in range(1, num_thumbs + 1)]

                thumbnails_media = []
--- a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py
+++ b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py
@@ -8,9 +8,9 @@ from certvalidator import CertificateValidator, ValidationContext
 from asn1crypto import pem
 import certifi

-from auto_archiver.base_processors import Enricher
+from auto_archiver.core import Enricher
 from auto_archiver.core import Metadata, ArchivingContext, Media
-from auto_archiver.base_processors import Extractor
+from auto_archiver.core import Extractor


 class TimestampingEnricher(Enricher):
@@ -21,10 +21,6 @@ class TimestampingEnricher(Enricher):

    See https://gist.github.com/Manouchehri/fd754e402d98430243455713efada710 for list of timestamp authorities.
    """
-    name = "timestamping_enricher"
-
-    def __init__(self, config: dict) -> None:
-        super().__init__(config)

    def enrich(self, to_enrich: Metadata) -> None:
        url = to_enrich.get_url()
--- a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py
+++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py
@@ -8,11 +8,10 @@ from loguru import logger
 from pytwitter import Api
 from slugify import slugify

-from auto_archiver.base_processors import Extractor
+from auto_archiver.core import Extractor
 from auto_archiver.core import Metadata,Media

 class TwitterApiExtractor(Extractor):
-    name = "twitter_api_extractor"
    link_pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")

    def __init__(self, config: dict) -> None:
--- a/src/auto_archiver/modules/vk_extractor/vk_extractor.py
+++ b/src/auto_archiver/modules/vk_extractor/vk_extractor.py
@@ -2,7 +2,7 @@ from loguru import logger
 from vk_url_scraper import VkScraper

 from auto_archiver.utils.misc import dump_payload
-from auto_archiver.base_processors import Extractor
+from auto_archiver.core import Extractor
 from auto_archiver.core import Metadata, Media, ArchivingContext


@@ -11,7 +11,6 @@ class VkExtractor(Extractor):
    VK videos are handled by YTDownloader, this archiver gets posts text and images.
    Currently only works for /wall posts
    """
-    name = "vk_extractor"

    def __init__(self, config: dict) -> None:
        super().__init__(config)
--- a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
+++ b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
@@ -6,7 +6,7 @@ from loguru import logger
 from warcio.archiveiterator import ArchiveIterator

 from auto_archiver.core import Media, Metadata, ArchivingContext
-from auto_archiver.base_processors import Extractor, Enricher
+from auto_archiver.core import Extractor, Enricher
 from auto_archiver.utils import UrlUtil, random_str


@@ -17,11 +17,6 @@ class WaczExtractorEnricher(Enricher, Extractor):
    it can become quite powerful for archiving private content.
    When used as an archiver it will extract the media from the .WACZ archive so it can be enriched.
    """
-    name = "wacz_archiver_enricher"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)

    def setup(self) -> None:
        self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER')
--- a/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py
+++ b/src/auto_archiver/modules/wayback_enricher/wayback_enricher.py
@@ -2,7 +2,7 @@ import json
 from loguru import logger
 import time, requests

-from auto_archiver.base_processors import Extractor, Enricher
+from auto_archiver.core import Extractor, Enricher
 from auto_archiver.utils import UrlUtil
 from auto_archiver.core import Metadata

@@ -12,13 +12,6 @@ class WaybackExtractorEnricher(Enricher, Extractor):

    The Wayback machine will rate-limit IP heavy usage. 
    """
-    name = "wayback_archiver_enricher"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-        assert type(self.secret) == str and len(self.secret) > 0, "please provide a value for the wayback_enricher API key"
-        assert type(self.secret) == str and len(self.secret) > 0, "please provide a value for the wayback_enricher API secret"

    def download(self, item: Metadata) -> Metadata:
        # this new Metadata object is required to avoid duplication
--- a/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py
+++ b/src/auto_archiver/modules/whisper_enricher/whisper_enricher.py
@@ -2,7 +2,7 @@ import traceback
 import requests, time
 from loguru import logger

-from auto_archiver.base_processors import Enricher
+from auto_archiver.core import Enricher
 from auto_archiver.core import Metadata, Media, ArchivingContext
 from auto_archiver.modules.s3_storage import S3Storage

@@ -13,14 +13,6 @@ class WhisperEnricher(Enricher):
    whisper API repository: https://github.com/bellingcat/whisperbox-transcribe/
    Only works if an S3 compatible storage is used
    """
-    name = "whisper_enricher"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-        assert type(self.api_endpoint) == str and len(self.api_endpoint) > 0, "please provide a value for the whisper_enricher api_endpoint"
-        assert type(self.api_key) == str and len(self.api_key) > 0, "please provide a value for the whisper_enricher api_key"
-        self.timeout = int(self.timeout)

    def enrich(self, to_enrich: Metadata) -> None:
        if not self._get_s3_storage():