Refactoring for new config setup

This commit is contained in:
erinhmclark
2025-01-27 19:03:02 +00:00
parent e3074013d0
commit e1a9373336
52 changed files with 219 additions and 242 deletions

View File

@@ -2,7 +2,7 @@ from typing import Union
import requests, os
from loguru import logger
from auto_archiver.base_processors import Database
from auto_archiver.core import Database
from auto_archiver.core import Metadata
@@ -10,7 +10,6 @@ class AAApiDb(Database):
"""
Connects to auto-archiver-api instance
"""
name = "auto_archiver_api_db"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called

View File

@@ -5,15 +5,11 @@ import requests
import hashlib
from auto_archiver.core import Media, Metadata
from auto_archiver.base_processors import Storage
from auto_archiver.core import Storage
from auto_archiver.utils import get_atlos_config_options
class AtlosStorage(Storage):
name = "atlos_storage"
def __init__(self, config: dict) -> None:
super().__init__(config)
def get_cdn_url(self, _media: Media) -> str:
# It's not always possible to provide an exact URL, because it's

View File

@@ -6,7 +6,7 @@ from csv import DictWriter
from dataclasses import asdict
import requests
from auto_archiver.base_processors import Database
from auto_archiver.core import Database
from auto_archiver.core import Metadata
from auto_archiver.utils import get_atlos_config_options
@@ -16,12 +16,6 @@ class AtlosDb(Database):
Outputs results to Atlos
"""
name = "atlos_db"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
def failed(self, item: Metadata, reason: str) -> None:
"""Update DB accordingly for failure"""
# If the item has no Atlos ID, there's nothing for us to do

View File

@@ -1,13 +1,12 @@
from loguru import logger
import requests
from auto_archiver.base_processors import Feeder
from auto_archiver.core import Feeder
from auto_archiver.core import Metadata, ArchivingContext
from auto_archiver.utils import get_atlos_config_options
class AtlosFeeder(Feeder):
name = "atlos_feeder"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called

View File

@@ -1,6 +1,6 @@
from loguru import logger
from auto_archiver.base_processors import Feeder
from auto_archiver.core import Feeder
from auto_archiver.core import Metadata, ArchivingContext

View File

@@ -1,6 +1,6 @@
from loguru import logger
from auto_archiver.base_processors import Database
from auto_archiver.core import Database
from auto_archiver.core import Metadata

View File

@@ -3,7 +3,7 @@ from loguru import logger
from csv import DictWriter
from dataclasses import asdict
from auto_archiver.base_processors import Database
from auto_archiver.core import Database
from auto_archiver.core import Metadata

View File

@@ -1,14 +1,12 @@
from loguru import logger
import csv
from auto_archiver.base_processors import Feeder
from auto_archiver.core import Feeder
from auto_archiver.core import Metadata, ArchivingContext
from auto_archiver.utils import url_or_none
class CSVFeeder(Feeder):
name = "csv_feeder"
def __iter__(self) -> Metadata:
url_column = self.column or 0
for file in self.files:

View File

@@ -10,11 +10,10 @@ from google.oauth2.credentials import Credentials
from google.auth.transport.requests import Request
from auto_archiver.core import Media
from auto_archiver.base_processors import Storage
from auto_archiver.core import Storage
class GDriveStorage(Storage):
name = "gdrive_storage"
def __init__(self, config: dict) -> None:
super().__init__(config)

View File

@@ -1,13 +1,13 @@
{
'name': 'Generic Extractor',
'version': '0.1.0',
'author': 'Bellingcat',
'type': ['extractor'],
'requires_setup': False,
'dependencies': {
'python': ['yt_dlp', 'requests', 'loguru', 'slugify'],
"name": "Generic Extractor",
"version": "0.1.0",
"author": "Bellingcat",
"type": ["extractor"],
"requires_setup": False,
"dependencies": {
"python": ["yt_dlp", "requests", "loguru", "slugify"],
},
'description': """
"description": """
This is the generic extractor used by auto-archiver, which uses `yt-dlp` under the hood.
This module is responsible for downloading and processing media content from platforms
@@ -28,17 +28,53 @@ the broader archiving framework.
custom dropins can be created to handle additional websites and passed to the archiver
via the command line using the `--dropins` option (TODO!).
""",
'configs': {
"facebook_cookie": {"default": None, "help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'"},
"subtitles": {"default": True, "help": "download subtitles if available"},
"comments": {"default": False, "help": "download all comments if available, may lead to large metadata"},
"livestreams": {"default": False, "help": "if set, will download live streams, otherwise will skip them; see --max-filesize for more control"},
"live_from_start": {"default": False, "help": "if set, will download live streams from their earliest available moment, otherwise starts now."},
"proxy": {"default": "", "help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port"},
"end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."},
'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."},
"max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
"cookies_from_browser": {"default": None, 'type': 'str', "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"},
"cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"},
}
}
"configs": {
"facebook_cookie": {
"default": None,
"help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'",
},
"subtitles": {"default": True, "help": "download subtitles if available", "type": "bool"},
"comments": {
"default": False,
"help": "download all comments if available, may lead to large metadata",
"type": "bool",
},
"livestreams": {
"default": False,
"help": "if set, will download live streams, otherwise will skip them; see --max-filesize for more control",
"type": "bool",
},
"live_from_start": {
"default": False,
"help": "if set, will download live streams from their earliest available moment, otherwise starts now.",
"type": "bool",
},
"proxy": {
"default": "",
"help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port",
},
"end_means_success": {
"default": True,
"help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve.",
"type": "bool",
},
"allow_playlist": {
"default": False,
"help": "If True will also download playlists, set to False if the expectation is to download a single video.",
"type": "bool",
},
"max_downloads": {
"default": "inf",
"help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.",
},
"cookies_from_browser": {
"default": None,
"type": "str",
"help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale",
},
"cookie_file": {
"default": None,
"help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp",
},
},
}

View File

@@ -1,6 +1,6 @@
from loguru import logger
from auto_archiver.base_processors.extractor import Extractor
from auto_archiver.core.extractor import Extractor
from auto_archiver.core.metadata import Metadata, Media
from .dropin import GenericDropin, InfoExtractor

View File

@@ -1,6 +1,6 @@
from yt_dlp.extractor.common import InfoExtractor
from auto_archiver.core.metadata import Metadata
from auto_archiver.base_processors.extractor import Extractor
from auto_archiver.core.extractor import Extractor
class GenericDropin:
"""Base class for dropins for the generic extractor.

View File

@@ -5,11 +5,10 @@ from yt_dlp.extractor.common import InfoExtractor
from loguru import logger
from auto_archiver.base_processors.extractor import Extractor
from auto_archiver.core.extractor import Extractor
from ...core import Metadata, Media, ArchivingContext
class GenericExtractor(Extractor):
name = "youtubedl_archiver" #left as is for backwards compat
_dropins = {}
def suitable_extractors(self, url: str) -> list[str]:
@@ -268,7 +267,7 @@ class GenericExtractor(Extractor):
if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:
logger.debug('Using Facebook cookie')
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
if item.netloc in ['youtube.com', 'www.youtube.com']:
@@ -285,6 +284,6 @@ class GenericExtractor(Extractor):
result = self.download_for_extractor(info_extractor, url, ydl)
if result:
return result
return False

View File

@@ -2,7 +2,7 @@ from typing import Type
from auto_archiver.utils import traverse_obj
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.base_processors.extractor import Extractor
from auto_archiver.core.extractor import Extractor
from yt_dlp.extractor.common import InfoExtractor
from dateutil.parser import parse as parse_dt

View File

@@ -6,7 +6,7 @@ from slugify import slugify
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.utils import UrlUtil
from auto_archiver.base_processors.extractor import Extractor
from auto_archiver.core.extractor import Extractor
from .dropin import GenericDropin, InfoExtractor

View File

@@ -5,7 +5,7 @@ from urllib.parse import quote
from loguru import logger
from auto_archiver.base_processors import Database
from auto_archiver.core import Database
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.modules.gsheet_feeder import GWorksheet
@@ -15,11 +15,6 @@ class GsheetsDb(Database):
NB: only works if GsheetFeeder is used.
could be updated in the future to support non-GsheetFeeder metadata
"""
name = "gsheet_db"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
def started(self, item: Metadata) -> None:
logger.warning(f"STARTED {item}")

View File

@@ -14,13 +14,12 @@ import gspread
from loguru import logger
from slugify import slugify
from auto_archiver.base_processors import Feeder
from auto_archiver.core import Feeder
from auto_archiver.core import Metadata, ArchivingContext
from . import GWorksheet
class GsheetsFeeder(Feeder):
name = "gsheet_feeder"
def __init__(self) -> None:
"""

View File

@@ -8,9 +8,9 @@
"configs": {
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
# TODO add non-negative requirement to match previous implementation?
"chunksize": {"default": 1.6e7,
"chunksize": {"default": 16000000,
"help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB",
'type': 'positive_number',
'type': 'int',
},
},
"description": """

View File

@@ -10,7 +10,7 @@ making it suitable for handling large files efficiently.
import hashlib
from loguru import logger
from auto_archiver.base_processors import Enricher
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, ArchivingContext
@@ -19,6 +19,17 @@ class HashEnricher(Enricher):
Calculates hashes for Media instances
"""
def __init__(self, config: dict = None):
    """
    Create the enricher and read its settings from ``config``.

    :param config: optional mapping that may carry ``"algorithm"`` and
        ``"chunksize"``; when it is ``None`` or empty both attributes fall
        back to the built-in defaults below.
    """
    super().__init__()
    # TODO set these from the manifest?
    # A missing or empty config behaves exactly like "no overrides given".
    settings = config or {}
    self.algorithm = settings.get("algorithm", "SHA-256")
    self.chunksize = settings.get("chunksize", int(1.6e7))
def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url()
logger.debug(f"calculating media hashes for {url=} (using {self.algorithm})")

View File

@@ -9,24 +9,30 @@ import base64
from auto_archiver.version import __version__
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.base_processors import Formatter
from auto_archiver.core import Formatter
from auto_archiver.modules.hash_enricher import HashEnricher
from auto_archiver.utils.misc import random_str
@dataclass
class HtmlFormatter(Formatter):
environment: Environment = None
template: any = None
# TODO: fix setting up template with new config method
# def __init__(self, config: dict) -> None:
# # without this STEP.__init__ is not called
# super().__init__(config)
# self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")), autoescape=True)
# # JinjaHelper class static methods are added as filters
# self.environment.filters.update({
# k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod)
# })
# self.template = self.environment.get_template("html_template.html")
def setup(self, config: dict) -> None:
    """
    Prepare the Jinja2 environment and load the HTML template.

    The base-class ``setup`` runs first. The environment then loads
    templates from the ``templates/`` directory next to this module, with
    autoescaping enabled, and every static method declared on
    ``JinjaHelpers`` is registered as a filter.

    :param config: configuration forwarded unchanged to the base class.
    """
    super().setup(config)  # base-class logic must run before anything else

    module_dir = pathlib.Path(__file__).parent.resolve()
    self.environment = Environment(
        loader=FileSystemLoader(os.path.join(module_dir, "templates/")),
        autoescape=True,
    )

    # JinjaHelpers static methods are exposed to templates as filters;
    # ``__func__`` unwraps the staticmethod object to the plain function.
    helper_filters = {}
    for attr_name, attr_value in JinjaHelpers.__dict__.items():
        if isinstance(attr_value, staticmethod):
            helper_filters[attr_name] = attr_value.__func__
    self.environment.filters.update(helper_filters)

    # NOTE(review): relies on ``self.config`` being available at this point,
    # presumably set by the base-class setup -- confirm.
    chosen_template = self.config.get("template_name", "html_template.html")
    self.template = self.environment.get_template(chosen_template)
def format(self, item: Metadata) -> Media:
url = item.get_url()

View File

@@ -16,7 +16,7 @@ from loguru import logger
from retrying import retry
from tqdm import tqdm
from auto_archiver.base_processors import Extractor
from auto_archiver.core import Extractor
from auto_archiver.core import Media
from auto_archiver.core import Metadata
@@ -28,8 +28,6 @@ class InstagramAPIExtractor(Extractor):
# TODO: improvement collect aggregates of locations[0].location and mentions for all posts
"""
name = "instagram_api_extractor"
global_pattern = re.compile(
r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?"
)

View File

@@ -7,7 +7,7 @@ import re, os, shutil, traceback
import instaloader # https://instaloader.github.io/as-module.html
from loguru import logger
from auto_archiver.base_processors import Extractor
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata
from auto_archiver.core import Media
@@ -15,8 +15,6 @@ class InstagramExtractor(Extractor):
"""
Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
"""
name = "instagram_extractor"
# NB: post regex should be tested before profile
# https://regex101.com/r/MGPquX/1
post_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(?:p|reel)\/(\w+)")

View File

@@ -15,7 +15,7 @@ from sqlite3 import OperationalError
from loguru import logger
from telethon.sync import TelegramClient
from auto_archiver.base_processors import Extractor
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.utils import random_str
@@ -26,13 +26,6 @@ class InstagramTbotExtractor(Extractor):
https://github.com/adw0rd/instagrapi
https://t.me/instagram_load_bot
"""
name = "instagram_tbot_extractor"
def __init__(self, config: dict) -> None:
super().__init__(config)
self.assert_valid_string("api_id")
self.assert_valid_string("api_hash")
self.timeout = int(self.timeout)
def setup(self) -> None:
"""

View File

@@ -5,17 +5,12 @@ import os
from loguru import logger
from auto_archiver.core import Media
from auto_archiver.base_processors import Storage
from auto_archiver.core import Storage
class LocalStorage(Storage):
name = "local_storage"
def __init__(self) -> None:
super().__init__()
# TODO: fix up passing config values to 'steps'
# os.makedirs(self.save_to, exist_ok=True)
def get_cdn_url(self, media: Media) -> str:
# TODO: is this viable with Storage.configs on path/filename?
dest = os.path.join(self.save_to, media.key)

View File

@@ -2,7 +2,7 @@ import datetime
import os
from loguru import logger
from auto_archiver.base_processors import Enricher
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata

View File

@@ -2,7 +2,7 @@ import subprocess
import traceback
from loguru import logger
from auto_archiver.base_processors import Enricher
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata

View File

@@ -2,11 +2,10 @@ from __future__ import annotations
from dataclasses import dataclass
from auto_archiver.core import Metadata, Media
from auto_archiver.base_processors import Formatter
from auto_archiver.core import Formatter
@dataclass
class MuteFormatter(Formatter):
name = "mute_formatter"
def format(self, item: Metadata) -> Media:
    """Produce no output: always returns ``None``, whatever ``item`` is."""
    return None

View File

@@ -16,7 +16,7 @@ import numpy as np
from PIL import Image, UnidentifiedImageError
from loguru import logger
from auto_archiver.base_processors import Enricher
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata

View File

@@ -4,14 +4,13 @@ import boto3, os
from auto_archiver.utils.misc import random_str
from auto_archiver.core import Media
from auto_archiver.base_processors import Storage
# TODO
from auto_archiver.core import Storage
from auto_archiver.modules.hash_enricher import HashEnricher
from loguru import logger
NO_DUPLICATES_FOLDER = "no-dups/"
class S3Storage(Storage):
name = "s3_storage"
def __init__(self, config: dict) -> None:
super().__init__(config)

View File

@@ -5,15 +5,11 @@ import base64
from selenium.common.exceptions import TimeoutException
from auto_archiver.base_processors import Enricher
from auto_archiver.core import Enricher
from auto_archiver.utils import Webdriver, UrlUtil, random_str
from auto_archiver.core import Media, Metadata, ArchivingContext
class ScreenshotEnricher(Enricher):
name = "screenshot_enricher"
def __init__(self, config: dict) -> None:
super().__init__(config)
def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url()

View File

@@ -3,7 +3,7 @@ from slugify import slugify
from urllib.parse import urlparse
from loguru import logger
from auto_archiver.base_processors import Enricher
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, ArchivingContext, Media

View File

@@ -2,7 +2,7 @@ import requests, re, html
from bs4 import BeautifulSoup
from loguru import logger
from auto_archiver.base_processors import Extractor
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media

View File

@@ -8,21 +8,15 @@ from loguru import logger
from tqdm import tqdm
import re, time, json, os
from auto_archiver.base_processors import Extractor
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.utils import random_str
class TelethonArchiver(Extractor):
name = "telethon_extractor"
link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
def __init__(self, config: dict) -> None:
super().__init__(config)
self.assert_valid_string("api_id")
self.assert_valid_string("api_hash")
def setup(self) -> None:
"""

View File

@@ -9,7 +9,7 @@ and identify important moments without watching the entire video.
import ffmpeg, os
from loguru import logger
from auto_archiver.base_processors import Enricher
from auto_archiver.core import Enricher
from auto_archiver.core import Media, Metadata, ArchivingContext
from auto_archiver.utils.misc import random_str
@@ -42,7 +42,7 @@ class ThumbnailEnricher(Enricher):
logger.error(f"error getting duration of video {m.filename}: {e}")
return
num_thumbs = int(min(max(1, duration * self.thumbnails_per_second), self.max_thumbnails))
num_thumbs = int(min(max(1, duration * self.thumbnails_per_minute), self.max_thumbnails))
timestamps = [duration / (num_thumbs + 1) * i for i in range(1, num_thumbs + 1)]
thumbnails_media = []

View File

@@ -8,9 +8,9 @@ from certvalidator import CertificateValidator, ValidationContext
from asn1crypto import pem
import certifi
from auto_archiver.base_processors import Enricher
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, ArchivingContext, Media
from auto_archiver.base_processors import Extractor
from auto_archiver.core import Extractor
class TimestampingEnricher(Enricher):
@@ -21,10 +21,6 @@ class TimestampingEnricher(Enricher):
See https://gist.github.com/Manouchehri/fd754e402d98430243455713efada710 for list of timestamp authorities.
"""
name = "timestamping_enricher"
def __init__(self, config: dict) -> None:
super().__init__(config)
def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url()

View File

@@ -8,11 +8,10 @@ from loguru import logger
from pytwitter import Api
from slugify import slugify
from auto_archiver.base_processors import Extractor
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata,Media
class TwitterApiExtractor(Extractor):
name = "twitter_api_extractor"
link_pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
def __init__(self, config: dict) -> None:

View File

@@ -2,7 +2,7 @@ from loguru import logger
from vk_url_scraper import VkScraper
from auto_archiver.utils.misc import dump_payload
from auto_archiver.base_processors import Extractor
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media, ArchivingContext
@@ -11,7 +11,6 @@ class VkExtractor(Extractor):
VK videos are handled by YTDownloader, this archiver gets posts text and images.
Currently only works for /wall posts
"""
name = "vk_extractor"
def __init__(self, config: dict) -> None:
super().__init__(config)

View File

@@ -6,7 +6,7 @@ from loguru import logger
from warcio.archiveiterator import ArchiveIterator
from auto_archiver.core import Media, Metadata, ArchivingContext
from auto_archiver.base_processors import Extractor, Enricher
from auto_archiver.core import Extractor, Enricher
from auto_archiver.utils import UrlUtil, random_str
@@ -17,11 +17,6 @@ class WaczExtractorEnricher(Enricher, Extractor):
it can become quite powerful for archiving private content.
When used as an archiver it will extract the media from the .WACZ archive so it can be enriched.
"""
name = "wacz_archiver_enricher"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
def setup(self) -> None:
self.use_docker = os.environ.get('WACZ_ENABLE_DOCKER') or not os.environ.get('RUNNING_IN_DOCKER')

View File

@@ -2,7 +2,7 @@ import json
from loguru import logger
import time, requests
from auto_archiver.base_processors import Extractor, Enricher
from auto_archiver.core import Extractor, Enricher
from auto_archiver.utils import UrlUtil
from auto_archiver.core import Metadata
@@ -12,13 +12,6 @@ class WaybackExtractorEnricher(Enricher, Extractor):
The Wayback machine will rate-limit IP heavy usage.
"""
name = "wayback_archiver_enricher"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
assert type(self.secret) == str and len(self.secret) > 0, "please provide a value for the wayback_enricher API key"
assert type(self.secret) == str and len(self.secret) > 0, "please provide a value for the wayback_enricher API secret"
def download(self, item: Metadata) -> Metadata:
# this new Metadata object is required to avoid duplication

View File

@@ -2,7 +2,7 @@ import traceback
import requests, time
from loguru import logger
from auto_archiver.base_processors import Enricher
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.modules.s3_storage import S3Storage
@@ -13,14 +13,6 @@ class WhisperEnricher(Enricher):
whisper API repository: https://github.com/bellingcat/whisperbox-transcribe/
Only works if an S3 compatible storage is used
"""
name = "whisper_enricher"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
assert type(self.api_endpoint) == str and len(self.api_endpoint) > 0, "please provide a value for the whisper_enricher api_endpoint"
assert type(self.api_key) == str and len(self.api_key) > 0, "please provide a value for the whisper_enricher api_key"
self.timeout = int(self.timeout)
def enrich(self, to_enrich: Metadata) -> None:
if not self._get_s3_storage():