Further cleanup

* Removes (partly) the ArchivingOrchestrator
* Removes the cli_feeder module and makes it the 'default', allowing you to pass URLs directly on the command line without having to use the cumbersome `--cli_feeder.urls`. Just run `auto-archiver https://my.url.com`
* More unit tests
* Improved error handling
2026-06-12 05:08:28 +03:00 · 2025-01-30 16:43:09 +01:00
parent 953011f368
commit d6b4b7a932
27 changed files with 417 additions and 191 deletions
--- a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py
+++ b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py
@@ -40,5 +40,3 @@ class AtlosFeeder(Feeder):

            if len(data["results"]) == 0 or cursor is None:
                break
-
-        logger.success(f"Processed {count} URL(s)")
--- a/src/auto_archiver/modules/cli_feeder/__init__.py
+++ b/src/auto_archiver/modules/cli_feeder/__init__.py
@@ -1 +0,0 @@
-from .cli_feeder import CLIFeeder
--- a/src/auto_archiver/modules/cli_feeder/__manifest__.py
+++ b/src/auto_archiver/modules/cli_feeder/__manifest__.py
@@ -1,27 +0,0 @@
-{
-    "name": "CLI Feeder",
-    "type": ["feeder"],
-    "requires_setup": False,
-    "dependencies": {
-        "python": ["loguru"],
-    },
-    'entry_point': 'cli_feeder::CLIFeeder',
-    "configs": {
-        "urls": {
-            "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
-            "nargs": "+",
-            "required": True,
-            "do_not_store": True,
-            "metavar": "INPUT URLS",
-        },
-    },
-    "description": """
-    Processes URLs to archive passed via the command line and feeds them into the archiving pipeline.
-
-    ### Features
-    - Takes a single URL or a list of URLs provided via the command line.
-    - Converts each URL into a `Metadata` object and yields it for processing.
-    - Ensures URLs are processed only if they are explicitly provided.
-
-    """
-}
--- a/src/auto_archiver/modules/cli_feeder/cli_feeder.py
+++ b/src/auto_archiver/modules/cli_feeder/cli_feeder.py
@@ -1,15 +0,0 @@
-from loguru import logger
-
-from auto_archiver.core import Feeder
-from auto_archiver.core import Metadata, ArchivingContext
-
-
-class CLIFeeder(Feeder):
-
-    def __iter__(self) -> Metadata:
-        for url in self.urls:
-            logger.debug(f"Processing URL: '{url}'")
-            yield Metadata().set_url(url)
-            ArchivingContext.set("folder", "cli")
-
-        logger.success(f"Processed {len(self.urls)} URL(s)")
--- a/src/auto_archiver/modules/csv_feeder/__manifest__.py
+++ b/src/auto_archiver/modules/csv_feeder/__manifest__.py
@@ -26,7 +26,6 @@
    - Supports reading URLs from multiple input files, specified as a comma-separated list.
    - Allows specifying the column number or name to extract URLs from.
    - Skips header rows if the first value is not a valid URL.
-    - Integrates with the `ArchivingContext` to manage URL feeding.

    ### Setup
    - Input files should be formatted with one URL per line.
--- a/src/auto_archiver/modules/csv_feeder/csv_feeder.py
+++ b/src/auto_archiver/modules/csv_feeder/csv_feeder.py
@@ -20,6 +20,4 @@ class CSVFeeder(Feeder):
                    url = row[0]
                    logger.debug(f"Processing {url}")
                    yield Metadata().set_url(url)
-            ArchivingContext.set("folder", "cli")
-
-        logger.success(f"Processed {len(self.urls)} URL(s)")
+            ArchivingContext.set("folder", "cli")
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -270,7 +270,11 @@ class GenericExtractor(Extractor):
            logger.debug('Using Facebook cookie')
            yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie

-        ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
+        ydl_options = {'outtmpl': os.path.join(self.tmp_dir, f'%(id)s.%(ext)s'), 
+                       'quiet': False, 'noplaylist': not self.allow_playlist ,
+                       'writesubtitles': self.subtitles,'writeautomaticsub': self.subtitles,
+                       "live_from_start": self.live_from_start, "proxy": self.proxy,
+                       "max_downloads": self.max_downloads, "playlistend": self.max_downloads}

        if item.netloc in ['youtube.com', 'www.youtube.com']:
            if self.cookies_from_browser:
--- a/src/auto_archiver/modules/html_formatter/html_formatter.py
+++ b/src/auto_archiver/modules/html_formatter/html_formatter.py
@@ -7,7 +7,7 @@ import json
 import base64

 from auto_archiver.version import __version__
-from auto_archiver.core import Metadata, Media, ArchivingContext
+from auto_archiver.core import Metadata, Media
 from auto_archiver.core import Formatter
 from auto_archiver.modules.hash_enricher import HashEnricher
 from auto_archiver.utils.misc import random_str
@@ -46,7 +46,7 @@ class HtmlFormatter(Formatter):
            version=__version__
        )

-        html_path = os.path.join(ArchivingContext.get_tmp_dir(), f"formatted{random_str(24)}.html")
+        html_path = os.path.join(self.tmp_dir, f"formatted{random_str(24)}.html")
        with open(html_path, mode="w", encoding="utf-8") as outf:
            outf.write(content)
        final_media = Media(filename=html_path, _mimetype="text/html")
--- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
+++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
@@ -7,7 +7,7 @@ from selenium.common.exceptions import TimeoutException

 from auto_archiver.core import Enricher
 from auto_archiver.utils import Webdriver, UrlUtil, random_str
-from auto_archiver.core import Media, Metadata, ArchivingContext
+from auto_archiver.core import Media, Metadata

 class ScreenshotEnricher(Enricher):

@@ -23,11 +23,11 @@ class ScreenshotEnricher(Enricher):
            try:
                driver.get(url)
                time.sleep(int(self.sleep_before_screenshot))
-                screenshot_file = os.path.join(ArchivingContext.get_tmp_dir(), f"screenshot_{random_str(8)}.png")
+                screenshot_file = os.path.join(self.tmp_dir, f"screenshot_{random_str(8)}.png")
                driver.save_screenshot(screenshot_file)
                to_enrich.add_media(Media(filename=screenshot_file), id="screenshot")
                if self.save_to_pdf:
-                    pdf_file = os.path.join(ArchivingContext.get_tmp_dir(), f"pdf_{random_str(8)}.pdf")
+                    pdf_file = os.path.join(self.tmp_dir, f"pdf_{random_str(8)}.pdf")
                    pdf = driver.print_page(driver.print_options)
                    with open(pdf_file, "wb") as f:
                        f.write(base64.b64decode(pdf))
--- a/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py
+++ b/src/auto_archiver/modules/ssl_enricher/ssl_enricher.py
@@ -23,6 +23,6 @@ class SSLEnricher(Enricher):
        logger.debug(f"fetching SSL certificate for {domain=} in {url=}")

        cert = ssl.get_server_certificate((domain, 443))
-        cert_fn = os.path.join(ArchivingContext.get_tmp_dir(), f"{slugify(domain)}.pem")
+        cert_fn = os.path.join(self.tmp_dir, f"{slugify(domain)}.pem")
        with open(cert_fn, "w") as f: f.write(cert)
        to_enrich.add_media(Media(filename=cert_fn), id="ssl_certificate")
--- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
+++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
@@ -9,7 +9,7 @@ from tqdm import tqdm
 import re, time, json, os

 from auto_archiver.core import Extractor
-from auto_archiver.core import Metadata, Media, ArchivingContext
+from auto_archiver.core import Metadata, Media
 from auto_archiver.utils import random_str


@@ -120,7 +120,7 @@ class TelethonArchiver(Extractor):
            media_posts = self._get_media_posts_in_group(chat, post)
            logger.debug(f'got {len(media_posts)=} for {url=}')

-            tmp_dir = ArchivingContext.get_tmp_dir()
+            tmp_dir = self.tmp_dir

            group_id = post.grouped_id if post.grouped_id is not None else post.id
            title = post.message
--- a/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py
+++ b/src/auto_archiver/modules/thumbnail_enricher/thumbnail_enricher.py
@@ -28,7 +28,7 @@ class ThumbnailEnricher(Enricher):
        logger.debug(f"generating thumbnails for {to_enrich.get_url()}")
        for m_id, m in enumerate(to_enrich.media[::]):
            if m.is_video():
-                folder = os.path.join(ArchivingContext.get_tmp_dir(), random_str(24))
+                folder = os.path.join(self.tmp_dir, random_str(24))
                os.makedirs(folder, exist_ok=True)
                logger.debug(f"generating thumbnails for {m.filename}")
                duration = m.get("duration")
--- a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py
+++ b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py
@@ -9,9 +9,7 @@ from asn1crypto import pem
 import certifi

 from auto_archiver.core import Enricher
-from auto_archiver.core import Metadata, ArchivingContext, Media
-from auto_archiver.core import Extractor
-
+from auto_archiver.core import Metadata, Media

 class TimestampingEnricher(Enricher):
    """
@@ -33,7 +31,7 @@ class TimestampingEnricher(Enricher):
            logger.warning(f"No hashes found in {url=}")
            return
        
-        tmp_dir = ArchivingContext.get_tmp_dir()
+        tmp_dir = self.tmp_dir
        hashes_fn = os.path.join(tmp_dir, "hashes.txt")

        data_to_sign = "\n".join(hashes)
@@ -93,7 +91,7 @@ class TimestampingEnricher(Enricher):

        cert_chain = []
        for cert in path:
-            cert_fn = os.path.join(ArchivingContext.get_tmp_dir(), f"{str(cert.serial_number)[:20]}.crt")
+            cert_fn = os.path.join(self.tmp_dir, f"{str(cert.serial_number)[:20]}.crt")
            with open(cert_fn, "wb") as f:
                f.write(cert.dump())
            cert_chain.append(Media(filename=cert_fn).set("subject", cert.subject.native["common_name"]))
--- a/src/auto_archiver/modules/vk_extractor/vk_extractor.py
+++ b/src/auto_archiver/modules/vk_extractor/vk_extractor.py
@@ -3,7 +3,7 @@ from vk_url_scraper import VkScraper

 from auto_archiver.utils.misc import dump_payload
 from auto_archiver.core import Extractor
-from auto_archiver.core import Metadata, Media, ArchivingContext
+from auto_archiver.core import Metadata, Media


 class VkExtractor(Extractor):
@@ -35,7 +35,7 @@ class VkExtractor(Extractor):

        result.set_content(dump_payload(vk_scrapes))

-        filenames = self.vks.download_media(vk_scrapes, ArchivingContext.get_tmp_dir())
+        filenames = self.vks.download_media(vk_scrapes, self.tmp_dir)
        for filename in filenames:
            result.add_media(Media(filename))

--- a/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
+++ b/src/auto_archiver/modules/wacz_enricher/wacz_enricher.py
@@ -5,7 +5,7 @@ from zipfile import ZipFile
 from loguru import logger
 from warcio.archiveiterator import ArchiveIterator

-from auto_archiver.core import Media, Metadata, ArchivingContext
+from auto_archiver.core import Media, Metadata
 from auto_archiver.core import Extractor, Enricher
 from auto_archiver.utils import UrlUtil, random_str

@@ -51,7 +51,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
        url = to_enrich.get_url()

        collection = random_str(8)
-        browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(ArchivingContext.get_tmp_dir())
+        browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir)
        browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host

        cmd = [
@@ -154,7 +154,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
        logger.info(f"WACZ extract_media or extract_screenshot flag is set, extracting media from {wacz_filename=}")

        # unzipping the .wacz
-        tmp_dir = ArchivingContext.get_tmp_dir()
+        tmp_dir = self.tmp_dir
        unzipped_dir = os.path.join(tmp_dir, "unzipped")
        with ZipFile(wacz_filename, 'r') as z_obj:
            z_obj.extractall(path=unzipped_dir)