V0.9.0 - closes several open issues: new enrichers and bug fixes (#133)

* clean orchestrator code, add archiver cleanup logic * improves documentation for database.py * telethon archivers isolate sessions into copied files * closes #127 * closes #125 * closes #84 * meta enricher applies to all media * closes #61 adds subtitles and comments * minor update * minor fixes to yt-dlp subtitles and comments * closes #17 but logic is imperfect. * closes #85 ssl enhancer * minifies html, JS refactor for preview of certificates * closes #91 adds freetsa timestamp authority * version bump * simplify download_url method * skip ssl if nothing archived * html preview improvements * adds retrying lib * manual download archiver improvements * meta only runs when relevant data available * new metadata convenience method * html template improvements * removes debug message * does not close #91 yet, will need some more certificate chain logging * adds verbosity config * new instagram api archiver * adds proxy support we * adds proxy/end support and bug fix for yt-dlp * proxy support for webdriver * adds socks proxy to wacz_enricher * refactor recursivity in inner media and display * infinite recursive display * foolproofing timestamping authorities * version to 0.9.0 * minor fixes from code-review
2026-06-12 21:28:29 +03:00 · 2024-02-20 18:05:29 +00:00
parent 5c49124ac6
commit 7a21ae96af
34 changed files with 1696 additions and 880 deletions
--- a/src/auto_archiver/core/context.py
+++ b/src/auto_archiver/core/context.py
@@ -1,6 +1,3 @@
-from loguru import logger
-
-
 class ArchivingContext:
    """
    Singleton context class.
--- a/src/auto_archiver/core/media.py
+++ b/src/auto_archiver/core/media.py
@@ -44,10 +44,14 @@ class Media:
        """
        if include_self: yield self
        for prop in self.properties.values():
-            if isinstance(prop, Media): yield prop
+            if isinstance(prop, Media): 
+                for inner_media in prop.all_inner_media(include_self=True):
+                    yield inner_media
            if isinstance(prop, list):
                for prop_media in prop:
-                    if isinstance(prop_media, Media): yield prop_media
+                    if isinstance(prop_media, Media): 
+                        for inner_media in prop_media.all_inner_media(include_self=True):
+                            yield inner_media

    def is_stored(self) -> bool:
        return len(self.urls) > 0 and len(self.urls) == len(ArchivingContext.get("storages"))
--- a/src/auto_archiver/core/metadata.py
+++ b/src/auto_archiver/core/metadata.py
@@ -54,6 +54,12 @@ class Metadata:
        self.metadata[key] = val
        return self

+    def append(self, key: str, val: Any) -> Metadata:
+        if key not in self.metadata:
+            self.metadata[key] = []  # first value for this key: start a new list
+        self.metadata[key].append(val)  # accumulate; plain assignment would discard the list
+        return self
+
    def get(self, key: str, default: Any = None, create_if_missing=False) -> Union[Metadata, str]:
        # goes through metadata and returns the Metadata available
        if create_if_missing and key not in self.metadata:
@@ -69,7 +75,8 @@ class Metadata:
        return "success" in self.status

    def is_empty(self) -> bool:
-        return not self.is_success() and len(self.media) == 0 and len(self.metadata) <= 2  # url, processed_at
+        meaningfull_ids = set(self.metadata.keys()) - set(["_processed_at", "url", "total_bytes", "total_size", "archive_duration_seconds"])
+        return not self.is_success() and len(self.media) == 0 and len(meaningfull_ids) == 0

    @property  # getter .netloc
    def netloc(self) -> str:
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -25,13 +25,28 @@ class ArchivingOrchestrator:
        self.storages: List[Storage] = config.storages
        ArchivingContext.set("storages", self.storages, keep_on_reset=True)

-        for a in self.archivers: a.setup()
+        try: 
+            for a in self.archivers: a.setup()
+        except (KeyboardInterrupt, Exception) as e:
+            logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}")
+            self.cleanup()
+
+
+    def cleanup(self)->None:
+        logger.info("Cleaning up")
+        for a in self.archivers: a.cleanup()

    def feed(self) -> Generator[Metadata]:
        for item in self.feeder:
            yield self.feed_item(item)
+        self.cleanup()

    def feed_item(self, item: Metadata) -> Metadata:
+        """
+        Takes one item (URL) to archive and calls self.archive, additionally:
+            - catches keyboard interruptions to do a clean exit
+            - catches any unexpected error, logs it, and does a clean exit
+        """
        try:
            ArchivingContext.reset()
            with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
@@ -41,36 +56,34 @@ class ArchivingOrchestrator:
            # catches keyboard interruptions to do a clean exit
            logger.warning(f"caught interrupt on {item=}")
            for d in self.databases: d.aborted(item)
+            self.cleanup()
            exit()
        except Exception as e:
            logger.error(f'Got unexpected error on item {item}: {e}\n{traceback.format_exc()}')
            for d in self.databases: d.failed(item)

-        # how does this handle the parameters like folder which can be different for each archiver?
-        # the storage needs to know where to archive!!
-        # solution: feeders have context: extra metadata that they can read or ignore,
-        # all of it should have sensible defaults (eg: folder)
-        # default feeder is a list with 1 element

    def archive(self, result: Metadata) -> Union[Metadata, None]:
+        """
+            Runs the archiving process for a single URL
+            1. Each archiver can sanitize its own URLs
+            2. Check for cached results in Databases, and signal start to the databases
+            3. Call Archivers until one succeeds
+            4. Call Enrichers
+            5. Store all downloaded/generated media
+            6. Call selected Formatter and store formatted if needed
+        """
        original_url = result.get_url()

-        # 1 - cleanup
-        # each archiver is responsible for cleaning/expanding its own URLs
+        # 1 - sanitize - each archiver is responsible for cleaning/expanding its own URLs
        url = original_url
        for a in self.archivers: url = a.sanitize_url(url)
        result.set_url(url)
        if original_url != url: result.set("original_url", original_url)

-        # 2 - notify start to DB
-        # signal to DB that archiving has started
-        # and propagate already archived if it exists
+        # 2 - notify start to DBs, propagate already archived if feature enabled in DBs
        cached_result = None
        for d in self.databases:
-            # are the databases to decide whether to archive?
-            # they can simply return True by default, otherwise they can avoid duplicates. should this logic be more granular, for example on the archiver level: a tweet will not need be scraped twice, whereas an instagram profile might. the archiver could not decide from the link which parts to archive,
-            # instagram profile example: it would always re-archive everything
-            # maybe the database/storage could use a hash/key to decide if there's a need to re-archive
            d.started(result)
            if (local_result := d.fetch(result)):
                cached_result = (cached_result or Metadata()).merge(local_result)
@@ -84,30 +97,21 @@ class ArchivingOrchestrator:
        for a in self.archivers:
            logger.info(f"Trying archiver {a.name} for {url}")
            try:
-                # Q: should this be refactored so it's just a.download(result)?
                result.merge(a.download(result))
                if result.is_success(): break
-            except Exception as e: logger.error(f"Unexpected error with archiver {a.name}: {e}: {traceback.format_exc()}")
+            except Exception as e: 
+                logger.error(f"ERROR archiver {a.name}: {e}: {traceback.format_exc()}")

-        # what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
-        # should it call the HTMLgenerator as if it's not an enrichment?
-        # eg: if it is enable: generates an HTML with all the returned media, should it include enrichers? yes
-        # then how to execute it last? should there also be post-processors? are there other examples?
-        # maybe as a PDF? or a Markdown file
-
-        # 4 - call enrichers: have access to archived content, can generate metadata and Media
-        # eg: screenshot, wacz, webarchive, thumbnails
+        # 4 - call enrichers to work with archived content
        for e in self.enrichers:
            try: e.enrich(result)
-            except Exception as exc: logger.error(f"Unexpected error with enricher {e.name}: {exc}: {traceback.format_exc()}")
+            except Exception as exc: 
+                logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")

-        # 5 - store media
-        # looks for Media in result.media and also result.media[x].properties (as list or dict values)
+        # 5 - store all downloaded/generated media
        result.store()

-
        # 6 - format and store formatted if needed
-        # enrichers typically need access to already stored URLs etc
        if (final_media := self.formatter.format(result)):
            final_media.store(url=url)
            result.set_final_media(final_media)
@@ -115,7 +119,7 @@ class ArchivingOrchestrator:
        if result.is_empty():
            result.status = "nothing archived"

-        # signal completion to databases (DBs, Google Sheets, CSV, ...)
+        # signal completion to databases and archivers
        for d in self.databases: d.done(result)

        return result