Merge branch 'main' into timestamping_rewrite

2026-06-12 05:08:28 +03:00 · 2025-03-26 14:37:51 +04:00
parent 3c4625d708 74974ef0ed
commit d6be1ff84f
12 changed files with 601 additions and 448 deletions
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -1,3 +1,4 @@
+import sys
 import datetime
 import os
 import importlib
@@ -37,12 +38,24 @@ class GenericExtractor(Extractor):
                next_update_check = datetime.datetime.fromisoformat(f.read())

        if not next_update_check or next_update_check < datetime.datetime.now():
-            self.update_ytdlp()
+            updated = self.update_ytdlp()

            next_update_check = datetime.datetime.now() + datetime.timedelta(days=self.ytdlp_update_interval)
            with open(path, "w") as f:
                f.write(next_update_check.isoformat())

+            if not updated:
+                return
+
+            if os.environ.get("AUTO_ARCHIVER_ALLOW_RESTART", "1") != "1":
+                logger.warning(
+                    "yt-dlp has been updated. Auto archiver should be restarted for these changes to take effect"
+                )
+            else:
+                logger.warning("Restarting auto-archiver to apply yt-dlp update")
+                logger.warning(" ======= RESTARTING ======= ")
+                os.execv(sys.executable, [sys.executable] + sys.argv)
+
    def update_ytdlp(self):
        logger.info("Checking and updating yt-dlp...")
        logger.info(
@@ -58,12 +71,14 @@ class GenericExtractor(Extractor):
            if "Successfully installed yt-dlp" in result.stdout.decode():
                new_version = importlib.metadata.version("yt-dlp")
                logger.info(f"yt-dlp successfully (from {old_version} to {new_version})")
-                importlib.reload(yt_dlp)
+                return True
            else:
                logger.info("yt-dlp already up to date")
+                return False

        except Exception as e:
            logger.error(f"Error updating yt-dlp: {e}")
+            return False

    def suitable_extractors(self, url: str) -> Generator[str, None, None]:
        """
--- a/src/auto_archiver/modules/wacz_extractor_enricher/manifest.py
+++ b/src/auto_archiver/modules/wacz_extractor_enricher/manifest.py
@@ -40,27 +40,31 @@
    Creates .WACZ archives of web pages using the `browsertrix-crawler` tool, with options for media extraction and screenshot saving.
    [Browsertrix-crawler](https://crawler.docs.browsertrix.com/user-guide/) is a headless browser-based crawler that archives web pages in WACZ format.

-    ## Setup
-
-    **Docker**
-    If you are using the Docker file to run Auto Archiver (recommended), then everything is set up and you can use WACZ out of the box!
-    Otherwise, if you are using a local install of Auto Archiver (e.g. pip or dev install), then you will need to install Docker and run 
-    the docker daemon to be able to run the `browsertrix-crawler` tool.
-
-    **Browsertrix Profiles**
-    A browsertrix profile is a custom browser profile (login information, browser extensions, etc.) that can be used to archive private or dynamic content.
-    You can run the WACZ Enricher without a profile, but for more resilient archiving, it is recommended to create a profile. See the [Browsertrix documentation](https://crawler.docs.browsertrix.com/user-guide/browser-profiles/)
-    for more information.
-
-    ** Docker in Docker **
-    If you are running Auto Archiver within a Docker container, you will need to enable Docker in Docker to run the `browsertrix-crawler` tool.
-    This can be done by setting the `WACZ_ENABLE_DOCKER` environment variable to `1`.
-
    ## Features
    - Archives web pages into .WACZ format using Docker or direct invocation of `browsertrix-crawler`.
    - Supports custom profiles for archiving private or dynamic content.
    - Extracts media (images, videos, audio) and screenshots from the archive, optionally adding them to the enrichment pipeline.
    - Generates metadata from the archived page's content and structure (e.g., titles, text).

+    ## Setup
+
+    ### Using Docker
+    If you are using the Auto Archiver [Docker image](https://auto-archiver.readthedocs.io/en/latest/installation/installation.html#installing-with-docker)
+    to run Auto Archiver (recommended), then everything is set up and you can use WACZ out of the box!
+    Otherwise, if you are using a local install of Auto Archiver (e.g. pip or dev install), then you will need to install Docker and run
+    the Docker daemon to be able to run the `browsertrix-crawler` tool.
+
+    ### Browsertrix Profiles
+    A Browsertrix profile is a custom browser profile (login information, browser extensions, etc.) that can be used to archive private or dynamic content.
+    You can run the WACZ Enricher without a profile, but for more resilient archiving, it is recommended to create a profile.
+    See the [Browsertrix documentation](https://crawler.docs.browsertrix.com/user-guide/browser-profiles/) for more information on how to use the `create-login-profile` tool.
+
+
+
+    ### Docker in Docker
+    If you are running Auto Archiver within a Docker container, you will need to enable Docker in Docker to run the `browsertrix-crawler` tool.
+    This can be done by setting the `WACZ_ENABLE_DOCKER` environment variable to `1`.
+
+
    """,
 }
--- a/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py
+++ b/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py
@@ -86,6 +86,12 @@ class WaczExtractorEnricher(Enricher, Extractor):
        if self.docker_in_docker:
            cmd.extend(["--cwd", self.cwd_dind])

+        if self.auth_for_site(url):
+            # there's an auth for this site, but browsertrix only supports username/password auth
+            logger.warning(
+                "The WACZ enricher / Browsertrix does not support using the 'authentication' information for logging in. You should consider creating a Browser Profile for WACZ archiving. More information: https://auto-archiver.readthedocs.io/en/latest/modules/autogen/extractor/wacz_extractor_enricher.html#browsertrix-profiles"
+            )
+
        # call docker if explicitly enabled or we are running on the host (not in docker)
        if self.use_docker:
            logger.debug(f"generating WACZ in Docker for {url=}")