Merge pull request #261 from bellingcat/wacz_separate_profile

Wacz minor adjustments
2026-06-12 05:08:28 +03:00 · 2025-03-20 15:51:56 +00:00
parent 613ba0c05d 1e19ad77c6
commit e6c5705f70
6 changed files with 55 additions and 20 deletions
--- a/src/auto_archiver/modules/wacz_extractor_enricher/manifest.py
+++ b/src/auto_archiver/modules/wacz_extractor_enricher/manifest.py
@@ -11,7 +11,7 @@
    "configs": {
        "profile": {
            "default": None,
-            "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles).",
+            "help": "browsertrix-profile (for profile generation see https://crawler.docs.browsertrix.com/user-guide/browser-profiles/).",
        },
        "docker_commands": {"default": None, "help": "if a custom docker invocation is needed"},
        "timeout": {"default": 120, "help": "timeout for WACZ generation in seconds", "type": "int"},
@@ -40,14 +40,27 @@
    Creates .WACZ archives of web pages using the `browsertrix-crawler` tool, with options for media extraction and screenshot saving.
    [Browsertrix-crawler](https://crawler.docs.browsertrix.com/user-guide/) is a headless browser-based crawler that archives web pages in WACZ format.

-    ### Features
+    ## Setup
+
+    **Docker**
+    If you are using the Docker file to run Auto Archiver (recommended), then everything is set up and you can use WACZ out of the box!
+    Otherwise, if you are using a local install of Auto Archiver (e.g. pip or dev install), then you will need to install Docker and run 
+    the docker daemon to be able to run the `browsertrix-crawler` tool.
+
+    **Browsertrix Profiles**
+    A browsertrix profile is a custom browser profile (login information, browser extensions, etc.) that can be used to archive private or dynamic content.
+    You can run the WACZ Enricher without a profile, but for more resilient archiving, it is recommended to create a profile. See the [Browsertrix documentation](https://crawler.docs.browsertrix.com/user-guide/browser-profiles/)
+    for more information.
+
+    **Docker in Docker**
+    If you are running Auto Archiver within a Docker container, you will need to enable Docker in Docker to run the `browsertrix-crawler` tool.
+    This can be done by setting the `WACZ_ENABLE_DOCKER` environment variable to `1`.
+
+    ## Features
    - Archives web pages into .WACZ format using Docker or direct invocation of `browsertrix-crawler`.
    - Supports custom profiles for archiving private or dynamic content.
    - Extracts media (images, videos, audio) and screenshots from the archive, optionally adding them to the enrichment pipeline.
    - Generates metadata from the archived page's content and structure (e.g., titles, text).

-    ### Notes
-    - Requires Docker for running `browsertrix-crawler` .
-    - Configurable via parameters for timeout, media extraction, screenshots, and proxy settings.
    """,
 }
--- a/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py
+++ b/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py
@@ -24,7 +24,8 @@ class WaczExtractorEnricher(Enricher, Extractor):
        self.use_docker = os.environ.get("WACZ_ENABLE_DOCKER") or not os.environ.get("RUNNING_IN_DOCKER")
        self.docker_in_docker = os.environ.get("WACZ_ENABLE_DOCKER") and os.environ.get("RUNNING_IN_DOCKER")

-        self.cwd_dind = f"/crawls/crawls{random_str(8)}"
+        self.crawl_id = random_str(8)
+        self.cwd_dind = f"/crawls/crawls{self.crawl_id}"
        self.browsertrix_home_host = os.environ.get("BROWSERTRIX_HOME_HOST")
        self.browsertrix_home_container = os.environ.get("BROWSERTRIX_HOME_CONTAINER") or self.browsertrix_home_host
        # create crawls folder if not exists, so it can be safely removed in cleanup
@@ -50,7 +51,7 @@ class WaczExtractorEnricher(Enricher, Extractor):

        url = to_enrich.get_url()

-        collection = random_str(8)
+        collection = self.crawl_id
        browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir)
        browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host

@@ -102,10 +103,11 @@ class WaczExtractorEnricher(Enricher, Extractor):
                ] + cmd

            if self.profile:
-                profile_fn = os.path.join(browsertrix_home_container, "profile.tar.gz")
+                profile_file = f"profile-{self.crawl_id}.tar.gz"
+                profile_fn = os.path.join(browsertrix_home_container, profile_file)
                logger.debug(f"copying {self.profile} to {profile_fn}")
                shutil.copyfile(self.profile, profile_fn)
-                cmd.extend(["--profile", os.path.join("/crawls", "profile.tar.gz")])
+                cmd.extend(["--profile", os.path.join("/crawls", profile_file)])

        else:
            logger.debug(f"generating WACZ without Docker for {url=}")