Merge pull request #402 from bellingcat/dev

bug fix: wacz screenshots leak in shared session
2026-06-07 19:08:30 +03:00 · 2026-02-25 10:39:54 +00:00
parent 9a97fede43 a739361e12
commit 52a7cabaf1
2 changed files with 10 additions and 8 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

 [project]
 name = "auto-archiver"
-version = "1.2.1"
+version = "1.2.2"
 description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."

 requires-python = ">=3.10,<3.13"
--- a/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py
+++ b/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py
@@ -24,8 +24,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
        self.use_docker = os.environ.get("WACZ_ENABLE_DOCKER") or not os.environ.get("RUNNING_IN_DOCKER")
        self.docker_in_docker = os.environ.get("WACZ_ENABLE_DOCKER") and os.environ.get("RUNNING_IN_DOCKER")

-        self.crawl_id = random_str(8)
-        self.cwd_dind = f"/crawls/crawls{self.crawl_id}"
+        self.cwd_dind = f"/crawls/crawls{random_str(8)}"
        self.browsertrix_home_host = os.environ.get("BROWSERTRIX_HOME_HOST")
        self.browsertrix_home_container = os.environ.get("BROWSERTRIX_HOME_CONTAINER") or self.browsertrix_home_host
        # create crawls folder if not exists, so it can be safely removed in cleanup
@@ -51,7 +50,8 @@ class WaczExtractorEnricher(Enricher, Extractor):

        url = to_enrich.get_url()

-        collection = self.crawl_id
+        crawl_id = random_str(8)
+        collection = crawl_id
        browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir)
        browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host

@@ -83,8 +83,10 @@ class WaczExtractorEnricher(Enricher, Extractor):
            # "--blockAds" # note: this has been known to cause issues on cloudflare protected sites
        ]

+        crawl_cwd_dind = os.path.join(self.cwd_dind, crawl_id)
        if self.docker_in_docker:
-            cmd.extend(["--cwd", self.cwd_dind])
+            os.makedirs(crawl_cwd_dind, exist_ok=True)
+            cmd.extend(["--cwd", crawl_cwd_dind])

        if self.auth_for_site(url):
            # there's an auth for this site, but browsertrix only supports username/password auth
@@ -109,7 +111,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
                ] + cmd

            if self.profile:
-                profile_file = f"profile-{self.crawl_id}.tar.gz"
+                profile_file = f"profile-{crawl_id}.tar.gz"
                profile_fn = os.path.join(browsertrix_home_container, profile_file)
                logger.debug(f"Copying {self.profile} to {profile_fn}")
                shutil.copyfile(self.profile, profile_fn)
@@ -137,7 +139,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
            return False

        if self.docker_in_docker:
-            wacz_fn = os.path.join(self.cwd_dind, "collections", collection, f"{collection}.wacz")
+            wacz_fn = os.path.join(crawl_cwd_dind, "collections", collection, f"{collection}.wacz")
        elif self.use_docker:
            wacz_fn = os.path.join(browsertrix_home_container, "collections", collection, f"{collection}.wacz")
        else:
@@ -152,7 +154,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
            self.extract_media_from_wacz(to_enrich, wacz_fn)

        if self.docker_in_docker:
-            jsonl_fn = os.path.join(self.cwd_dind, "collections", collection, "pages", "pages.jsonl")
+            jsonl_fn = os.path.join(crawl_cwd_dind, "collections", collection, "pages", "pages.jsonl")
        elif self.use_docker:
            jsonl_fn = os.path.join(browsertrix_home_container, "collections", collection, "pages", "pages.jsonl")
        else: