Merge pull request #402 from bellingcat/dev

bug fix: wacz screenshots leak in shared session
This commit is contained in:
Miguel Sozinho Ramalho
2026-02-25 10:39:54 +00:00
committed by GitHub
2 changed files with 10 additions and 8 deletions

View File

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
[project] [project]
name = "auto-archiver" name = "auto-archiver"
version = "1.2.1" version = "1.2.2"
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)." description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
requires-python = ">=3.10,<3.13" requires-python = ">=3.10,<3.13"

View File

@@ -24,8 +24,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
self.use_docker = os.environ.get("WACZ_ENABLE_DOCKER") or not os.environ.get("RUNNING_IN_DOCKER") self.use_docker = os.environ.get("WACZ_ENABLE_DOCKER") or not os.environ.get("RUNNING_IN_DOCKER")
self.docker_in_docker = os.environ.get("WACZ_ENABLE_DOCKER") and os.environ.get("RUNNING_IN_DOCKER") self.docker_in_docker = os.environ.get("WACZ_ENABLE_DOCKER") and os.environ.get("RUNNING_IN_DOCKER")
self.crawl_id = random_str(8) self.cwd_dind = f"/crawls/crawls{random_str(8)}"
self.cwd_dind = f"/crawls/crawls{self.crawl_id}"
self.browsertrix_home_host = os.environ.get("BROWSERTRIX_HOME_HOST") self.browsertrix_home_host = os.environ.get("BROWSERTRIX_HOME_HOST")
self.browsertrix_home_container = os.environ.get("BROWSERTRIX_HOME_CONTAINER") or self.browsertrix_home_host self.browsertrix_home_container = os.environ.get("BROWSERTRIX_HOME_CONTAINER") or self.browsertrix_home_host
# create crawls folder if not exists, so it can be safely removed in cleanup # create crawls folder if not exists, so it can be safely removed in cleanup
@@ -51,7 +50,8 @@ class WaczExtractorEnricher(Enricher, Extractor):
url = to_enrich.get_url() url = to_enrich.get_url()
collection = self.crawl_id crawl_id = random_str(8)
collection = crawl_id
browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir) browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir)
browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host
@@ -83,8 +83,10 @@ class WaczExtractorEnricher(Enricher, Extractor):
# "--blockAds" # note: this has been known to cause issues on cloudflare protected sites # "--blockAds" # note: this has been known to cause issues on cloudflare protected sites
] ]
crawl_cwd_dind = os.path.join(self.cwd_dind, crawl_id)
if self.docker_in_docker: if self.docker_in_docker:
cmd.extend(["--cwd", self.cwd_dind]) os.makedirs(crawl_cwd_dind, exist_ok=True)
cmd.extend(["--cwd", crawl_cwd_dind])
if self.auth_for_site(url): if self.auth_for_site(url):
# there's an auth for this site, but browsertrix only supports username/password auth # there's an auth for this site, but browsertrix only supports username/password auth
@@ -109,7 +111,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
] + cmd ] + cmd
if self.profile: if self.profile:
profile_file = f"profile-{self.crawl_id}.tar.gz" profile_file = f"profile-{crawl_id}.tar.gz"
profile_fn = os.path.join(browsertrix_home_container, profile_file) profile_fn = os.path.join(browsertrix_home_container, profile_file)
logger.debug(f"Copying {self.profile} to {profile_fn}") logger.debug(f"Copying {self.profile} to {profile_fn}")
shutil.copyfile(self.profile, profile_fn) shutil.copyfile(self.profile, profile_fn)
@@ -137,7 +139,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
return False return False
if self.docker_in_docker: if self.docker_in_docker:
wacz_fn = os.path.join(self.cwd_dind, "collections", collection, f"{collection}.wacz") wacz_fn = os.path.join(crawl_cwd_dind, "collections", collection, f"{collection}.wacz")
elif self.use_docker: elif self.use_docker:
wacz_fn = os.path.join(browsertrix_home_container, "collections", collection, f"{collection}.wacz") wacz_fn = os.path.join(browsertrix_home_container, "collections", collection, f"{collection}.wacz")
else: else:
@@ -152,7 +154,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
self.extract_media_from_wacz(to_enrich, wacz_fn) self.extract_media_from_wacz(to_enrich, wacz_fn)
if self.docker_in_docker: if self.docker_in_docker:
jsonl_fn = os.path.join(self.cwd_dind, "collections", collection, "pages", "pages.jsonl") jsonl_fn = os.path.join(crawl_cwd_dind, "collections", collection, "pages", "pages.jsonl")
elif self.use_docker: elif self.use_docker:
jsonl_fn = os.path.join(browsertrix_home_container, "collections", collection, "pages", "pages.jsonl") jsonl_fn = os.path.join(browsertrix_home_container, "collections", collection, "pages", "pages.jsonl")
else: else: