From 244341d22c3f0d96508a23e82ec5471948efc7f2 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 19 Mar 2025 18:08:04 +0400 Subject: [PATCH 1/5] Skip check for 'docker' bin dependency if already running in docker --- src/auto_archiver/core/module.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index 903a4ab..d086f6c 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -237,8 +237,13 @@ class LazyBaseModule: return find_spec(dep) + def check_bin_dep(dep): + if dep == "docker" and os.environ.get("RUNNING_IN_DOCKER"): + return True + return shutil.which(dep) + check_deps(self.dependencies.get("python", []), check_python_dep) - check_deps(self.dependencies.get("bin", []), lambda dep: shutil.which(dep)) + check_deps(self.dependencies.get("bin", []), check_bin_dep) logger.debug(f"Loading module '{self.display_name}'...") From e531906d7345ea23cd149e169aaaaecebbb015b2 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 19 Mar 2025 18:08:24 +0400 Subject: [PATCH 2/5] Create an independent profile file for each wacz_extractor_enricher instance --- .../wacz_extractor_enricher/wacz_extractor_enricher.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py b/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py index 975d49a..b66f03c 100644 --- a/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py +++ b/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py @@ -24,7 +24,8 @@ class WaczExtractorEnricher(Enricher, Extractor): self.use_docker = os.environ.get("WACZ_ENABLE_DOCKER") or not os.environ.get("RUNNING_IN_DOCKER") self.docker_in_docker = os.environ.get("WACZ_ENABLE_DOCKER") and os.environ.get("RUNNING_IN_DOCKER") - self.cwd_dind = 
f"/crawls/crawls{random_str(8)}" + self.crawl_id = random_str(8) + self.cwd_dind = f"/crawls/crawls{self.crawl_id}" self.browsertrix_home_host = os.environ.get("BROWSERTRIX_HOME_HOST") self.browsertrix_home_container = os.environ.get("BROWSERTRIX_HOME_CONTAINER") or self.browsertrix_home_host # create crawls folder if not exists, so it can be safely removed in cleanup @@ -50,7 +51,7 @@ class WaczExtractorEnricher(Enricher, Extractor): url = to_enrich.get_url() - collection = random_str(8) + collection = self.crawl_id browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir) browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host @@ -102,10 +103,11 @@ class WaczExtractorEnricher(Enricher, Extractor): ] + cmd if self.profile: - profile_fn = os.path.join(browsertrix_home_container, "profile.tar.gz") + profile_file = f"profile-{self.crawl_id}.tar.gz" + profile_fn = os.path.join(browsertrix_home_container, profile_file) logger.debug(f"copying {self.profile} to {profile_fn}") shutil.copyfile(self.profile, profile_fn) - cmd.extend(["--profile", os.path.join("/crawls", "profile.tar.gz")]) + cmd.extend(["--profile", os.path.join("/crawls", profile_file)]) else: logger.debug(f"generating WACZ without Docker for {url=}") From 799cef3a8c9b691b0c856ec1e36b87bb1742ca85 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Thu, 20 Mar 2025 16:28:19 +0400 Subject: [PATCH 3/5] Cleanup docker-compose --- docker-compose.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docker-compose.yaml b/docker-compose.yaml index 56c2ccb..07ceb00 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,4 +1,3 @@ -version: '3.8' services: auto-archiver: @@ -10,7 +9,4 @@ services: volumes: - ./secrets:/app/secrets - ./local_archive:/app/local_archive - environment: - - WACZ_ENABLE_DOCKER=true - - RUNNING_IN_DOCKER=true command: --config secrets/orchestration.yaml From f22af5e123378b0c66f1e4ebb0c5e11edb965cd5 Mon Sep 17 00:00:00 
2001 From: Patrick Robertson Date: Thu, 20 Mar 2025 16:28:47 +0400 Subject: [PATCH 4/5] Tweak WACZ enricher docs + add comment on WACZ_ENABLE_DOCKER --- src/auto_archiver/core/module.py | 22 +++++++++++++----- .../wacz_extractor_enricher/__manifest__.py | 23 +++++++++++++++---- tests/enrichers/test_wacz_enricher.py | 10 ++++++++ 3 files changed, 44 insertions(+), 11 deletions(-) diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py index d086f6c..9adb14a 100644 --- a/src/auto_archiver/core/module.py +++ b/src/auto_archiver/core/module.py @@ -5,6 +5,7 @@ by handling user configuration, validating the steps properties, and implementin """ from __future__ import annotations +import subprocess from dataclasses import dataclass from typing import List, TYPE_CHECKING, Type @@ -17,7 +18,7 @@ import os from os.path import join from loguru import logger import auto_archiver -from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE +from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE, SetupError if TYPE_CHECKING: from .base_module import BaseModule @@ -216,9 +217,9 @@ class LazyBaseModule: if not check(dep): logger.error( f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \ - Have you installed the required dependencies for the '{self.name}' module? See the README for more information." + Have you installed the required dependencies for the '{self.name}' module? See the documentation for more information." 
) - exit(1) + raise SetupError() def check_python_dep(dep): # first check if it's a module: @@ -238,9 +239,18 @@ class LazyBaseModule: return find_spec(dep) def check_bin_dep(dep): - if dep == "docker" and os.environ.get("RUNNING_IN_DOCKER"): - return True - return shutil.which(dep) + dep_exists = shutil.which(dep) + + if dep == "docker": + if os.environ.get("RUNNING_IN_DOCKER"): + # this is only for the WACZ enricher, which requires docker + # if we're already running in docker then we don't need docker + return True + + # check if docker daemon is running + return dep_exists and subprocess.run(["docker", "ps", "-q"]).returncode == 0 + + return dep_exists check_deps(self.dependencies.get("python", []), check_python_dep) check_deps(self.dependencies.get("bin", []), check_bin_dep) diff --git a/src/auto_archiver/modules/wacz_extractor_enricher/__manifest__.py b/src/auto_archiver/modules/wacz_extractor_enricher/__manifest__.py index 97e3bf6..c6454b0 100644 --- a/src/auto_archiver/modules/wacz_extractor_enricher/__manifest__.py +++ b/src/auto_archiver/modules/wacz_extractor_enricher/__manifest__.py @@ -11,7 +11,7 @@ "configs": { "profile": { "default": None, - "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles).", + "help": "browsertrix-profile (for profile generation see https://crawler.docs.browsertrix.com/user-guide/browser-profiles/).", }, "docker_commands": {"default": None, "help": "if a custom docker invocation is needed"}, "timeout": {"default": 120, "help": "timeout for WACZ generation in seconds", "type": "int"}, @@ -40,14 +40,27 @@ Creates .WACZ archives of web pages using the `browsertrix-crawler` tool, with options for media extraction and screenshot saving. [Browsertrix-crawler](https://crawler.docs.browsertrix.com/user-guide/) is a headless browser-based crawler that archives web pages in WACZ format. 
- ### Features + ## Setup + + **Docker** + If you are using the Docker file to run Auto Archiver (recommended), then everything is set up and you can use WACZ out of the box! + Otherwise, if you are using a local install of Auto Archiver (e.g. pip or dev install), then you will need to install Docker and run + the Docker daemon to be able to run the `browsertrix-crawler` tool. + + **Browsertrix Profiles** + A browsertrix profile is a custom browser profile (login information, browser extensions, etc.) that can be used to archive private or dynamic content. + You can run the WACZ Enricher without a profile, but for more resilient archiving, it is recommended to create a profile. See the [Browsertrix documentation](https://crawler.docs.browsertrix.com/user-guide/browser-profiles/) + for more information. + + **Docker in Docker** + If you are running Auto Archiver within a Docker container, you will need to enable Docker in Docker to run the `browsertrix-crawler` tool. + This can be done by setting the `WACZ_ENABLE_DOCKER` environment variable to `1`. + + ## Features - Archives web pages into .WACZ format using Docker or direct invocation of `browsertrix-crawler`. - Supports custom profiles for archiving private or dynamic content. - Extracts media (images, videos, audio) and screenshots from the archive, optionally adding them to the enrichment pipeline. - Generates metadata from the archived page's content and structure (e.g., titles, text). - ### Notes - - Requires Docker for running `browsertrix-crawler` . - - Configurable via parameters for timeout, media extraction, screenshots, and proxy settings. 
""", } diff --git a/tests/enrichers/test_wacz_enricher.py b/tests/enrichers/test_wacz_enricher.py index ceab83b..f4d1557 100644 --- a/tests/enrichers/test_wacz_enricher.py +++ b/tests/enrichers/test_wacz_enricher.py @@ -4,6 +4,7 @@ from zipfile import ZipFile import pytest from auto_archiver.core import Metadata, Media +from auto_archiver.core.consts import SetupError @pytest.fixture @@ -22,6 +23,15 @@ def wacz_enricher(setup_module, mock_binary_dependencies): return wacz +def test_raises_error_without_docker_installed(setup_module, mocker, caplog): + # pretend that docker isn't installed + mocker.patch("shutil.which").return_value = None + with pytest.raises(SetupError): + setup_module("wacz_extractor_enricher", {}) + + assert "requires external dependency 'docker' which is not available/setup" in caplog.text + + def test_setup_without_docker(wacz_enricher, mocker): mocker.patch.dict(os.environ, {"RUNNING_IN_DOCKER": "1"}, clear=True) wacz_enricher.setup() From 1e19ad77c628c3090e78c8093b48567ece65451a Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Thu, 20 Mar 2025 18:08:19 +0400 Subject: [PATCH 5/5] Fix tests --- tests/test_modules.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_modules.py b/tests/test_modules.py index f672ca6..7067db3 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -1,6 +1,7 @@ import pytest from auto_archiver.core.module import ModuleFactory, LazyBaseModule from auto_archiver.core.base_module import BaseModule +from auto_archiver.core.consts import SetupError @pytest.fixture @@ -25,11 +26,9 @@ def test_python_dependency_check(example_module): # monkey patch the manifest to include a nonexistnet dependency example_module.manifest["dependencies"]["python"] = ["does_not_exist"] - with pytest.raises(SystemExit) as load_error: + with pytest.raises(SetupError): example_module.load({}) - assert load_error.value.code == 1 - def test_binary_dependency_check(example_module): # example_module 
requires ffmpeg, which is not installed