Merge pull request #261 from bellingcat/wacz_separate_profile

Wacz minor adjustments
This commit is contained in:
Patrick Robertson
2025-03-20 15:51:56 +00:00
committed by GitHub
6 changed files with 55 additions and 20 deletions

View File

@@ -1,4 +1,3 @@
version: '3.8'
services: services:
auto-archiver: auto-archiver:
@@ -10,7 +9,4 @@ services:
volumes: volumes:
- ./secrets:/app/secrets - ./secrets:/app/secrets
- ./local_archive:/app/local_archive - ./local_archive:/app/local_archive
environment:
- WACZ_ENABLE_DOCKER=true
- RUNNING_IN_DOCKER=true
command: --config secrets/orchestration.yaml command: --config secrets/orchestration.yaml

View File

@@ -5,6 +5,7 @@ by handling user configuration, validating the steps properties, and implementin
""" """
from __future__ import annotations from __future__ import annotations
import subprocess
from dataclasses import dataclass from dataclasses import dataclass
from typing import List, TYPE_CHECKING, Type from typing import List, TYPE_CHECKING, Type
@@ -17,7 +18,7 @@ import os
from os.path import join from os.path import join
from loguru import logger from loguru import logger
import auto_archiver import auto_archiver
from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE, SetupError
if TYPE_CHECKING: if TYPE_CHECKING:
from .base_module import BaseModule from .base_module import BaseModule
@@ -220,9 +221,9 @@ class LazyBaseModule:
if not check(dep): if not check(dep):
logger.error( logger.error(
f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \ f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \
Have you installed the required dependencies for the '{self.name}' module? See the README for more information." Have you installed the required dependencies for the '{self.name}' module? See the documentation for more information."
) )
exit(1) raise SetupError()
def check_python_dep(dep): def check_python_dep(dep):
# first check if it's a module: # first check if it's a module:
@@ -241,8 +242,22 @@ class LazyBaseModule:
return find_spec(dep) return find_spec(dep)
def check_bin_dep(dep):
    """Report whether the external binary dependency `dep` is usable.

    For most binaries this is simply the result of `shutil.which(dep)`
    (the resolved path, or None when it is not on PATH).

    "docker" gets special treatment:
    - when the RUNNING_IN_DOCKER environment variable is set, the check
      always passes;
    - otherwise the binary must exist AND `docker ps -q` must exit with
      status 0, which is used as a probe for a running docker daemon.

    Returns a truthy value when the dependency is available and a falsy
    value (None or False) otherwise.
    """
    dep_exists = shutil.which(dep)
    if dep != "docker":
        return dep_exists

    if os.environ.get("RUNNING_IN_DOCKER"):
        # this is only for the WACZ enricher, which requires docker
        # if we're already running in docker then we don't need docker
        return True

    if not dep_exists:
        return dep_exists

    # check if docker daemon is running; stdout is discarded so the ids of
    # running containers are not printed as a side effect of the check.
    # stderr is left attached so docker's own error message stays visible.
    probe = subprocess.run(["docker", "ps", "-q"], stdout=subprocess.DEVNULL)
    return probe.returncode == 0
check_deps(self.dependencies.get("python", []), check_python_dep) check_deps(self.dependencies.get("python", []), check_python_dep)
check_deps(self.dependencies.get("bin", []), lambda dep: shutil.which(dep)) check_deps(self.dependencies.get("bin", []), check_bin_dep)
logger.debug(f"Loading module '{self.display_name}'...") logger.debug(f"Loading module '{self.display_name}'...")

View File

@@ -11,7 +11,7 @@
"configs": { "configs": {
"profile": { "profile": {
"default": None, "default": None,
"help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles).", "help": "browsertrix-profile (for profile generation see https://crawler.docs.browsertrix.com/user-guide/browser-profiles/).",
}, },
"docker_commands": {"default": None, "help": "if a custom docker invocation is needed"}, "docker_commands": {"default": None, "help": "if a custom docker invocation is needed"},
"timeout": {"default": 120, "help": "timeout for WACZ generation in seconds", "type": "int"}, "timeout": {"default": 120, "help": "timeout for WACZ generation in seconds", "type": "int"},
@@ -40,14 +40,27 @@
Creates .WACZ archives of web pages using the `browsertrix-crawler` tool, with options for media extraction and screenshot saving. Creates .WACZ archives of web pages using the `browsertrix-crawler` tool, with options for media extraction and screenshot saving.
[Browsertrix-crawler](https://crawler.docs.browsertrix.com/user-guide/) is a headless browser-based crawler that archives web pages in WACZ format. [Browsertrix-crawler](https://crawler.docs.browsertrix.com/user-guide/) is a headless browser-based crawler that archives web pages in WACZ format.
### Features ## Setup
**Docker**
If you are using the Docker file to run Auto Archiver (recommended), then everything is set up and you can use WACZ out of the box!
Otherwise, if you are using a local install of Auto Archiver (e.g. pip or dev install), then you will need to install Docker and run
the docker daemon to be able to run the `browsertrix-crawler` tool.
**Browsertrix Profiles**
A browsertrix profile is a custom browser profile (login information, browser extensions, etc.) that can be used to archive private or dynamic content.
You can run the WACZ Enricher without a profile, but for more resilient archiving, it is recommended to create a profile. See the [Browsertrix documentation](https://crawler.docs.browsertrix.com/user-guide/browser-profiles/)
for more information.
**Docker in Docker**
If you are running Auto Archiver within a Docker container, you will need to enable Docker in Docker to run the `browsertrix-crawler` tool.
This can be done by setting the `WACZ_ENABLE_DOCKER` environment variable to `1`.
## Features
- Archives web pages into .WACZ format using Docker or direct invocation of `browsertrix-crawler`. - Archives web pages into .WACZ format using Docker or direct invocation of `browsertrix-crawler`.
- Supports custom profiles for archiving private or dynamic content. - Supports custom profiles for archiving private or dynamic content.
- Extracts media (images, videos, audio) and screenshots from the archive, optionally adding them to the enrichment pipeline. - Extracts media (images, videos, audio) and screenshots from the archive, optionally adding them to the enrichment pipeline.
- Generates metadata from the archived page's content and structure (e.g., titles, text). - Generates metadata from the archived page's content and structure (e.g., titles, text).
### Notes
- Requires Docker for running `browsertrix-crawler` .
- Configurable via parameters for timeout, media extraction, screenshots, and proxy settings.
""", """,
} }

View File

@@ -24,7 +24,8 @@ class WaczExtractorEnricher(Enricher, Extractor):
self.use_docker = os.environ.get("WACZ_ENABLE_DOCKER") or not os.environ.get("RUNNING_IN_DOCKER") self.use_docker = os.environ.get("WACZ_ENABLE_DOCKER") or not os.environ.get("RUNNING_IN_DOCKER")
self.docker_in_docker = os.environ.get("WACZ_ENABLE_DOCKER") and os.environ.get("RUNNING_IN_DOCKER") self.docker_in_docker = os.environ.get("WACZ_ENABLE_DOCKER") and os.environ.get("RUNNING_IN_DOCKER")
self.cwd_dind = f"/crawls/crawls{random_str(8)}" self.crawl_id = random_str(8)
self.cwd_dind = f"/crawls/crawls{self.crawl_id}"
self.browsertrix_home_host = os.environ.get("BROWSERTRIX_HOME_HOST") self.browsertrix_home_host = os.environ.get("BROWSERTRIX_HOME_HOST")
self.browsertrix_home_container = os.environ.get("BROWSERTRIX_HOME_CONTAINER") or self.browsertrix_home_host self.browsertrix_home_container = os.environ.get("BROWSERTRIX_HOME_CONTAINER") or self.browsertrix_home_host
# create crawls folder if not exists, so it can be safely removed in cleanup # create crawls folder if not exists, so it can be safely removed in cleanup
@@ -50,7 +51,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
url = to_enrich.get_url() url = to_enrich.get_url()
collection = random_str(8) collection = self.crawl_id
browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir) browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir)
browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host
@@ -102,10 +103,11 @@ class WaczExtractorEnricher(Enricher, Extractor):
] + cmd ] + cmd
if self.profile: if self.profile:
profile_fn = os.path.join(browsertrix_home_container, "profile.tar.gz") profile_file = f"profile-{self.crawl_id}.tar.gz"
profile_fn = os.path.join(browsertrix_home_container, profile_file)
logger.debug(f"copying {self.profile} to {profile_fn}") logger.debug(f"copying {self.profile} to {profile_fn}")
shutil.copyfile(self.profile, profile_fn) shutil.copyfile(self.profile, profile_fn)
cmd.extend(["--profile", os.path.join("/crawls", "profile.tar.gz")]) cmd.extend(["--profile", os.path.join("/crawls", profile_file)])
else: else:
logger.debug(f"generating WACZ without Docker for {url=}") logger.debug(f"generating WACZ without Docker for {url=}")

View File

@@ -4,6 +4,7 @@ from zipfile import ZipFile
import pytest import pytest
from auto_archiver.core import Metadata, Media from auto_archiver.core import Metadata, Media
from auto_archiver.core.consts import SetupError
@pytest.fixture @pytest.fixture
@@ -22,6 +23,15 @@ def wacz_enricher(setup_module, mock_binary_dependencies):
return wacz return wacz
def test_raises_error_without_docker_installed(setup_module, mocker, caplog):
    """Setting up the WACZ module must raise SetupError when docker is missing."""
    # The docker dependency check passes unconditionally when RUNNING_IN_DOCKER
    # is set, so make sure it is absent for this test. patch.dict snapshots
    # os.environ and restores it when the test finishes.
    mocker.patch.dict(os.environ)
    os.environ.pop("RUNNING_IN_DOCKER", None)

    # pretend that docker isn't installed
    mocker.patch("shutil.which").return_value = None

    with pytest.raises(SetupError):
        setup_module("wacz_extractor_enricher", {})

    assert "requires external dependency 'docker' which is not available/setup" in caplog.text
def test_setup_without_docker(wacz_enricher, mocker): def test_setup_without_docker(wacz_enricher, mocker):
mocker.patch.dict(os.environ, {"RUNNING_IN_DOCKER": "1"}, clear=True) mocker.patch.dict(os.environ, {"RUNNING_IN_DOCKER": "1"}, clear=True)
wacz_enricher.setup() wacz_enricher.setup()

View File

@@ -1,6 +1,7 @@
import pytest import pytest
from auto_archiver.core.module import ModuleFactory, LazyBaseModule from auto_archiver.core.module import ModuleFactory, LazyBaseModule
from auto_archiver.core.base_module import BaseModule from auto_archiver.core.base_module import BaseModule
from auto_archiver.core.consts import SetupError
@pytest.fixture @pytest.fixture
@@ -25,11 +26,9 @@ def test_python_dependency_check(example_module):
# monkey patch the manifest to include a nonexistent dependency # monkey patch the manifest to include a nonexistent dependency
example_module.manifest["dependencies"]["python"] = ["does_not_exist"] example_module.manifest["dependencies"]["python"] = ["does_not_exist"]
with pytest.raises(SystemExit) as load_error: with pytest.raises(SetupError):
example_module.load({}) example_module.load({})
assert load_error.value.code == 1
def test_binary_dependency_check(example_module): def test_binary_dependency_check(example_module):
# example_module requires ffmpeg, which is not installed # example_module requires ffmpeg, which is not installed