diff --git a/src/auto_archiver/modules/wacz_extractor_enricher/__manifest__.py b/src/auto_archiver/modules/wacz_extractor_enricher/__manifest__.py index c6454b0..d06a6b3 100644 --- a/src/auto_archiver/modules/wacz_extractor_enricher/__manifest__.py +++ b/src/auto_archiver/modules/wacz_extractor_enricher/__manifest__.py @@ -40,27 +40,31 @@ Creates .WACZ archives of web pages using the `browsertrix-crawler` tool, with options for media extraction and screenshot saving. [Browsertrix-crawler](https://crawler.docs.browsertrix.com/user-guide/) is a headless browser-based crawler that archives web pages in WACZ format. - ## Setup - - **Docker** - If you are using the Docker file to run Auto Archiver (recommended), then everything is set up and you can use WACZ out of the box! - Otherwise, if you are using a local install of Auto Archiver (e.g. pip or dev install), then you will need to install Docker and run - the docker daemon to be able to run the `browsertrix-crawler` tool. - - **Browsertrix Profiles** - A browsertrix profile is a custom browser profile (login information, browser extensions, etc.) that can be used to archive private or dynamic content. - You can run the WACZ Enricher without a profile, but for more resilient archiving, it is recommended to create a profile. See the [Browsertrix documentation](https://crawler.docs.browsertrix.com/user-guide/browser-profiles/) - for more information. - - ** Docker in Docker ** - If you are running Auto Archiver within a Docker container, you will need to enable Docker in Docker to run the `browsertrix-crawler` tool. - This can be done by setting the `WACZ_ENABLE_DOCKER` environment variable to `1`. - ## Features - Archives web pages into .WACZ format using Docker or direct invocation of `browsertrix-crawler`. - Supports custom profiles for archiving private or dynamic content. - Extracts media (images, videos, audio) and screenshots from the archive, optionally adding them to the enrichment pipeline. 
- Generates metadata from the archived page's content and structure (e.g., titles, text). + ## Setup + + ### Using Docker + If you are using the Auto Archiver [Docker image](https://auto-archiver.readthedocs.io/en/latest/installation/installation.html#installing-with-docker) + to run Auto Archiver (recommended), then everything is set up and you can use WACZ out of the box! + Otherwise, if you are using a local install of Auto Archiver (e.g. pip or dev install), then you will need to install Docker and run + the Docker daemon to be able to run the `browsertrix-crawler` tool. + + ### Browsertrix Profiles + A Browsertrix profile is a custom browser profile (login information, browser extensions, etc.) that can be used to archive private or dynamic content. + You can run the WACZ Enricher without a profile, but for more resilient archiving, it is recommended to create a profile. + See the [Browsertrix documentation](https://crawler.docs.browsertrix.com/user-guide/browser-profiles/) for more information on how to use the `create-login-profile` tool. + + + + ### Docker in Docker + If you are running Auto Archiver within a Docker container, you will need to enable Docker in Docker to run the `browsertrix-crawler` tool. + This can be done by setting the `WACZ_ENABLE_DOCKER` environment variable to `1`. 
+ + """, } diff --git a/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py b/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py index b66f03c..447d78f 100644 --- a/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py +++ b/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py @@ -86,6 +86,12 @@ class WaczExtractorEnricher(Enricher, Extractor): if self.docker_in_docker: cmd.extend(["--cwd", self.cwd_dind]) + if self.auth_for_site(url): + # there's an auth for this site, but browsertrix only supports username/password auth + logger.warning( + "The WACZ enricher / Browsertrix does not support using the 'authentication' information for logging in. You should consider creating a Browser Profile for WACZ archiving. More information: https://auto-archiver.readthedocs.io/en/latest/modules/autogen/extractor/wacz_extractor_enricher.html#browsertrix-profiles" + ) + # call docker if explicitly enabled or we are running on the host (not in docker) if self.use_docker: logger.debug(f"generating WACZ in Docker for {url=}")