Merge branch 'main' into feat/yt-dlp-pots

# Conflicts: # src/auto_archiver/modules/generic_extractor/__manifest__.py # tests/test_modules.py
2026-06-13 05:38:29 +03:00 · 2025-03-25 15:16:31 +00:00
parent 93921e71d4 5c6005d843
commit b4c33318c4
28 changed files with 586 additions and 121 deletions
--- a/src/auto_archiver/modules/generic_extractor/manifest.py
+++ b/src/auto_archiver/modules/generic_extractor/manifest.py
@@ -74,10 +74,6 @@ If you are having issues with the extractor, you can review the version of `yt-d
            "default": "inf",
            "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.",
        },
-        "pot_provider": {
-            "default": "bgutils",
-            "help": "The Proof of origin provider method.",
-        },
        "extractor_args": {
            "default": {},
            "help": "Additional arguments to pass to the yt-dlp extractor. See https://github.com/yt-dlp/yt-dlp/blob/master/README.md#extractor-arguments.",
--- a/src/auto_archiver/modules/generic_extractor/twitter.py
+++ b/src/auto_archiver/modules/generic_extractor/twitter.py
@@ -1,6 +1,5 @@
 import re
 import mimetypes
-import json

 from loguru import logger
 from slugify import slugify
@@ -32,6 +31,9 @@ class Twitter(GenericDropin):
        twid = ie_instance._match_valid_url(url).group("id")
        return ie_instance._extract_status(twid=twid)

+    def keys_to_clean(self, video_data, info_extractor):
+        return ["user", "created_at", "entities", "favorited", "translator_type"]
+
    def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
        result = Metadata()
        try:
@@ -42,9 +44,11 @@ class Twitter(GenericDropin):
            logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
            return False

-        result.set_title(tweet.get("full_text", "")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(
-            timestamp
-        )
+        full_text = tweet.pop("full_text", "")
+        author = tweet["user"].get("name", "")
+        result.set("author", author).set_url(url)
+
+        result.set_title(f"{author} - {full_text}").set_content(full_text).set_timestamp(timestamp)
        if not tweet.get("entities", {}).get("media"):
            logger.debug("No media found, archiving tweet text only")
            result.status = "twitter-ytdl"
--- a/src/auto_archiver/modules/gsheet_feeder_db/manifest.py
+++ b/src/auto_archiver/modules/gsheet_feeder_db/manifest.py
@@ -70,10 +70,14 @@
    - Skips redundant updates for empty or invalid data fields.

    ### Setup
-    - Requires a Google Service Account JSON file for authentication, which should be stored in `secrets/gsheets_service_account.json`.
-    To set up a service account, follow the instructions [here](https://gspread.readthedocs.io/en/latest/oauth2.html).
-    - Define the `sheet` or `sheet_id` configuration to specify the sheet to archive.
-    - Customize the column names in your Google sheet using the `columns` configuration.
-    - The Google Sheet can be used soley as a feeder or as a feeder and database, but note you can't currently feed into the database from an alternate feeder.
+    1. Requires a Google Service Account JSON file for authentication.
+    To set up a service account, follow the instructions in the [how to](https://auto-archiver.readthedocs.io/en/latest/how_to/gsheets_setup.html),
+    or use the script:
+    ```
+    /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/bellingcat/auto-archiver/refs/heads/main/scripts/generate_google_services.sh)"
+    ```
+    2. Create a Google Sheet with the required column(s) and then define the `sheet` or `sheet_id` configuration to specify this sheet.
+    3. Customize the column names in your Google Sheet using the `columns` configuration.
+    4. The Google Sheet can be used solely as a feeder or as a feeder and database, but note you can't currently feed into the database from an alternate feeder.
    """,
 }
--- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
+++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
@@ -29,6 +29,9 @@ class InstagramExtractor(Extractor):
    # TODO: links to stories

    def setup(self) -> None:
+        logger.warning("Instagram Extractor is not actively maintained, and may not work as expected.")
+        logger.warning("Please consider using the Instagram Tbot Extractor or Instagram API Extractor instead.")
+
        self.insta = instaloader.Instaloader(
            download_geotags=True,
            download_comments=True,
--- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
+++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
@@ -19,12 +19,21 @@ class ScreenshotEnricher(Enricher):
    def enrich(self, to_enrich: Metadata) -> None:
        url = to_enrich.get_url()

-        if UrlUtil.is_auth_wall(url):
-            logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
-            return
-
        logger.debug(f"Enriching screenshot for {url=}")
        auth = self.auth_for_site(url)
+
+        # screenshot enricher only supports cookie-type auth (selenium)
+        has_valid_auth = auth and (auth.get("cookies") or auth.get("cookies_jar") or auth.get("cookie"))
+
+        if UrlUtil.is_auth_wall(url) and not has_valid_auth:
+            logger.warning(f"[SKIP] SCREENSHOT since url is behind AUTH WALL and no login details provided: {url=}")
+            if any(auth.get(key) for key in ["username", "password", "api_key", "api_secret"]):
+                logger.warning(
+                    f"Screenshot enricher only supports cookie-type authentication, you have provided {auth.keys()} which are not supported.\
+                               Consider adding 'cookie', 'cookies_file' or 'cookies_from_browser' to your auth for this site."
+                )
+            return
+
        with self.webdriver_factory(
            self.width,
            self.height,
--- a/src/auto_archiver/modules/wacz_extractor_enricher/manifest.py
+++ b/src/auto_archiver/modules/wacz_extractor_enricher/manifest.py
@@ -11,7 +11,7 @@
    "configs": {
        "profile": {
            "default": None,
-            "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles).",
+            "help": "browsertrix-profile (for profile generation see https://crawler.docs.browsertrix.com/user-guide/browser-profiles/).",
        },
        "docker_commands": {"default": None, "help": "if a custom docker invocation is needed"},
        "timeout": {"default": 120, "help": "timeout for WACZ generation in seconds", "type": "int"},
@@ -40,14 +40,27 @@
    Creates .WACZ archives of web pages using the `browsertrix-crawler` tool, with options for media extraction and screenshot saving.
    [Browsertrix-crawler](https://crawler.docs.browsertrix.com/user-guide/) is a headless browser-based crawler that archives web pages in WACZ format.

-    ### Features
+    ## Setup
+
+    **Docker**
+    If you are using the Docker file to run Auto Archiver (recommended), then everything is set up and you can use WACZ out of the box!
+    Otherwise, if you are using a local install of Auto Archiver (e.g. pip or dev install), then you will need to install Docker and run 
+    the docker daemon to be able to run the `browsertrix-crawler` tool.
+
+    **Browsertrix Profiles**
+    A browsertrix profile is a custom browser profile (login information, browser extensions, etc.) that can be used to archive private or dynamic content.
+    You can run the WACZ Enricher without a profile, but for more resilient archiving, it is recommended to create a profile. See the [Browsertrix documentation](https://crawler.docs.browsertrix.com/user-guide/browser-profiles/)
+    for more information.
+
+    **Docker in Docker**
+    If you are running Auto Archiver within a Docker container, you will need to enable Docker in Docker to run the `browsertrix-crawler` tool.
+    This can be done by setting the `WACZ_ENABLE_DOCKER` environment variable to `1`.
+
+    ## Features
    - Archives web pages into .WACZ format using Docker or direct invocation of `browsertrix-crawler`.
    - Supports custom profiles for archiving private or dynamic content.
    - Extracts media (images, videos, audio) and screenshots from the archive, optionally adding them to the enrichment pipeline.
    - Generates metadata from the archived page's content and structure (e.g., titles, text).

-    ### Notes
-    - Requires Docker for running `browsertrix-crawler` .
-    - Configurable via parameters for timeout, media extraction, screenshots, and proxy settings.
    """,
 }
--- a/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py
+++ b/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py
@@ -24,7 +24,8 @@ class WaczExtractorEnricher(Enricher, Extractor):
        self.use_docker = os.environ.get("WACZ_ENABLE_DOCKER") or not os.environ.get("RUNNING_IN_DOCKER")
        self.docker_in_docker = os.environ.get("WACZ_ENABLE_DOCKER") and os.environ.get("RUNNING_IN_DOCKER")

-        self.cwd_dind = f"/crawls/crawls{random_str(8)}"
+        self.crawl_id = random_str(8)
+        self.cwd_dind = f"/crawls/crawls{self.crawl_id}"
        self.browsertrix_home_host = os.environ.get("BROWSERTRIX_HOME_HOST")
        self.browsertrix_home_container = os.environ.get("BROWSERTRIX_HOME_CONTAINER") or self.browsertrix_home_host
        # create crawls folder if not exists, so it can be safely removed in cleanup
@@ -50,7 +51,7 @@ class WaczExtractorEnricher(Enricher, Extractor):

        url = to_enrich.get_url()

-        collection = random_str(8)
+        collection = self.crawl_id
        browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir)
        browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host

@@ -102,10 +103,11 @@ class WaczExtractorEnricher(Enricher, Extractor):
                ] + cmd

            if self.profile:
-                profile_fn = os.path.join(browsertrix_home_container, "profile.tar.gz")
+                profile_file = f"profile-{self.crawl_id}.tar.gz"
+                profile_fn = os.path.join(browsertrix_home_container, profile_file)
                logger.debug(f"copying {self.profile} to {profile_fn}")
                shutil.copyfile(self.profile, profile_fn)
-                cmd.extend(["--profile", os.path.join("/crawls", "profile.tar.gz")])
+                cmd.extend(["--profile", os.path.join("/crawls", profile_file)])

        else:
            logger.debug(f"generating WACZ without Docker for {url=}")