Merge branch 'main' into feat/yt-dlp-pots

# Conflicts: # src/auto_archiver/modules/generic_extractor/__manifest__.py # tests/test_modules.py
2026-06-12 21:28:29 +03:00 · 2025-03-25 15:16:31 +00:00
parent 93921e71d4 5c6005d843
commit b4c33318c4
28 changed files with 586 additions and 121 deletions
--- a/src/auto_archiver/core/base_module.py
+++ b/src/auto_archiver/core/base_module.py
@@ -71,7 +71,16 @@ class BaseModule(ABC):
        :param site: the domain of the site to get authentication information for
        :param extract_cookies: whether or not to extract cookies from the given browser/file and return the cookie jar (disabling can speed up processing if you don't actually need the cookies jar).

-        :returns: authdict dict of login information for the given site
+        :returns: authdict dict -> {
+            "username": str,
+            "password": str,
+            "api_key": str,
+            "api_secret": str,
+            "cookie": str,
+            "cookies_file": str,
+            "cookies_from_browser": str,
+            "cookies_jar": CookieJar
+        }

        **Global options:**\n
        * cookies_from_browser: str - the name of the browser to extract cookies from (e.g. 'chrome', 'firefox' - uses ytdlp under the hood to extract\n
@@ -85,6 +94,7 @@ class BaseModule(ABC):
        * cookie: str - a cookie string to use for login (specific to this site)\n
        * cookies_file: str - the path to a cookies file to use for login (specific to this site)\n
        * cookies_from_browser: str - the name of the browser to extract cookies from (specitic for this site)\n
+
        """
        # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
        # for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?
--- a/src/auto_archiver/core/module.py
+++ b/src/auto_archiver/core/module.py
@@ -5,6 +5,7 @@ by handling user configuration, validating the steps properties, and implementin
 """

 from __future__ import annotations
+import subprocess

 from dataclasses import dataclass
 from typing import List, TYPE_CHECKING, Type
@@ -17,7 +18,7 @@ import os
 from os.path import join
 from loguru import logger
 import auto_archiver
-from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE
+from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE, SetupError

 if TYPE_CHECKING:
    from .base_module import BaseModule
@@ -85,7 +86,11 @@ class ModuleFactory:
        if not available:
            message = f"Module '{module_name}' not found. Are you sure it's installed/exists?"
            if "archiver" in module_name:
-                message += f" Did you mean {module_name.replace('archiver', 'extractor')}?"
+                message += f" Did you mean '{module_name.replace('archiver', 'extractor')}'?"
+            elif "gsheet" in module_name:
+                message += " Did you mean 'gsheet_feeder_db'?"
+            elif "atlos" in module_name:
+                message += " Did you mean 'atlos_feeder_db_storage'?"
            raise IndexError(message)
        return available[0]

@@ -216,9 +221,9 @@ class LazyBaseModule:
                if not check(dep):
                    logger.error(
                        f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \
-                                 Have you installed the required dependencies for the '{self.name}' module? See the README for more information."
+                                 Have you installed the required dependencies for the '{self.name}' module? See the documentation for more information."
                    )
-                    exit(1)
+                    raise SetupError()

        def check_python_dep(dep):
            # first check if it's a module:
@@ -237,8 +242,22 @@ class LazyBaseModule:

            return find_spec(dep)

+        def check_bin_dep(dep):
+            dep_exists = shutil.which(dep)
+
+            if dep == "docker":
+                if os.environ.get("RUNNING_IN_DOCKER"):
+                    # this is only for the WACZ enricher, which requires docker
+                    # if we're already running in docker then we don't need docker
+                    return True
+
+                # check if docker daemon is running
+                return dep_exists and subprocess.run(["docker", "ps", "-q"]).returncode == 0
+
+            return dep_exists
+
        check_deps(self.dependencies.get("python", []), check_python_dep)
-        check_deps(self.dependencies.get("bin", []), lambda dep: shutil.which(dep))
+        check_deps(self.dependencies.get("bin", []), check_bin_dep)

        logger.debug(f"Loading module '{self.display_name}'...")

--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -373,9 +373,17 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_
                if module in invalid_modules:
                    continue

+                # check to make sure that we're trying to load it as the correct type - i.e. make sure the user hasn't put it under the wrong 'step'
+                lazy_module: LazyBaseModule = self.module_factory.get_module_lazy(module)
+                if module_type not in lazy_module.type:
+                    types = ",".join(f"'{t}'" for t in lazy_module.type)
+                    raise SetupError(
+                        f"Configuration Error: Module '{module}' is not a {module_type}, but has the types: {types}. Please check you set this module up under the right step in your orchestration file."
+                    )
+
                loaded_module = None
                try:
-                    loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
+                    loaded_module: BaseModule = lazy_module.load(self.config)
                except (KeyboardInterrupt, Exception) as e:
                    if not isinstance(e, KeyboardInterrupt) and not isinstance(e, SetupError):
                        logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
--- a/src/auto_archiver/modules/generic_extractor/manifest.py
+++ b/src/auto_archiver/modules/generic_extractor/manifest.py
@@ -74,10 +74,6 @@ If you are having issues with the extractor, you can review the version of `yt-d
            "default": "inf",
            "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.",
        },
-        "pot_provider": {
-            "default": "bgutils",
-            "help": "The Proof of origin provider method.",
-        },
        "extractor_args": {
            "default": {},
            "help": "Additional arguments to pass to the yt-dlp extractor. See https://github.com/yt-dlp/yt-dlp/blob/master/README.md#extractor-arguments.",
--- a/src/auto_archiver/modules/generic_extractor/twitter.py
+++ b/src/auto_archiver/modules/generic_extractor/twitter.py
@@ -1,6 +1,5 @@
 import re
 import mimetypes
-import json

 from loguru import logger
 from slugify import slugify
@@ -32,6 +31,9 @@ class Twitter(GenericDropin):
        twid = ie_instance._match_valid_url(url).group("id")
        return ie_instance._extract_status(twid=twid)

+    def keys_to_clean(self, video_data, info_extractor):
+        return ["user", "created_at", "entities", "favorited", "translator_type"]
+
    def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
        result = Metadata()
        try:
@@ -42,9 +44,11 @@ class Twitter(GenericDropin):
            logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
            return False

-        result.set_title(tweet.get("full_text", "")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(
-            timestamp
-        )
+        full_text = tweet.pop("full_text", "")
+        author = tweet["user"].get("name", "")
+        result.set("author", author).set_url(url)
+
+        result.set_title(f"{author} - {full_text}").set_content(full_text).set_timestamp(timestamp)
        if not tweet.get("entities", {}).get("media"):
            logger.debug("No media found, archiving tweet text only")
            result.status = "twitter-ytdl"
--- a/src/auto_archiver/modules/gsheet_feeder_db/manifest.py
+++ b/src/auto_archiver/modules/gsheet_feeder_db/manifest.py
@@ -70,10 +70,14 @@
    - Skips redundant updates for empty or invalid data fields.

    ### Setup
-    - Requires a Google Service Account JSON file for authentication, which should be stored in `secrets/gsheets_service_account.json`.
-    To set up a service account, follow the instructions [here](https://gspread.readthedocs.io/en/latest/oauth2.html).
-    - Define the `sheet` or `sheet_id` configuration to specify the sheet to archive.
-    - Customize the column names in your Google sheet using the `columns` configuration.
-    - The Google Sheet can be used soley as a feeder or as a feeder and database, but note you can't currently feed into the database from an alternate feeder.
+    1. Requires a Google Service Account JSON file for authentication.
+    To set up a service account, follow the instructions in the [how to](https://auto-archiver.readthedocs.io/en/latest/how_to/gsheets_setup.html),
+    or use the script:
+    ```
+    /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/bellingcat/auto-archiver/refs/heads/main/scripts/generate_google_services.sh)"
+    ```
+    2. Create a Google sheet with the required column(s) and then define the `sheet` or `sheet_id` configuration to specify this sheet.
+    3. Customize the column names in your Google sheet using the `columns` configuration.
+    4. The Google Sheet can be used solely as a feeder or as a feeder and database, but note you can't currently feed into the database from an alternate feeder.
    """,
 }
--- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
+++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
@@ -29,6 +29,9 @@ class InstagramExtractor(Extractor):
    # TODO: links to stories

    def setup(self) -> None:
+        logger.warning("Instagram Extractor is not actively maintained, and may not work as expected.")
+        logger.warning("Please consider using the Instagram Tbot Extractor or Instagram API Extractor instead.")
+
        self.insta = instaloader.Instaloader(
            download_geotags=True,
            download_comments=True,
--- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
+++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
@@ -19,12 +19,21 @@ class ScreenshotEnricher(Enricher):
    def enrich(self, to_enrich: Metadata) -> None:
        url = to_enrich.get_url()

-        if UrlUtil.is_auth_wall(url):
-            logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
-            return
-
        logger.debug(f"Enriching screenshot for {url=}")
        auth = self.auth_for_site(url)
+
+        # screenshot enricher only supports cookie-type auth (selenium)
+        has_valid_auth = auth and (auth.get("cookies") or auth.get("cookies_jar") or auth.get("cookie"))
+
+        if UrlUtil.is_auth_wall(url) and not has_valid_auth:
+            logger.warning(f"[SKIP] SCREENSHOT since url is behind AUTH WALL and no login details provided: {url=}")
+            if any(auth.get(key) for key in ["username", "password", "api_key", "api_secret"]):
+                logger.warning(
+                    f"Screenshot enricher only supports cookie-type authentication, you have provided {auth.keys()} which are not supported.\
+                               Consider adding 'cookie', 'cookies_file' or 'cookies_from_browser' to your auth for this site."
+                )
+            return
+
        with self.webdriver_factory(
            self.width,
            self.height,
--- a/src/auto_archiver/modules/wacz_extractor_enricher/manifest.py
+++ b/src/auto_archiver/modules/wacz_extractor_enricher/manifest.py
@@ -11,7 +11,7 @@
    "configs": {
        "profile": {
            "default": None,
-            "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles).",
+            "help": "browsertrix-profile (for profile generation see https://crawler.docs.browsertrix.com/user-guide/browser-profiles/).",
        },
        "docker_commands": {"default": None, "help": "if a custom docker invocation is needed"},
        "timeout": {"default": 120, "help": "timeout for WACZ generation in seconds", "type": "int"},
@@ -40,14 +40,27 @@
    Creates .WACZ archives of web pages using the `browsertrix-crawler` tool, with options for media extraction and screenshot saving.
    [Browsertrix-crawler](https://crawler.docs.browsertrix.com/user-guide/) is a headless browser-based crawler that archives web pages in WACZ format.

-    ### Features
+    ## Setup
+
+    **Docker**
+    If you are using the Docker file to run Auto Archiver (recommended), then everything is set up and you can use WACZ out of the box!
+    Otherwise, if you are using a local install of Auto Archiver (e.g. pip or dev install), then you will need to install Docker and run 
+    the docker daemon to be able to run the `browsertrix-crawler` tool.
+
+    **Browsertrix Profiles**
+    A browsertrix profile is a custom browser profile (login information, browser extensions, etc.) that can be used to archive private or dynamic content.
+    You can run the WACZ Enricher without a profile, but for more resilient archiving, it is recommended to create a profile. See the [Browsertrix documentation](https://crawler.docs.browsertrix.com/user-guide/browser-profiles/)
+    for more information.
+
+    **Docker in Docker**
+    If you are running Auto Archiver within a Docker container and want `browsertrix-crawler` to run in its own Docker container (instead of being invoked directly), you can enable Docker in Docker.
+    This is done by setting the `WACZ_ENABLE_DOCKER` environment variable to `1`.
+
+    ## Features
    - Archives web pages into .WACZ format using Docker or direct invocation of `browsertrix-crawler`.
    - Supports custom profiles for archiving private or dynamic content.
    - Extracts media (images, videos, audio) and screenshots from the archive, optionally adding them to the enrichment pipeline.
    - Generates metadata from the archived page's content and structure (e.g., titles, text).

-    ### Notes
-    - Requires Docker for running `browsertrix-crawler` .
-    - Configurable via parameters for timeout, media extraction, screenshots, and proxy settings.
    """,
 }
--- a/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py
+++ b/src/auto_archiver/modules/wacz_extractor_enricher/wacz_extractor_enricher.py
@@ -24,7 +24,8 @@ class WaczExtractorEnricher(Enricher, Extractor):
        self.use_docker = os.environ.get("WACZ_ENABLE_DOCKER") or not os.environ.get("RUNNING_IN_DOCKER")
        self.docker_in_docker = os.environ.get("WACZ_ENABLE_DOCKER") and os.environ.get("RUNNING_IN_DOCKER")

-        self.cwd_dind = f"/crawls/crawls{random_str(8)}"
+        self.crawl_id = random_str(8)
+        self.cwd_dind = f"/crawls/crawls{self.crawl_id}"
        self.browsertrix_home_host = os.environ.get("BROWSERTRIX_HOME_HOST")
        self.browsertrix_home_container = os.environ.get("BROWSERTRIX_HOME_CONTAINER") or self.browsertrix_home_host
        # create crawls folder if not exists, so it can be safely removed in cleanup
@@ -50,7 +51,7 @@ class WaczExtractorEnricher(Enricher, Extractor):

        url = to_enrich.get_url()

-        collection = random_str(8)
+        collection = self.crawl_id
        browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir)
        browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host

@@ -102,10 +103,11 @@ class WaczExtractorEnricher(Enricher, Extractor):
                ] + cmd

            if self.profile:
-                profile_fn = os.path.join(browsertrix_home_container, "profile.tar.gz")
+                profile_file = f"profile-{self.crawl_id}.tar.gz"
+                profile_fn = os.path.join(browsertrix_home_container, profile_file)
                logger.debug(f"copying {self.profile} to {profile_fn}")
                shutil.copyfile(self.profile, profile_fn)
-                cmd.extend(["--profile", os.path.join("/crawls", "profile.tar.gz")])
+                cmd.extend(["--profile", os.path.join("/crawls", profile_file)])

        else:
            logger.debug(f"generating WACZ without Docker for {url=}")
--- a/src/auto_archiver/utils/url.py
+++ b/src/auto_archiver/utils/url.py
@@ -4,8 +4,8 @@ from ipaddress import ip_address


 AUTHWALL_URLS = [
-    re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)"),  # telegram private channels
-    re.compile(r"https:\/\/www\.instagram\.com"),  # instagram
+    re.compile(r"https?:\/\/t\.me(\/c)\/(.+)\/(\d+)"),  # telegram private channels
+    re.compile(r"https?:\/\/(www\.)?instagram\.com"),  # instagram
 ]


@@ -81,56 +81,43 @@ def is_relevant_url(url: str) -> bool:
    """
    clean_url = remove_get_parameters(url)

-    # favicons
-    if "favicon" in url:
-        return False
-    # ifnore icons
-    if clean_url.endswith(".ico"):
-        return False
-    # ignore SVGs
-    if remove_get_parameters(url).endswith(".svg"):
-        return False
+    IRRELEVANT_URLS = [
+        # favicons
+        ("favicon",),
+        # twitter profile pictures
+        ("twimg.com/profile_images",),
+        ("twimg.com", "default_profile_images"),
+        # instagram profile pictures
+        ("https://scontent.cdninstagram.com/", "150x150"),
+        # instagram recurring images
+        ("https://static.cdninstagram.com/rsrc.php/",),
+        # telegram
+        ("https://telegram.org/img/emoji/",),
+        # youtube
+        ("https://www.youtube.com/s/gaming/emoji/",),
+        ("https://yt3.ggpht.com", "default-user="),
+        ("https://www.youtube.com/s/search/audio/",),
+        # ok
+        ("https://ok.ru/res/i/",),
+        ("https://vk.com/emoji/",),
+        ("vk.com/images/",),
+        ("vk.com/images/reaction/",),
+        # wikipedia
+        ("wikipedia.org/static",),
+    ]

-    # twitter profile pictures
-    if "twimg.com/profile_images" in url:
-        return False
-    if "twimg.com" in url and "/default_profile_images" in url:
-        return False
+    IRRELEVANT_ENDS_WITH = [
+        ".svg",  # ignore SVGs
+        ".ico",  # ignore icons
+    ]

-    # instagram profile pictures
-    if "https://scontent.cdninstagram.com/" in url and "150x150" in url:
-        return False
-    # instagram recurring images
-    if "https://static.cdninstagram.com/rsrc.php/" in url:
-        return False
+    for end in IRRELEVANT_ENDS_WITH:
+        if clean_url.endswith(end):
+            return False

-    # telegram
-    if "https://telegram.org/img/emoji/" in url:
-        return False
-
-    # youtube
-    if "https://www.youtube.com/s/gaming/emoji/" in url:
-        return False
-    if "https://yt3.ggpht.com" in url and "default-user=" in url:
-        return False
-    if "https://www.youtube.com/s/search/audio/" in url:
-        return False
-
-    # ok
-    if " https://ok.ru/res/i/" in url:
-        return False
-
-    # vk
-    if "https://vk.com/emoji/" in url:
-        return False
-    if "vk.com/images/" in url:
-        return False
-    if "vk.com/images/reaction/" in url:
-        return False
-
-    # wikipedia
-    if "wikipedia.org/static" in url:
-        return False
+    for parts in IRRELEVANT_URLS:
+        if all(part in clean_url for part in parts):
+            return False

    return True

--- a/src/auto_archiver/utils/webdriver.py
+++ b/src/auto_archiver/utils/webdriver.py
@@ -22,35 +22,35 @@ from loguru import logger

 class CookieSettingDriver(webdriver.Firefox):
    facebook_accept_cookies: bool
-    cookies: str
-    cookiejar: MozillaCookieJar
+    cookie: str
+    cookie_jar: MozillaCookieJar

-    def __init__(self, cookies, cookiejar, facebook_accept_cookies, *args, **kwargs):
+    def __init__(self, cookie, cookie_jar, facebook_accept_cookies, *args, **kwargs):
        if os.environ.get("RUNNING_IN_DOCKER"):
            # Selenium doesn't support linux-aarch64 driver, we need to set this manually
            kwargs["service"] = webdriver.FirefoxService(executable_path="/usr/local/bin/geckodriver")

        super(CookieSettingDriver, self).__init__(*args, **kwargs)
-        self.cookies = cookies
-        self.cookiejar = cookiejar
+        self.cookie = cookie
+        self.cookie_jar = cookie_jar
        self.facebook_accept_cookies = facebook_accept_cookies

    def get(self, url: str):
-        if self.cookies or self.cookiejar:
+        if self.cookie_jar or self.cookie:
            # set up the driver to make it not 'cookie averse' (needs a context/URL)
            # get the 'robots.txt' file which should be quick and easy
            robots_url = urlunparse(urlparse(url)._replace(path="/robots.txt", query="", fragment=""))
            super(CookieSettingDriver, self).get(robots_url)

-            if self.cookies:
+            if self.cookie:
                # an explicit cookie is set for this site, use that first
                for cookie in self.cookies.split(";"):
                    for name, value in cookie.split("="):
                        self.driver.add_cookie({"name": name, "value": value})
-            elif self.cookiejar:
-                domain = urlparse(url).netloc
+            elif self.cookie_jar:
+                domain = urlparse(url).netloc.removeprefix("www.")
                regex = re.compile(f"(www)?.?{domain}$")
-                for cookie in self.cookiejar:
+                for cookie in self.cookie_jar:
                    if regex.match(cookie.domain):
                        try:
                            self.add_cookie(
@@ -145,8 +145,8 @@ class Webdriver:

        try:
            self.driver = CookieSettingDriver(
-                cookies=self.auth.get("cookies"),
-                cookiejar=self.auth.get("cookies_jar"),
+                cookie=self.auth.get("cookie"),
+                cookie_jar=self.auth.get("cookies_jar"),
                facebook_accept_cookies=self.facebook_accept_cookies,
                options=options,
            )