Merge branch 'main' into settings_page

2026-06-11 12:48:28 +03:00 · 2025-03-07 15:17:42 +00:00
parent 158e6be0b1 027985024b
commit 333201acec
21 changed files with 342 additions and 127 deletions
--- a/src/auto_archiver/core/base_module.py
+++ b/src/auto_archiver/core/base_module.py
@@ -50,7 +50,6 @@ class BaseModule(ABC):

    def config_setup(self, config: dict):

-        authentication = config.get('authentication', {})
        # this is important. Each instance is given its own deepcopied config, so modules cannot
        # change values to affect other modules
        config = deepcopy(config)
@@ -106,8 +105,8 @@ class BaseModule(ABC):
            for key in self.authentication.keys():
                if key in site or site in key:
                    logger.debug(f"Could not find exact authentication information for site '{site}'. \
-                                    did find information for '{key}' which is close, is this what you meant? \
-                                    If so, edit your authentication settings to make sure it exactly matches.")
+did find information for '{key}' which is close, is this what you meant? \
+If so, edit your authentication settings to make sure it exactly matches.")

        def get_ytdlp_cookiejar(args):
            import yt_dlp
@@ -117,7 +116,7 @@ class BaseModule(ABC):
            # collections.namedtuple('ParsedOptions', ('parser', 'options', 'urls', 'ydl_opts'))
            ytdlp_opts = getattr(parse_options(args), 'ydl_opts')
            return yt_dlp.YoutubeDL(ytdlp_opts).cookiejar
-        
+
        get_cookiejar_options = None

        # order of priority:
--- a/src/auto_archiver/core/consts.py
+++ b/src/auto_archiver/core/consts.py
@@ -14,7 +14,7 @@ DEFAULT_MANIFEST = {
    'name': '', # the display name of the module
    'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
    'type': [], # the type of the module, can be one or more of MODULE_TYPES
-    'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional softare
+    'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional software
    'description': '', # a description of the module
    'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
    'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -15,6 +15,7 @@ from copy import copy

 from rich_argparse import RichHelpFormatter
 from loguru import logger
+import requests

 from .metadata import Metadata, Media
 from auto_archiver.version import __version__
@@ -335,7 +336,23 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_
        yaml_config = self.load_config(basic_config.config_file)

        return self.setup_complete_parser(basic_config, yaml_config, unused_args)
+    
+    def check_for_updates(self):
+        """
+        Warn the user when the auto-archiver version published on PyPI differs from the running one.
+
+        The check is best-effort: network errors, HTTP errors and unexpected response
+        formats are logged at debug level and ignored, so that an offline or slow
+        connection never prevents the orchestrator from being set up.
+        """
+        try:
+            # an explicit timeout is needed, `requests` waits indefinitely by default
+            response = requests.get("https://pypi.org/pypi/auto-archiver/json", timeout=10)
+            response.raise_for_status()
+            latest_version = response.json()['info']['version']
+        except (requests.RequestException, ValueError, KeyError, TypeError) as e:
+            logger.debug(f"Unable to check for auto-archiver updates: {e}")
+            return
+
+        # any difference between the published and the running version triggers the warning
+        if latest_version == __version__:
+            return
+
+        # the update command depends on how auto-archiver was installed
+        if os.environ.get('RUNNING_IN_DOCKER'):
+            update_cmd = "`docker pull bellingcat/auto-archiver:latest`"
+        else:
+            update_cmd = "`pip install --upgrade auto-archiver`"
+        logger.warning("")
+        logger.warning("********* IMPORTANT: UPDATE AVAILABLE ********")
+        logger.warning(f"A new version of auto-archiver is available (v{latest_version}, you have {__version__})")
+        logger.warning(f"Make sure to update to the latest version using: {update_cmd}")
+        logger.warning("")

+        
    def setup(self, args: list):
        """
        Function to configure all setup of the orchestrator: setup configs and load modules.
@@ -343,6 +360,8 @@ Here's how that would look: \n\nsteps:\n  extractors:\n  - [your_extractor_name_
        This method should only ever be called once
        """

+        self.check_for_updates()
+
        if self.setup_finished:
            logger.warning("The `setup_config()` function should only ever be run once. \
                           If you need to re-run the setup, please re-instantiate a new instance of the orchestrator. \
--- a/src/auto_archiver/modules/console_db/console_db.py
+++ b/src/auto_archiver/modules/console_db/console_db.py
@@ -10,7 +10,7 @@ class ConsoleDb(Database):
    """

    def started(self, item: Metadata) -> None:
-        logger.warning(f"STARTED {item}")
+        logger.info(f"STARTED {item}")

    def failed(self, item: Metadata, reason:str) -> None:
        logger.error(f"FAILED {item}: {reason}")
--- a/src/auto_archiver/modules/generic_extractor/manifest.py
+++ b/src/auto_archiver/modules/generic_extractor/manifest.py
@@ -28,6 +28,13 @@ the broader archiving framework.
 metadata objects. Some dropins are included in this generic_archiver by default, but
 custom dropins can be created to handle additional websites and passed to the archiver
 via the command line using the `--dropins` option (TODO!).
+
+### Auto-Updates
+
+The Generic Extractor will also automatically check for updates to `yt-dlp` (every 5 days by default).
+This can be configured using the `ytdlp_update_interval` setting (or disabled by setting it to -1).
+If you are having issues with the extractor, you can review the version of `yt-dlp` being used with `yt-dlp --version`.
+
 """,
    "configs": {
        "subtitles": {"default": True, "help": "download subtitles if available", "type": "bool"},
@@ -64,5 +71,10 @@ via the command line using the `--dropins` option (TODO!).
            "default": "inf",
            "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.",
        },
+        "ytdlp_update_interval": {
+            "default": 5,
+            "help": "How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run.",
+            "type": "int",
+        },
    },
 }
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -1,7 +1,11 @@
-import datetime, os, yt_dlp, pysubs2
+import datetime, os
 import importlib
+import subprocess
 from typing import Generator, Type
+
+import yt_dlp
 from yt_dlp.extractor.common import InfoExtractor
+import pysubs2

 from loguru import logger

@@ -11,6 +15,44 @@ from auto_archiver.core import Metadata, Media
 class GenericExtractor(Extractor):
    _dropins = {}

+    def setup(self):
+        """
+        Update yt-dlp when the configured update interval has elapsed.
+
+        The time of the next check is stored as an ISO timestamp in a `.ytdlp-update` file,
+        located in the `secrets` folder when that folder exists and at the relative path
+        `.ytdlp-update` otherwise. A negative `ytdlp_update_interval` disables the check.
+        """
+        if self.ytdlp_update_interval < 0:
+            return
+
+        use_secrets = os.path.exists('secrets')
+        path = os.path.join('secrets' if use_secrets else '', '.ytdlp-update')
+        next_update_check = None
+        if os.path.exists(path):
+            try:
+                with open(path, "r") as f:
+                    # strip() tolerates a trailing newline, e.g. after the file was edited by hand
+                    next_update_check = datetime.datetime.fromisoformat(f.read().strip())
+            except (OSError, ValueError) as e:
+                # an empty, corrupt or unreadable marker file must not stop the module from loading,
+                # it is treated as if no check had been done yet and is rewritten below
+                logger.warning(f"Ignoring invalid yt-dlp update marker '{path}': {e}")
+
+        if not next_update_check or next_update_check < datetime.datetime.now():
+            self.update_ytdlp()
+
+            # with an interval of 0 the stored time is 'now', so the next run updates again
+            next_update_check = datetime.datetime.now() + datetime.timedelta(days=self.ytdlp_update_interval)
+            with open(path, "w") as f:
+                f.write(next_update_check.isoformat())
+
+    def update_ytdlp(self):
+        """
+        Upgrade the installed `yt-dlp` package with pip and reload the `yt_dlp` module if a new version was installed.
+
+        Any failure (pip missing, no network, package metadata not found, ...) is logged as an
+        error and swallowed, so a failed update never prevents the extractor from being used.
+        """
+        import sys
+        from importlib.metadata import version as get_version
+
+        logger.info("Checking and updating yt-dlp...")
+        logger.info(f"Tip: change the 'ytdlp_update_interval' setting to control how often yt-dlp is updated. Set to -1 to disable or 0 to enable on every run. Current setting: {self.ytdlp_update_interval}")
+        try:
+            old_version = get_version("yt-dlp")
+            # run pip through the current interpreter: a bare `pip` found on PATH may belong to a
+            # different environment than the one auto-archiver is running in
+            result = subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "yt-dlp"], check=True, capture_output=True)
+
+            if "Successfully installed yt-dlp" in result.stdout.decode():
+                new_version = get_version("yt-dlp")
+                logger.info(f"yt-dlp successfully updated (from {old_version} to {new_version})")
+                importlib.reload(yt_dlp)
+            else:
+                logger.info("yt-dlp already up to date")
+
+        except subprocess.CalledProcessError as e:
+            # include pip's own output, the exception text only carries the exit status
+            logger.error(f"Error updating yt-dlp: {e}\n{(e.stderr or b'').decode(errors='replace')}")
+        except Exception as e:
+            logger.error(f"Error updating yt-dlp: {e}")
+
    def suitable_extractors(self, url: str) -> Generator[str, None, None]:
        """
        Returns a list of valid extractors for the given URL"""
@@ -86,7 +128,7 @@ class GenericExtractor(Extractor):
        # keep both 'title' and 'fulltitle', but prefer 'title', falling back to 'fulltitle' if it doesn't exist
        result.set_title(video_data.pop('title', video_data.pop('fulltitle', "")))
        result.set_url(url)
-
+        if "description" in video_data: result.set_content(video_data["description"])
        # extract comments if enabled
        if self.comments:
            result.set("comments", [{
--- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
+++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
@@ -64,7 +64,7 @@ class GsheetsFeeder(Feeder):
            yield m

    def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata:
-        # TODO: Check folder value not being recognised
+
        m.set_context("gsheet", {"row": row, "worksheet": gw})

        if gw.get_cell_or_default(row, 'folder', "") is None:
--- a/src/auto_archiver/modules/gsheet_feeder/gworksheet.py
+++ b/src/auto_archiver/modules/gsheet_feeder/gworksheet.py
@@ -17,6 +17,7 @@ class GWorksheet:
        'thumbnail': 'thumbnail',
        'timestamp': 'upload timestamp',
        'title': 'upload title',
+        'text': 'text content',
        'screenshot': 'screenshot',
        'hash': 'hash',
        'pdq_hash': 'perceptual hashes',
--- a/src/auto_archiver/modules/instagram_extractor/manifest.py
+++ b/src/auto_archiver/modules/instagram_extractor/manifest.py
@@ -10,25 +10,30 @@
    "requires_setup": True,
    "configs": {
        "username": {"required": True,
-                     "help": "a valid Instagram username"},
+                     "help": "A valid Instagram username."},
        "password": {
            "required": True,
-            "help": "the corresponding Instagram account password",
+            "help": "The corresponding Instagram account password.",
        },
        "download_folder": {
            "default": "instaloader",
-            "help": "name of a folder to temporarily download content to",
+            "help": "Name of a folder to temporarily download content to.",
        },
        "session_file": {
            "default": "secrets/instaloader.session",
-            "help": "path to the instagram session which saves session credentials",
+            "help": "Path to the instagram session file which saves session credentials. If one doesn't exist this gives the path to store a new one.",
        },
        # TODO: fine-grain
        # "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"},
    },
    "description": """
-    Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. This class handles both individual posts
-    and user profiles, downloading as much information as possible, including images, videos, text, stories,
+    Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. 
+    
+      > ⚠️ **Warning**  
+      > This module is not actively maintained due to known issues with blocking.  
+      > Prioritise usage of the [Instagram Tbot Extractor](./instagram_tbot_extractor.md) and [Instagram API Extractor](./instagram_api_extractor.md)
+  
+    This class handles both individual posts and user profiles, downloading as much information as possible, including images, videos, text, stories,
    highlights, and tagged posts. 
    Authentication is required via username/password or a session file.
                    
--- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
+++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
@@ -3,7 +3,7 @@
    highlights, and tagged posts. Authentication is required via username/password or a session file.

 """
-import re, os, shutil, traceback
+import re, os, shutil
 import instaloader
 from loguru import logger

@@ -15,10 +15,9 @@ class InstagramExtractor(Extractor):
    """
    Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
    """
+
    # NB: post regex should be tested before profile
-
    valid_url = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/")
-
    # https://regex101.com/r/MGPquX/1
    post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url))
    # https://regex101.com/r/6Wbsxa/1
@@ -28,19 +27,22 @@ class InstagramExtractor(Extractor):
    def setup(self) -> None:
+        """
+        Create the Instaloader client and authenticate it.
+
+        A saved session file is tried first. If it cannot be loaded, a login with
+        username and password is attempted and the session is saved to `session_file`.
+        A failed login is logged as an error and not raised.
+        """

        self.insta = instaloader.Instaloader(
-            download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder, filename_pattern="{date_utc}_UTC_{target}__{typename}"
+            download_geotags=True,
+            download_comments=True,
+            compress_json=False,
+            dirname_pattern=self.download_folder,
+            filename_pattern="{date_utc}_UTC_{target}__{typename}"
        )
        try:
            self.insta.load_session_from_file(self.username, self.session_file)
        except Exception as e:
-            logger.error(f"Unable to login from session file: {e}\n{traceback.format_exc()}")
            try:
-                self.insta.login(self.username, config.instagram_self.password)
-                # TODO: wait for this issue to be fixed https://github.com/instaloader/instaloader/issues/1758
+                # loguru has no `exc_info` argument, the traceback is attached with `opt(exception=...)`
+                logger.opt(exception=e).debug("Session file failed")
+                logger.info("No valid session file found - Attempting login with user and password.")
+                self.insta.login(self.username, self.password)
                self.insta.save_session_to_file(self.session_file)
-            except Exception as e2:
-                logger.error(f"Unable to finish login (retrying from file): {e2}\n{traceback.format_exc()}")
-
+            except Exception as login_error:
+                logger.error(f"Failed to setup Instagram Extractor with Instaloader. {login_error}")


    def download(self, item: Metadata) -> Metadata:
--- a/src/auto_archiver/modules/screenshot_enricher/manifest.py
+++ b/src/auto_archiver/modules/screenshot_enricher/manifest.py
@@ -9,7 +9,7 @@
            "width": {"default": 1280,
                      "type": "int",
                      "help": "width of the screenshots"},
-            "height": {"default": 720,
+            "height": {"default": 1024,
                        "type": "int",
                       "help": "height of the screenshots"},
            "timeout": {"default": 60,
--- a/src/auto_archiver/utils/misc.py
+++ b/src/auto_archiver/utils/misc.py
@@ -1,9 +1,11 @@
-import os
+import hashlib
 import json
+import os
 import uuid
 from datetime import datetime, timezone
+from dateutil.parser import parse as parse_dt
+
 import requests
-import hashlib
 from loguru import logger


@@ -68,26 +70,34 @@ def calculate_file_hash(filename: str, hash_algo = hashlib.sha256, chunksize: in
            hash.update(buf)
    return hash.hexdigest()

-def get_current_datetime_iso() -> str:
-    return datetime.now(timezone.utc).replace(tzinfo=timezone.utc).isoformat()

+def get_datetime_from_str(dt_str: str, fmt: str | None = None, dayfirst=True) -> datetime | None:
+    """ parse a datetime string with option of passing a specific format

-def get_datetime_from_str(dt_str: str, fmt: str | None = None) -> datetime | None:
-    # parse a datetime string with option of passing a specific format
+    Args:
+        dt_str: the datetime string to parse
+        fmt: the python date format of the datetime string, if None, dateutil.parser.parse is used
+        dayfirst: Use this to signify between date formats which put the day first, vs the month first:
+                    e.g. DD/MM/YYYY vs MM/DD/YYYY
+    """
    try:
-        return datetime.strptime(dt_str, fmt) if fmt else datetime.fromisoformat(dt_str)
+        return datetime.strptime(dt_str, fmt) if fmt else parse_dt(dt_str, dayfirst=dayfirst)
    except ValueError as e:
        logger.error(f"Unable to parse datestring {dt_str}: {e}")
        return None


-def get_timestamp(ts, utc=True, iso=True) -> str | datetime | None:
-    # Consistent parsing of timestamps
-    # If utc=True, the timezone is set to UTC,
-    # if iso=True, the output is an iso string
+def get_timestamp(ts, utc=True, iso=True, dayfirst=True) -> str | datetime | None:
+    """  Consistent parsing of timestamps.
+    Args:
+         If utc=True, the timezone is set to UTC,
+         if iso=True, the output is an iso string
+         Use dayfirst to signify between date formats which put the date vs month first:
+         e.g. DD/MM/YYYY vs MM/DD/YYYY
+     """
    if not ts: return
    try:
-        if isinstance(ts, str): ts = datetime.fromisoformat(ts)
+        if isinstance(ts, str): ts = parse_dt(ts, dayfirst=dayfirst)
        if isinstance(ts, (int, float)): ts = datetime.fromtimestamp(ts)
        if utc: ts = ts.replace(tzinfo=timezone.utc)
        if iso: return ts.isoformat()
@@ -96,5 +106,6 @@ def get_timestamp(ts, utc=True, iso=True) -> str | datetime | None:
        logger.error(f"Unable to parse timestamp {ts}: {e}")
        return None

+
 def get_current_timestamp() -> str:
-    return get_timestamp(datetime.now())
+    return get_timestamp(datetime.now())
--- a/src/auto_archiver/utils/webdriver.py
+++ b/src/auto_archiver/utils/webdriver.py
@@ -1,18 +1,23 @@
 """ This Webdriver class acts as a context manager for the selenium webdriver. """
 from __future__ import annotations
-from selenium import webdriver
-from selenium.common.exceptions import TimeoutException
-from selenium.webdriver.common.proxy import Proxy, ProxyType
-from selenium.webdriver.common.print_page_options import PrintOptions

-from loguru import logger
-from selenium.webdriver.common.by import By
+import os
 import time

 #import domain_for_url
 from urllib.parse import urlparse, urlunparse
 from http.cookiejar import MozillaCookieJar

+from selenium import webdriver
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common import exceptions as selenium_exceptions
+from selenium.webdriver.common.print_page_options import PrintOptions
+from selenium.webdriver.common.by import By
+
+from loguru import logger
+
+
 class CookieSettingDriver(webdriver.Firefox):

    facebook_accept_cookies: bool
@@ -20,6 +25,10 @@ class CookieSettingDriver(webdriver.Firefox):
    cookiejar: MozillaCookieJar

    def __init__(self, cookies, cookiejar, facebook_accept_cookies, *args, **kwargs):
+        if os.environ.get('RUNNING_IN_DOCKER'):
+            # Selenium doesn't support linux-aarch64 driver, we need to set this manually
+            kwargs['service'] = webdriver.FirefoxService(executable_path='/usr/local/bin/geckodriver')
+        
        super(CookieSettingDriver, self).__init__(*args, **kwargs)
        self.cookies = cookies
        self.cookiejar = cookiejar
@@ -64,14 +73,29 @@ class CookieSettingDriver(webdriver.Firefox):
                time.sleep(2)
            except Exception as e:
                logger.warning(f'Failed on fb accept cookies.', e)
+        
+
        # now get the actual URL
        super(CookieSettingDriver, self).get(url)
        if self.facebook_accept_cookies:
            # try and click the 'close' button on the 'login' window to close it
-            close_button = self.find_element(By.XPATH, "//div[@role='dialog']//div[@aria-label='Close']")
-            if close_button:
-                close_button.click()
+            try:
+                xpath = "//div[@role='dialog']//div[@aria-label='Close']"
+                WebDriverWait(self, 5).until(EC.element_to_be_clickable((By.XPATH, xpath))).click()
+            except selenium_exceptions.NoSuchElementException:
+                logger.warning("Unable to find the 'close' button on the facebook login window")
+                pass

+        else:
+
+            # for all other sites, try and use some common button text to reject/accept cookies
+            for text in ["Refuse non-essential cookies", "Decline optional cookies", "Reject additional cookies", "Accept all cookies"]:
+                try:
+                    xpath = f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{text.lower()}')]"
+                    WebDriverWait(self, 5).until(EC.element_to_be_clickable((By.XPATH, xpath))).click()
+                    break
+                except selenium_exceptions.WebDriverException:
+                    pass

    
 class Webdriver:
@@ -90,7 +114,6 @@ class Webdriver:
            setattr(self.print_options, k, v)

    def __enter__(self) -> webdriver:
-
        options = webdriver.FirefoxOptions()
        options.add_argument("--headless")
        options.add_argument(f'--proxy-server={self.http_proxy}')
@@ -105,7 +128,7 @@ class Webdriver:
            self.driver.set_window_size(self.width, self.height)
            self.driver.set_page_load_timeout(self.timeout_seconds)
            self.driver.print_options = self.print_options
-        except TimeoutException as e:
+        except selenium_exceptions.TimeoutException as e:
            logger.error(f"failed to get new webdriver, possibly due to insufficient system resources or timeout settings: {e}")

        return self.driver