Merge branch 'main' into settings_page

2026-06-11 20:58:29 +03:00 · 2025-03-07 15:17:42 +00:00
parent 158e6be0b1 027985024b
commit 333201acec
21 changed files with 342 additions and 127 deletions
--- a/src/auto_archiver/modules/console_db/console_db.py
+++ b/src/auto_archiver/modules/console_db/console_db.py
@@ -10,7 +10,7 @@ class ConsoleDb(Database):
    """

    def started(self, item: Metadata) -> None:
-        logger.warning(f"STARTED {item}")
+        logger.info(f"STARTED {item}")

    def failed(self, item: Metadata, reason:str) -> None:
        logger.error(f"FAILED {item}: {reason}")
--- a/src/auto_archiver/modules/generic_extractor/manifest.py
+++ b/src/auto_archiver/modules/generic_extractor/manifest.py
@@ -28,6 +28,13 @@ the broader archiving framework.
 metadata objects. Some dropins are included in this generic_archiver by default, but
 custom dropins can be created to handle additional websites and passed to the archiver
 via the command line using the `--dropins` option (TODO!).
+
+### Auto-Updates
+
+The Generic Extractor will also automatically check for updates to `yt-dlp` (every 5 days by default).
+This can be configured using the `ytdlp_update_interval` setting (or disabled by setting it to -1).
+If you are having issues with the extractor, you can review the version of `yt-dlp` being used with `yt-dlp --version`.
+
 """,
    "configs": {
        "subtitles": {"default": True, "help": "download subtitles if available", "type": "bool"},
@@ -64,5 +71,10 @@ via the command line using the `--dropins` option (TODO!).
            "default": "inf",
            "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.",
        },
+        "ytdlp_update_interval": {
+            "default": 5,
+            "help": "How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run.",
+            "type": "int",
+        },
    },
 }
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -1,7 +1,11 @@
-import datetime, os, yt_dlp, pysubs2
+import datetime, os
 import importlib
+import subprocess
 from typing import Generator, Type
+
+import yt_dlp
 from yt_dlp.extractor.common import InfoExtractor
+import pysubs2

 from loguru import logger

@@ -11,6 +15,44 @@ from auto_archiver.core import Metadata, Media
 class GenericExtractor(Extractor):
    _dropins = {}

+    def setup(self):
+        # check for file .ytdlp-update in the secrets folder
+        if self.ytdlp_update_interval < 0:
+            return
+        
+        use_secrets = os.path.exists('secrets')
+        path = os.path.join('secrets' if use_secrets else '', '.ytdlp-update')
+        next_update_check = None
+        if os.path.exists(path):
+            with open(path, "r") as f:
+                next_update_check = datetime.datetime.fromisoformat(f.read())
+        
+        if not next_update_check or next_update_check < datetime.datetime.now():
+            self.update_ytdlp()
+
+            next_update_check = datetime.datetime.now() + datetime.timedelta(days=self.ytdlp_update_interval)
+            with open(path, "w") as f:
+                f.write(next_update_check.isoformat())
+
+    def update_ytdlp(self):
+        logger.info("Checking and updating yt-dlp...")
+        logger.info(f"Tip: change the 'ytdlp_update_interval' setting to control how often yt-dlp is updated. Set to -1 to disable or 0 to enable on every run. Current setting: {self.ytdlp_update_interval}")
+        from importlib.metadata import version as get_version
+        old_version = get_version("yt-dlp")
+        try:
+            # try and update with pip (this works inside poetry environment and in a normal virtualenv)
+            result = subprocess.run(["pip", "install", "--upgrade", "yt-dlp"], check=True, capture_output=True)
+
+            if "Successfully installed yt-dlp" in result.stdout.decode():
+                new_version = importlib.metadata.version("yt-dlp")
+                logger.info(f"yt-dlp successfully (from {old_version} to {new_version})")
+                importlib.reload(yt_dlp)
+            else:
+                logger.info("yt-dlp already up to date")
+
+        except Exception as e:
+            logger.error(f"Error updating yt-dlp: {e}")
+
    def suitable_extractors(self, url: str) -> Generator[str, None, None]:
        """
        Returns a list of valid extractors for the given URL"""
@@ -86,7 +128,7 @@ class GenericExtractor(Extractor):
        # keep both 'title' and 'fulltitle', but prefer 'title', falling back to 'fulltitle' if it doesn't exist
        result.set_title(video_data.pop('title', video_data.pop('fulltitle', "")))
        result.set_url(url)
-
+        if "description" in video_data: result.set_content(video_data["description"])
        # extract comments if enabled
        if self.comments:
            result.set("comments", [{
--- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
+++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
@@ -64,7 +64,7 @@ class GsheetsFeeder(Feeder):
            yield m

    def _set_context(self, m: Metadata, gw: GWorksheet, row: int) -> Metadata:
-        # TODO: Check folder value not being recognised
+
        m.set_context("gsheet", {"row": row, "worksheet": gw})

        if gw.get_cell_or_default(row, 'folder', "") is None:
--- a/src/auto_archiver/modules/gsheet_feeder/gworksheet.py
+++ b/src/auto_archiver/modules/gsheet_feeder/gworksheet.py
@@ -17,6 +17,7 @@ class GWorksheet:
        'thumbnail': 'thumbnail',
        'timestamp': 'upload timestamp',
        'title': 'upload title',
+        'text': 'text content',
        'screenshot': 'screenshot',
        'hash': 'hash',
        'pdq_hash': 'perceptual hashes',
--- a/src/auto_archiver/modules/instagram_extractor/manifest.py
+++ b/src/auto_archiver/modules/instagram_extractor/manifest.py
@@ -10,25 +10,30 @@
    "requires_setup": True,
    "configs": {
        "username": {"required": True,
-                     "help": "a valid Instagram username"},
+                     "help": "A valid Instagram username."},
        "password": {
            "required": True,
-            "help": "the corresponding Instagram account password",
+            "help": "The corresponding Instagram account password.",
        },
        "download_folder": {
            "default": "instaloader",
-            "help": "name of a folder to temporarily download content to",
+            "help": "Name of a folder to temporarily download content to.",
        },
        "session_file": {
            "default": "secrets/instaloader.session",
-            "help": "path to the instagram session which saves session credentials",
+            "help": "Path to the instagram session file which saves session credentials. If one doesn't exist this gives the path to store a new one.",
        },
        # TODO: fine-grain
        # "download_stories": {"default": True, "help": "if the link is to a user profile: whether to get stories information"},
    },
    "description": """
-    Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. This class handles both individual posts
-    and user profiles, downloading as much information as possible, including images, videos, text, stories,
+    Uses the [Instaloader library](https://instaloader.github.io/as-module.html) to download content from Instagram. 
+    
+      > ⚠️ **Warning**  
+      > This module is not actively maintained due to known issues with blocking.  
+      > Prioritise usage of the [Instagram Tbot Extractor](./instagram_tbot_extractor.md) and [Instagram API Extractor](./instagram_api_extractor.md)
+  
+    This class handles both individual posts and user profiles, downloading as much information as possible, including images, videos, text, stories,
    highlights, and tagged posts. 
    Authentication is required via username/password or a session file.
                    
--- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
+++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
@@ -3,7 +3,7 @@
    highlights, and tagged posts. Authentication is required via username/password or a session file.

 """
-import re, os, shutil, traceback
+import re, os, shutil
 import instaloader
 from loguru import logger

@@ -15,10 +15,9 @@ class InstagramExtractor(Extractor):
    """
    Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
    """
+
    # NB: post regex should be tested before profile
-
    valid_url = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/")
-
    # https://regex101.com/r/MGPquX/1
    post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url))
    # https://regex101.com/r/6Wbsxa/1
@@ -28,19 +27,22 @@ class InstagramExtractor(Extractor):
    def setup(self) -> None:

        self.insta = instaloader.Instaloader(
-            download_geotags=True, download_comments=True, compress_json=False, dirname_pattern=self.download_folder, filename_pattern="{date_utc}_UTC_{target}__{typename}"
+            download_geotags=True,
+            download_comments=True,
+            compress_json=False,
+            dirname_pattern=self.download_folder,
+            filename_pattern="{date_utc}_UTC_{target}__{typename}"
        )
        try:
            self.insta.load_session_from_file(self.username, self.session_file)
        except Exception as e:
-            logger.error(f"Unable to login from session file: {e}\n{traceback.format_exc()}")
            try:
-                self.insta.login(self.username, config.instagram_self.password)
-                # TODO: wait for this issue to be fixed https://github.com/instaloader/instaloader/issues/1758
+                logger.debug(f"Session file failed", exc_info=True)
+                logger.info("No valid session file found - Attempting login with use and password.")
+                self.insta.login(self.username, self.password)
                self.insta.save_session_to_file(self.session_file)
-            except Exception as e2:
-                logger.error(f"Unable to finish login (retrying from file): {e2}\n{traceback.format_exc()}")
-
+            except Exception as e:
+                logger.error(f"Failed to setup Instagram Extractor with Instagrapi. {e}")


    def download(self, item: Metadata) -> Metadata:
--- a/src/auto_archiver/modules/screenshot_enricher/manifest.py
+++ b/src/auto_archiver/modules/screenshot_enricher/manifest.py
@@ -9,7 +9,7 @@
            "width": {"default": 1280,
                      "type": "int",
                      "help": "width of the screenshots"},
-            "height": {"default": 720,
+            "height": {"default": 1024,
                        "type": "int",
                       "help": "height of the screenshots"},
            "timeout": {"default": 60,