Further tidyups + refactoring for new structure

* Add implementation tests for the orchestrator, plus logging tests
* Standardise the method/class variables that extractors use to check whether a URL is suitable for them
* Fix bugs with removing the default loguru logger (allows further customisation)
* Fix bug when loading required fields from file
2026-06-13 05:38:29 +03:00 · 2025-01-30 13:21:10 +01:00
parent cddae65a90
commit b7d9145f6c
22 changed files with 292 additions and 51 deletions
--- a/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py
+++ b/src/auto_archiver/modules/instagram_api_extractor/instagram_api_extractor.py
@@ -28,7 +28,7 @@ class InstagramAPIExtractor(Extractor):
    # TODO: improvement collect aggregates of locations[0].location and mentions for all posts
    """

-    global_pattern = re.compile(
+    valid_url = re.compile(
        r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?"
    )

@@ -44,7 +44,7 @@ class InstagramAPIExtractor(Extractor):
        url.replace("instagr.com", "instagram.com").replace(
            "instagr.am", "instagram.com"
        )
-        insta_matches = self.global_pattern.findall(url)
+        insta_matches = self.valid_url.findall(url)
        logger.info(f"{insta_matches=}")
        if not len(insta_matches) or len(insta_matches[0]) != 3:
            return
--- a/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
+++ b/src/auto_archiver/modules/instagram_extractor/instagram_extractor.py
@@ -16,10 +16,13 @@ class InstagramExtractor(Extractor):
    Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
    """
    # NB: post regex should be tested before profile
+
+    valid_url = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/")
+
    # https://regex101.com/r/MGPquX/1
-    post_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(?:p|reel)\/(\w+)")
+    post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url.pattern))  # use .pattern: formatting the compiled object would embed its repr ("re.compile('...')")
    # https://regex101.com/r/6Wbsxa/1
-    profile_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(\w+)")
+    profile_pattern = re.compile(r"{valid_url}(\w+)".format(valid_url=valid_url.pattern))  # same: interpolate the regex source string, not the Pattern object
    # TODO: links to stories

    def setup(self, config: dict) -> None:
--- a/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
+++ b/src/auto_archiver/modules/telethon_extractor/telethon_extractor.py
@@ -14,7 +14,7 @@ from auto_archiver.utils import random_str


 class TelethonArchiver(Extractor):
-    link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
+    valid_url = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
    invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")


@@ -92,7 +92,7 @@ class TelethonArchiver(Extractor):
        """
        url = item.get_url()
        # detect URLs that we definitely cannot handle
-        match = self.link_pattern.search(url)
+        match = self.valid_url.search(url)
        logger.debug(f"TELETHON: {match=}")
        if not match: return False

--- a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py
+++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py
@@ -12,7 +12,7 @@ from auto_archiver.core import Extractor
 from auto_archiver.core import Metadata,Media

 class TwitterApiExtractor(Extractor):
-    link_pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
+    valid_url = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")

    def setup(self, config: dict) -> None:
        super().setup(config)
@@ -54,7 +54,7 @@ class TwitterApiExtractor(Extractor):

    def get_username_tweet_id(self, url):
        # detect URLs that we definitely cannot handle
-        matches = self.link_pattern.findall(url)
+        matches = self.valid_url.findall(url)
        if not len(matches): return False, False

        username, tweet_id = matches[0]  # only one URL supported