Further tidy-ups + refactoring for new structure

* Add implementation tests for orchestrator + logging tests
* Standardise method/class vars for extractors to see if they are suitable
* Fix bugs with removing default loguru logger (allows further customisation)
* Fix bug loading required fields from file
2026-06-13 05:38:29 +03:00 · 2025-01-30 13:21:10 +01:00
parent cddae65a90
commit b7d9145f6c
22 changed files with 292 additions and 51 deletions
--- a/src/auto_archiver/utils/url.py
+++ b/src/auto_archiver/utils/url.py
@@ -2,8 +2,11 @@ import re
 from urllib.parse import urlparse, urlunparse

 class UrlUtil:
-    telegram_private = re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)")
-    is_istagram = re.compile(r"https:\/\/www\.instagram\.com")
+
+    AUTHWALL_URLS = [
+        re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)"), # telegram private channels
+        re.compile(r"https:\/\/www\.instagram\.com"), # instagram
+    ]

    @staticmethod
    def clean(url: str) -> str: return url
@@ -13,8 +16,9 @@ class UrlUtil:
        """
        checks if URL is behind an authentication wall meaning steps like wayback, wacz, ... may not work
        """
-        if UrlUtil.telegram_private.match(url): return True
-        if UrlUtil.is_istagram.match(url): return True
+        for regex in UrlUtil.AUTHWALL_URLS:
+            if regex.match(url):
+                return True

        return False