Set up screenshot enricher to use authentication/cookies

2026-06-13 05:38:29 +03:00 · 2025-02-03 17:25:59 +01:00
parent 7ec328ab40
commit c574b694ed
11 changed files with 153 additions and 96 deletions
--- a/src/auto_archiver/utils/url.py
+++ b/src/auto_archiver/utils/url.py
@@ -1,83 +1,84 @@
 import re
 from urllib.parse import urlparse, urlunparse

-class UrlUtil:

-    AUTHWALL_URLS = [
-        re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)"), # telegram private channels
-        re.compile(r"https:\/\/www\.instagram\.com"), # instagram
-    ]
+AUTHWALL_URLS = [
+    re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)"), # telegram private channels
+    re.compile(r"https:\/\/www\.instagram\.com"), # instagram
+]

-    @staticmethod
-    def clean(url: str) -> str: return url
+def domain_for_url(url: str) -> str:
+    """
+    SECURITY: parse the domain using urllib to avoid any potential security issues
+    """
+    return urlparse(url).netloc

-    @staticmethod
-    def is_auth_wall(url: str) -> bool:
-        """
-        checks if URL is behind an authentication wall meaning steps like wayback, wacz, ... may not work
-        """
-        for regex in UrlUtil.AUTHWALL_URLS:
-            if regex.match(url):
-                return True
+def clean(url: str) -> str:
+    return url

-        return False
+def is_auth_wall(url: str) -> bool:
+    """
+    checks if URL is behind an authentication wall meaning steps like wayback, wacz, ... may not work
+    """
+    for regex in AUTHWALL_URLS:
+        if regex.match(url):
+            return True

-    @staticmethod
-    def remove_get_parameters(url: str) -> str:
-        # http://example.com/file.mp4?t=1 -> http://example.com/file.mp4
-        # useful for mimetypes to work
-        parsed_url = urlparse(url)
-        new_url = urlunparse(parsed_url._replace(query=''))
-        return new_url
+    return False

-    @staticmethod
-    def is_relevant_url(url: str) -> bool:
-        """
-        Detect if a detected media URL is recurring and therefore irrelevant to a specific archive. Useful, for example, for the enumeration of the media files in WARC files which include profile pictures, favicons, etc.
-        """
-        clean_url = UrlUtil.remove_get_parameters(url)
+def remove_get_parameters(url: str) -> str:
+    # http://example.com/file.mp4?t=1 -> http://example.com/file.mp4
+    # useful for mimetypes to work
+    parsed_url = urlparse(url)
+    new_url = urlunparse(parsed_url._replace(query=''))
+    return new_url

-        # favicons
-        if "favicon" in url: return False
-        # ifnore icons
-        if clean_url.endswith(".ico"): return False
-        # ignore SVGs
-        if UrlUtil.remove_get_parameters(url).endswith(".svg"): return False
+def is_relevant_url(url: str) -> bool:
+    """
+    Detect if a detected media URL is recurring and therefore irrelevant to a specific archive. Useful, for example, for the enumeration of the media files in WARC files which include profile pictures, favicons, etc.
+    """
+    clean_url = remove_get_parameters(url)

-        # twitter profile pictures
-        if "twimg.com/profile_images" in url: return False
-        if "twimg.com" in url and "/default_profile_images" in url: return False
+    # favicons
+    if "favicon" in url: return False
+    # ifnore icons
+    if clean_url.endswith(".ico"): return False
+    # ignore SVGs
+    if remove_get_parameters(url).endswith(".svg"): return False

-        # instagram profile pictures
-        if "https://scontent.cdninstagram.com/" in url and "150x150" in url: return False
-        # instagram recurring images
-        if "https://static.cdninstagram.com/rsrc.php/" in url: return False
+    # twitter profile pictures
+    if "twimg.com/profile_images" in url: return False
+    if "twimg.com" in url and "/default_profile_images" in url: return False

-        # telegram
-        if "https://telegram.org/img/emoji/" in url: return False
+    # instagram profile pictures
+    if "https://scontent.cdninstagram.com/" in url and "150x150" in url: return False
+    # instagram recurring images
+    if "https://static.cdninstagram.com/rsrc.php/" in url: return False

-        # youtube
-        if "https://www.youtube.com/s/gaming/emoji/" in url: return False
-        if "https://yt3.ggpht.com" in url and "default-user=" in url: return False
-        if "https://www.youtube.com/s/search/audio/" in url: return False
+    # telegram
+    if "https://telegram.org/img/emoji/" in url: return False

-        # ok
-        if " https://ok.ru/res/i/" in url: return False
+    # youtube
+    if "https://www.youtube.com/s/gaming/emoji/" in url: return False
+    if "https://yt3.ggpht.com" in url and "default-user=" in url: return False
+    if "https://www.youtube.com/s/search/audio/" in url: return False

-        # vk
-        if "https://vk.com/emoji/" in url: return False
-        if "vk.com/images/" in url: return False
-        if "vk.com/images/reaction/" in url: return False
+    # ok
+    if " https://ok.ru/res/i/" in url: return False

-        # wikipedia
-        if "wikipedia.org/static" in url: return False
+    # vk
+    if "https://vk.com/emoji/" in url: return False
+    if "vk.com/images/" in url: return False
+    if "vk.com/images/reaction/" in url: return False

-        return True
+    # wikipedia
+    if "wikipedia.org/static" in url: return False

-    @staticmethod
-    def twitter_best_quality_url(url: str) -> str:
-        """
-        some twitter image URLs point to a less-than best quality
-        this returns the URL pointing to the highest (original) quality
-        """
-        return re.sub(r"name=(\w+)", "name=orig", url, 1)
+    return True
+
+def twitter_best_quality_url(url: str) -> str:
+    """
+    some twitter image URLs point to a less-than best quality
+    this returns the URL pointing to the highest (original) quality
+    """
+    return re.sub(r"name=(\w+)", "name=orig", url, 1)