Merge branch 'main' into feat/yt-dlp-pots

2026-06-11 20:58:29 +03:00 · 2025-03-18 20:10:38 +00:00
parent c4e63ebd8c 7d972ee9b8
commit ba9d67e4bb
9 changed files with 62 additions and 21 deletions
--- a/src/auto_archiver/modules/generic_extractor/dropin.py
+++ b/src/auto_archiver/modules/generic_extractor/dropin.py
@@ -59,9 +59,18 @@ class GenericDropin:
        """
        return metadata

-    def is_suitable(self, url, info_extractor: InfoExtractor):
+    def suitable(self, url, info_extractor: InfoExtractor):
        """
-        Used to override the InfoExtractor's 'is_suitable' method. Dropins should override this method to return True if the url is suitable for the extractor
-        (based on being able to parse other URLs)
+        A method to allow dropins to override their InfoExtractor's 'suitable' method.
+        Dropins should override this method and return True if the url is suitable for the extractor
+        (based on being able to parse other URLs). See the `suitable_extractors` method in the
+        `GenericExtractor` class for how this is implemented.
+
+        The default behaviour of this method is to return the result of the InfoExtractor's 'suitable' method.
+
+        ### Example: An example of where this is useful is for the FacebookIE extractor in yt-dlp. By default,
+        its 'suitable' method only returns True for video URLs. However, we can override this method in the
+        Facebook dropin to return True for all Facebook URLs (photo/post types). This way, the Facebook dropin
+        can be used for all Facebook URLs.
        """
-        return False
+        return info_extractor.suitable(url)
--- a/src/auto_archiver/modules/generic_extractor/facebook.py
+++ b/src/auto_archiver/modules/generic_extractor/facebook.py
@@ -142,7 +142,7 @@ class Facebook(GenericDropin):
        result.set_url(url)
        return result

-    def is_suitable(self, url, info_extractor: FacebookIE):
+    def suitable(self, url, info_extractor: FacebookIE):
        regex = r"(?:https?://(?:[\w-]+\.)?(?:facebook\.com||facebookwkhpilnemxj7asaniu7vnjjbiltxjqhye3mhbshg7kx5tfyd\.onion)/)"
        return re.match(regex, url)

--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -13,6 +13,8 @@ from loguru import logger

 from auto_archiver.core.extractor import Extractor
 from auto_archiver.core import Metadata, Media
+from auto_archiver.utils import get_datetime_from_str
+from .dropin import GenericDropin


 class SkipYtdlp(Exception):
@@ -95,14 +97,11 @@ class GenericExtractor(Extractor):
                continue

            # check if there's a dropin and see if that declares whether it's suitable
-            dropin = self.dropin_for_name(info_extractor.ie_key())
-            if dropin and dropin.is_suitable(url, info_extractor):
+            dropin: GenericDropin = self.dropin_for_name(info_extractor.ie_key())
+            if dropin and dropin.suitable(url, info_extractor):
                yield info_extractor
-                continue
-
-            if info_extractor.suitable(url):
+            elif info_extractor.suitable(url):
                yield info_extractor
-                continue

    def suitable(self, url: str) -> bool:
        """
@@ -249,7 +248,7 @@ class GenericExtractor(Extractor):
            timestamp = datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc).isoformat()
            result.set_timestamp(timestamp)
        if upload_date := video_data.pop("upload_date", None) and not result.get("upload_date"):
-            upload_date = datetime.datetime.strptime(upload_date, "%Y%m%d").replace(tzinfo=datetime.timezone.utc)
+            upload_date = get_datetime_from_str(upload_date, "%Y%m%d").replace(tzinfo=datetime.timezone.utc)
            result.set("upload_date", upload_date)

        # then clean away any keys we don't want
@@ -324,7 +323,7 @@ class GenericExtractor(Extractor):

        return self.add_metadata(data, info_extractor, url, result)

-    def dropin_for_name(self, dropin_name: str, additional_paths=[], package=__package__) -> Type[InfoExtractor]:
+    def dropin_for_name(self, dropin_name: str, additional_paths=[], package=__package__) -> GenericDropin:
        dropin_name = dropin_name.lower()

        if dropin_name == "generic":
--- a/src/auto_archiver/modules/generic_extractor/tiktok.py
+++ b/src/auto_archiver/modules/generic_extractor/tiktok.py
@@ -1,5 +1,8 @@
 import requests
 from loguru import logger
+
+from yt_dlp.extractor.tiktok import TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE
+
 from auto_archiver.core import Metadata, Media
 from datetime import datetime, timezone
 from .dropin import GenericDropin
@@ -13,6 +16,11 @@ class Tiktok(GenericDropin):

    TIKWM_ENDPOINT = "https://www.tikwm.com/api/?url={url}"

+    def suitable(self, url, info_extractor) -> bool:
+        """This dropin (which uses Tikwm) is suitable for *all* Tiktok type URLs - videos, lives, VMs, and users.
+        Returns True if any of the TikTokIE, TikTokLiveIE, TikTokVMIE or TikTokUserIE extractors reports the URL as suitable."""
+        return any(extractor().suitable(url) for extractor in (TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE))
+
    def extract_post(self, url: str, ie_instance):
        logger.debug(f"Using Tikwm API to attempt to download tiktok video from {url=}")

--- a/src/auto_archiver/modules/generic_extractor/twitter.py
+++ b/src/auto_archiver/modules/generic_extractor/twitter.py
@@ -1,13 +1,12 @@
 import re
 import mimetypes
 import json
-from datetime import datetime

 from loguru import logger
 from slugify import slugify

 from auto_archiver.core.metadata import Metadata, Media
-from auto_archiver.utils import url as UrlUtil
+from auto_archiver.utils import url as UrlUtil, get_datetime_from_str
 from auto_archiver.core.extractor import Extractor

 from .dropin import GenericDropin, InfoExtractor
@@ -38,7 +37,7 @@ class Twitter(GenericDropin):
        try:
            if not tweet.get("user") or not tweet.get("created_at"):
                raise ValueError("Error retreiving post. Are you sure it exists?")
-            timestamp = datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
+            timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
        except (ValueError, KeyError) as ex:
            logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
            return False
--- a/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py
+++ b/src/auto_archiver/modules/twitter_api_extractor/twitter_api_extractor.py
@@ -2,7 +2,6 @@ import json
 import re
 import mimetypes
 import requests
-from datetime import datetime

 from loguru import logger
 from pytwitter import Api
@@ -10,6 +9,7 @@ from slugify import slugify

 from auto_archiver.core import Extractor
 from auto_archiver.core import Metadata, Media
+from auto_archiver.utils import get_datetime_from_str


 class TwitterApiExtractor(Extractor):
@@ -91,7 +91,7 @@ class TwitterApiExtractor(Extractor):

        result = Metadata()
        result.set_title(tweet.data.text)
-        result.set_timestamp(datetime.strptime(tweet.data.created_at, "%Y-%m-%dT%H:%M:%S.%fZ"))
+        result.set_timestamp(get_datetime_from_str(tweet.data.created_at, "%Y-%m-%dT%H:%M:%S.%fZ"))

        urls = []
        if tweet.includes: