Merge branch 'main' into webdriver-cookies

2026-06-12 05:08:28 +03:00 · 2025-03-14 09:37:12 +00:00
parent f6b13327f0 5f7a8b1ac0
commit f504d2e304
6 changed files with 121 additions and 154 deletions
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -13,6 +13,8 @@ from loguru import logger
 from auto_archiver.core.extractor import Extractor
 from auto_archiver.core import Metadata, Media

+class SkipYtdlp(Exception):
+    pass
 class GenericExtractor(Extractor):
    _dropins = {}

@@ -269,7 +271,8 @@ class GenericExtractor(Extractor):

        try:
            if dropin_submodule and dropin_submodule.skip_ytdlp_download(info_extractor, url):
-                raise Exception(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}")
+                logger.debug(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}")
+                raise SkipYtdlp()

            # don't download since it can be a live stream
            data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
@@ -283,15 +286,19 @@ class GenericExtractor(Extractor):
            if info_extractor.IE_NAME == "generic":
                # don't clutter the logs with issues about the 'generic' extractor not having a dropin
                return False
+            
+            if not isinstance(e, SkipYtdlp):
+                logger.debug(f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use extractor to get post data instead')

-            logger.debug(f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead')
            try:
                result = self.get_metadata_for_post(info_extractor, url, ydl)
            except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
-                logger.error(f'Error downloading metadata for post: {post_e}')
+                logger.error("Error downloading metadata for post: {error}", error=str(post_e))
                return False
            except Exception as generic_e:
-                logger.debug(f'Attempt to extract using ytdlp dropin "{info_extractor.IE_NAME}" failed:  \n  {repr(generic_e)}', exc_info=True)
+                logger.debug('Attempt to extract using ytdlp extractor "{name}" failed:  \n  {error}',
+                             name=info_extractor.IE_NAME, error=str(generic_e),
+                            exc_info=True)
                return False
        
        if result:
--- a/src/auto_archiver/modules/generic_extractor/tiktok.py
+++ b/src/auto_archiver/modules/generic_extractor/tiktok.py
@@ -0,0 +1,74 @@
+import requests
+from loguru import logger
+from auto_archiver.core import Metadata, Media
+from datetime import datetime, timezone
+from .dropin import GenericDropin
+
+class Tiktok(GenericDropin):
+    """
+    TikTok dropin for the Generic Extractor that uses an unofficial API if/when ytdlp fails.
+    It's useful for capturing content that requires a login, like sensitive content.
+    """
+
+    TIKWM_ENDPOINT = "https://www.tikwm.com/api/?url={url}"
+
+    def extract_post(self, url: str, ie_instance):
+
+        logger.debug(f"Using Tikwm API to attempt to download tiktok video from {url=}")
+
+        endpoint = self.TIKWM_ENDPOINT.format(url=url)
+
+        r = requests.get(endpoint)
+        if r.status_code != 200:
+            raise ValueError(f"unexpected status code '{r.status_code}' from tikwm.com for {url=}:")
+
+        try:
+            json_response = r.json()
+        except ValueError:
+            raise ValueError(f"failed to parse JSON response from tikwm.com for {url=}")
+
+        if not json_response.get('msg') == 'success' or not (api_data := json_response.get('data', {})):
+            raise ValueError(f"failed to get a valid response from tikwm.com for {url=}: {repr(json_response)}")
+
+        # tries to get the non-watermarked version first
+        video_url = api_data.pop("play", api_data.pop("wmplay", None))
+        if not video_url:
+            raise ValueError(f"no valid video URL found in response from tikwm.com for {url=}")
+        
+        api_data['video_url'] = video_url
+        return api_data
+
+    
+    def create_metadata(self, post: dict, ie_instance, archiver, url):
+
+        # prepare the result: the cover (if any) is downloaded first, then the video
+        result = Metadata()
+        video_url = post.pop("video_url")
+
+        # get the cover if possible
+        cover_url = post.pop("origin_cover", post.pop("cover", post.pop("ai_dynamic_cover", None)))
+        if cover_url and (cover_downloaded := archiver.download_from_url(cover_url)):
+            result.add_media(Media(cover_downloaded))
+
+        # get the video or fail
+        video_downloaded = archiver.download_from_url(video_url, f"vid_{post.get('id', '')}")
+        if not video_downloaded:
+            logger.error(f"failed to download video from {video_url}")
+            return False
+        video_media = Media(video_downloaded)
+        if duration := post.pop("duration", None):
+            video_media.set("duration", duration)
+        result.add_media(video_media)
+
+        # add remaining metadata
+        result.set_title(post.pop("title", ""))
+
+        if created_at := post.pop("create_time", None):
+            result.set_timestamp(datetime.fromtimestamp(created_at, tz=timezone.utc))
+
+        if (author := post.pop("author", None)):
+            result.set("author", author)
+
+        result.set("api_data", post)
+
+        return result