Merge main

2026-06-11 20:58:29 +03:00 · 2025-03-17 10:05:11 +00:00
parent 7e360240bf b2238427a0
commit 59b910ec30
229 changed files with 61430 additions and 3147 deletions
--- a/src/auto_archiver/modules/generic_extractor/init.py
+++ b/src/auto_archiver/modules/generic_extractor/init.py
@@ -1 +1 @@
-from .generic_extractor import GenericExtractor
+from .generic_extractor import GenericExtractor
--- a/src/auto_archiver/modules/generic_extractor/manifest.py
+++ b/src/auto_archiver/modules/generic_extractor/manifest.py
@@ -28,6 +28,13 @@ the broader archiving framework.
 metadata objects. Some dropins are included in this generic_archiver by default, but
 custom dropins can be created to handle additional websites and passed to the archiver
 via the command line using the `--dropins` option (TODO!).
+
+### Auto-Updates
+
+The Generic Extractor will also automatically check for updates to `yt-dlp` (every 5 days by default).
+This can be configured using the `ytdlp_update_interval` setting (or disabled by setting it to -1).
+If you are having issues with the extractor, you can review the version of `yt-dlp` being used with `yt-dlp --version`.
+
 """,
    "configs": {
        "subtitles": {"default": True, "help": "download subtitles if available", "type": "bool"},
@@ -64,5 +71,17 @@ via the command line using the `--dropins` option (TODO!).
            "default": "inf",
            "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.",
        },
+        "ytdlp_update_interval": {
+            "default": 5,
+            "help": "How often to check for yt-dlp updates (days). If positive, will check and update yt-dlp every [num] days. Set it to -1 to disable, or 0 to always update on every run.",
+            "type": "int",
+        },
+        "ytdlp_args": {
+            "default": "",
+            # The backslash continuations below join the lines verbatim, so each continued line
+            # must end with a space; otherwise the sentences (and the URL) run into each other.
+            "help": "Additional arguments to pass to yt-dlp, e.g. --no-check-certificate or --plugin-dirs. \
+See yt-dlp documentation here for more information: https://github.com/yt-dlp/yt-dlp?tab=readme-ov-file#general-options \
+Note: this is not to be confused with 'extractor_args' which are specific to the extractor itself.",
+            "type": "str",
+        },
    },
 }
--- a/src/auto_archiver/modules/generic_extractor/bluesky.py
+++ b/src/auto_archiver/modules/generic_extractor/bluesky.py
@@ -4,15 +4,16 @@ from auto_archiver.core.extractor import Extractor
 from auto_archiver.core.metadata import Metadata, Media
 from .dropin import GenericDropin, InfoExtractor

-class Bluesky(GenericDropin):

+class Bluesky(GenericDropin):
    def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
        result = Metadata()
        result.set_url(url)
        result.set_title(post["record"]["text"])
        result.set_timestamp(post["record"]["createdAt"])
        for k, v in self._get_post_data(post).items():
-            if v: result.set(k, v)
+            if v:
+                result.set(k, v)

        # download if embeds present (1 video XOR >=1 images)
        for media in self._download_bsky_embeds(post, archiver):
@@ -23,12 +24,12 @@ class Bluesky(GenericDropin):

    def extract_post(self, url: str, ie_instance: InfoExtractor) -> dict:
        # TODO: If/when this PR (https://github.com/yt-dlp/yt-dlp/pull/12098) is merged on ytdlp, remove the comments and delete the code below
-        handle, video_id = ie_instance._match_valid_url(url).group('handle', 'id')
+        handle, video_id = ie_instance._match_valid_url(url).group("handle", "id")
        return ie_instance._extract_post(handle=handle, post_id=video_id)

    def _download_bsky_embeds(self, post: dict, archiver: Extractor) -> list[Media]:
        """
-        Iterates over image(s) or video in a Bluesky post and downloads them        
+        Iterates over image(s) or video in a Bluesky post and downloads them
        """
        media = []
        embed = post.get("record", {}).get("embed", {})
@@ -37,16 +38,15 @@ class Bluesky(GenericDropin):

        media_url = "https://bsky.social/xrpc/com.atproto.sync.getBlob?cid={}&did={}"
        for image_media in image_medias:
-            url = media_url.format(image_media['image']['ref']['$link'], post['author']['did'])
+            url = media_url.format(image_media["image"]["ref"]["$link"], post["author"]["did"])
            image_media = archiver.download_from_url(url)
            media.append(Media(image_media))
        for video_media in video_medias:
-            url = media_url.format(video_media['ref']['$link'], post['author']['did'])
+            url = media_url.format(video_media["ref"]["$link"], post["author"]["did"])
            video_media = archiver.download_from_url(url)
            media.append(Media(video_media))
        return media

-
    def _get_post_data(self, post: dict) -> dict:
        """
        Extracts relevant information returned by the .getPostThread api call (excluding text/created_at): author, mentions, tags, links.
@@ -74,4 +74,4 @@ class Bluesky(GenericDropin):
            res["tags"] = tags
        if links:
            res["links"] = links
-        return res
+        return res
--- a/src/auto_archiver/modules/generic_extractor/dropin.py
+++ b/src/auto_archiver/modules/generic_extractor/dropin.py
@@ -3,11 +3,12 @@ from yt_dlp.extractor.common import InfoExtractor
 from auto_archiver.core.metadata import Metadata
 from auto_archiver.core.extractor import Extractor

+
 class GenericDropin:
    """Base class for dropins for the generic extractor.
-    
+
    In many instances, an extractor will exist in ytdlp, but it will only process videos.
-    Dropins can be created and used to make use of the already-written private code of a 
+    Dropins can be created and used to make use of the already-written private code of a
    specific extractor from ytdlp.

    The dropin should be able to handle the following methods:
@@ -31,21 +32,19 @@ class GenericDropin:
        This method should return the post data from the url.
        """
        raise NotImplementedError("This method should be implemented in the subclass")
-    

    def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
        """
        This method should create a Metadata object from the post data.
        """
        raise NotImplementedError("This method should be implemented in the subclass")
-    

    def skip_ytdlp_download(self, url: str, ie_instance: InfoExtractor):
        """
        This method should return True if you want to skip the ytdlp download method.
        """
        return False
-    
+
    def keys_to_clean(self, video_data: dict, info_extractor: InfoExtractor):
        """
        This method should return a list of strings (keys) to clean from the video_data dict.
@@ -53,16 +52,16 @@ class GenericDropin:
        E.g. ["uploader", "uploader_id", "tiktok_specific_field"]
        """
        return []
-    
+
    def download_additional_media(self, video_data: dict, info_extractor: InfoExtractor, metadata: Metadata):
        """
        This method should download any additional media from the post.
        """
        return metadata
-    
+
    def is_suitable(self, url, info_extractor: InfoExtractor):
        """
        Used to override the InfoExtractor's 'is_suitable' method. Dropins should override this method to return True if the url is suitable for the extractor
        (based on being able to parse other URLs)
        """
-        return False
+        return False
--- a/src/auto_archiver/modules/generic_extractor/facebook.py
+++ b/src/auto_archiver/modules/generic_extractor/facebook.py
@@ -1,7 +1,6 @@
 import re
 from .dropin import GenericDropin
 from auto_archiver.core.metadata import Metadata
-from auto_archiver.core.media import Media

 # TODO: Remove if / when  https://github.com/yt-dlp/yt-dlp/pull/12275 is merged
 from yt_dlp.utils import (
@@ -12,77 +11,124 @@ from yt_dlp.utils import (
    merge_dicts,
    int_or_none,
    parse_count,
-
 )

+
 def _extract_metadata(self, webpage, video_id):
-    post_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall(
-        r'data-sjs>({.*?ScheduledServerJS.*?})</script>', webpage)]
-    post = traverse_obj(post_data, (
-        ..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or []
-    media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: (
-        k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict)
-    title = get_first(media, ('title', 'text'))
-    description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text'))
-    page_title = title or self._html_search_regex((
-        r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>',
-        r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(?P<content>.*?)</span>',
-        self._meta_regex('og:title'), self._meta_regex('twitter:title'), r'<title>(?P<content>.+?)</title>',
-    ), webpage, 'title', default=None, group='content')
+    post_data = [
+        self._parse_json(j, video_id, fatal=False)
+        for j in re.findall(r"data-sjs>({.*?ScheduledServerJS.*?})</script>", webpage)
+    ]
+    post = (
+        traverse_obj(
+            post_data,
+            (..., "require", ..., ..., ..., "__bbox", "require", ..., ..., ..., "__bbox", "result", "data"),
+            expected_type=dict,
+        )
+        or []
+    )
+    media = traverse_obj(
+        post,
+        (
+            ...,
+            "attachments",
+            ...,
+            lambda k, v: (k == "media" and str(v["id"]) == video_id and v["__typename"] == "Video"),
+        ),
+        expected_type=dict,
+    )
+    title = get_first(media, ("title", "text"))
+    description = get_first(media, ("creation_story", "comet_sections", "message", "story", "message", "text"))
+    page_title = title or self._html_search_regex(
+        (
+            r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>',
+            r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(?P<content>.*?)</span>',
+            self._meta_regex("og:title"),
+            self._meta_regex("twitter:title"),
+            r"<title>(?P<content>.+?)</title>",
+        ),
+        webpage,
+        "title",
+        default=None,
+        group="content",
+    )
    description = description or self._html_search_meta(
-        ['description', 'og:description', 'twitter:description'],
-        webpage, 'description', default=None)
+        ["description", "og:description", "twitter:description"], webpage, "description", default=None
+    )
    uploader_data = (
-        get_first(media, ('owner', {dict}))
-        or get_first(post, ('video', 'creation_story', 'attachments', ..., 'media', lambda k, v: k == 'owner' and v['name']))
-        or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name']))
-        or get_first(post, ('node', 'actors', ..., {dict}))
-        or get_first(post, ('event', 'event_creator', {dict}))
-        or get_first(post, ('video', 'creation_story', 'short_form_video_context', 'video_owner', {dict})) or {})
-    uploader = uploader_data.get('name') or (
-        clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))
+        get_first(media, ("owner", {dict}))
+        or get_first(
+            post, ("video", "creation_story", "attachments", ..., "media", lambda k, v: k == "owner" and v["name"])
+        )
+        or get_first(post, (..., "video", lambda k, v: k == "owner" and v["name"]))
+        or get_first(post, ("node", "actors", ..., {dict}))
+        or get_first(post, ("event", "event_creator", {dict}))
+        or get_first(post, ("video", "creation_story", "short_form_video_context", "video_owner", {dict}))
+        or {}
+    )
+    uploader = uploader_data.get("name") or (
+        clean_html(get_element_by_id("fbPhotoPageAuthorName", webpage))
        or self._search_regex(
-            (r'ownerName\s*:\s*"([^"]+)"', *self._og_regexes('title')), webpage, 'uploader', fatal=False))
-    timestamp = int_or_none(self._search_regex(
-        r'<abbr[^>]+data-utime=["\'](\d+)', webpage,
-        'timestamp', default=None))
-    thumbnail = self._html_search_meta(
-        ['og:image', 'twitter:image'], webpage, 'thumbnail', default=None)
+            (r'ownerName\s*:\s*"([^"]+)"', *self._og_regexes("title")), webpage, "uploader", fatal=False
+        )
+    )
+    timestamp = int_or_none(self._search_regex(r'<abbr[^>]+data-utime=["\'](\d+)', webpage, "timestamp", default=None))
+    thumbnail = self._html_search_meta(["og:image", "twitter:image"], webpage, "thumbnail", default=None)
    # some webpages contain unretrievable thumbnail urls
    # like https://lookaside.fbsbx.com/lookaside/crawler/media/?media_id=10155168902769113&get_thumbnail=1
    # in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/
-    if thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail):
+    if thumbnail and not re.search(r"\.(?:jpg|png)", thumbnail):
        thumbnail = None
    info_dict = {
-        'description': description,
-        'uploader': uploader,
-        'uploader_id': uploader_data.get('id'),
-        'timestamp': timestamp,
-        'thumbnail': thumbnail,
-        'view_count': parse_count(self._search_regex(
-            (r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)'),
-            webpage, 'view count', default=None)),
-        'concurrent_view_count': get_first(post, (
-            ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})),
-        **traverse_obj(post, (lambda _, v: video_id in v['url'], 'feedback', {
-            'like_count': ('likers', 'count', {int}),
-            'comment_count': ('total_comment_count', {int}),
-            'repost_count': ('share_count_reduced', {parse_count}),
-        }), get_all=False),
+        "description": description,
+        "uploader": uploader,
+        "uploader_id": uploader_data.get("id"),
+        "timestamp": timestamp,
+        "thumbnail": thumbnail,
+        "view_count": parse_count(
+            self._search_regex(
+                (r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)'),
+                webpage,
+                "view count",
+                default=None,
+            )
+        ),
+        "concurrent_view_count": get_first(
+            post, (("video", (..., ..., "attachments", ..., "media")), "liveViewerCount", {int_or_none})
+        ),
+        **traverse_obj(
+            post,
+            (
+                lambda _, v: video_id in v["url"],
+                "feedback",
+                {
+                    "like_count": ("likers", "count", {int}),
+                    "comment_count": ("total_comment_count", {int}),
+                    "repost_count": ("share_count_reduced", {parse_count}),
+                },
+            ),
+            get_all=False,
+        ),
    }

    info_json_ld = self._search_json_ld(webpage, video_id, default={})
-    info_json_ld['title'] = (re.sub(r'\s*\|\s*Facebook$', '', title or info_json_ld.get('title') or page_title or '')
-                                or (description or '').replace('\n', ' ') or f'Facebook video #{video_id}')
+    info_json_ld["title"] = (
+        re.sub(r"\s*\|\s*Facebook$", "", title or info_json_ld.get("title") or page_title or "")
+        or (description or "").replace("\n", " ")
+        or f"Facebook video #{video_id}"
+    )
    return merge_dicts(info_json_ld, info_dict)
-class Facebook(GenericDropin):
-    
-    def extract_post(self, url: str, ie_instance):

-        post_id_regex = r'(?P<id>pfbid[A-Za-z0-9]+|\d+|t\.(\d+\/\d+))'
-        post_id = re.search(post_id_regex, url).group('id')
-        webpage = ie_instance._download_webpage(
-            url.replace('://m.facebook.com/', '://www.facebook.com/'), post_id)
+
+class Facebook(GenericDropin):
+    def extract_post(self, url: str, ie_instance):
+        video_id = ie_instance._match_valid_url(url).group("id")
+        ie_instance._download_webpage(url.replace("://m.facebook.com/", "://www.facebook.com/"), video_id)
+        webpage = ie_instance._download_webpage(url, ie_instance._match_valid_url(url).group("id"))
+
+        post_id_regex = r"(?P<id>pfbid[A-Za-z0-9]+|\d+|t\.(\d+\/\d+))"
+        post_id = re.search(post_id_regex, url).group("id")
+        webpage = ie_instance._download_webpage(url.replace("://m.facebook.com/", "://www.facebook.com/"), post_id)

        # TODO: For long posts, this _extract_metadata only seems to return the first 100 or so characters, followed by ...

@@ -93,20 +139,19 @@ class Facebook(GenericDropin):

    def create_metadata(self, post: dict, ie_instance, archiver, url):
        result = Metadata()
-        result.set_content(post.get('description', ''))
-        result.set_title(post.get('title', ''))
-        result.set('author', post.get('uploader', ''))
+        result.set_content(post.get("description", ""))
+        result.set_title(post.get("title", ""))
+        result.set("author", post.get("uploader", ""))
        result.set_url(url)
        return result
-    
+
    def is_suitable(self, url, info_extractor):
-        regex = r'(?:https?://(?:[\w-]+\.)?(?:facebook\.com||facebookwkhpilnemxj7asaniu7vnjjbiltxjqhye3mhbshg7kx5tfyd\.onion)/)'
+        regex = r"(?:https?://(?:[\w-]+\.)?(?:facebook\.com||facebookwkhpilnemxj7asaniu7vnjjbiltxjqhye3mhbshg7kx5tfyd\.onion)/)"
        return re.match(regex, url)
-    
+
    def skip_ytdlp_download(self, url: str, ie_instance):
        """
        Skip using the ytdlp download method for Facebook *photo* posts, they have a URL with an id of t.XXXXX/XXXXX
        """
-        if re.search(r'/t.\d+/\d+', url):
+        if re.search(r"/t.\d+/\d+", url):
            return True
-
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -1,18 +1,68 @@
-import datetime, os, yt_dlp, pysubs2
+import datetime
+import os
 import importlib
+import subprocess
+
 from typing import Generator, Type
+
+import yt_dlp
 from yt_dlp.extractor.common import InfoExtractor
+import pysubs2

 from loguru import logger

 from auto_archiver.core.extractor import Extractor
 from auto_archiver.core import Metadata, Media

-class Skip(Exception):
+
+class SkipYtdlp(Exception):
    pass
+
+
 class GenericExtractor(Extractor):
    _dropins = {}

+    def setup(self):
+        """Run the periodic yt-dlp update check.
+
+        Does nothing when `ytdlp_update_interval` is negative. Otherwise the time of the
+        next check is read from a `.ytdlp-update` marker file (inside `secrets/` when that
+        folder exists, else the current working directory). If the marker is missing,
+        unreadable or in the past, `update_ytdlp()` is called and the marker is rewritten
+        with "now + `ytdlp_update_interval` days".
+        """
+        if self.ytdlp_update_interval < 0:
+            return
+
+        use_secrets = os.path.exists("secrets")
+        path = os.path.join("secrets" if use_secrets else "", ".ytdlp-update")
+        next_update_check = None
+        if os.path.exists(path):
+            with open(path, "r") as f:
+                raw_timestamp = f.read().strip()
+            try:
+                next_update_check = datetime.datetime.fromisoformat(raw_timestamp)
+            except ValueError:
+                # an empty or hand-edited marker must not stop the extractor from starting:
+                # treat it as "no check recorded" so the update runs and the marker is rewritten
+                logger.warning(f"Ignoring invalid yt-dlp update timestamp in {path}: {raw_timestamp!r}")
+
+        if not next_update_check or next_update_check < datetime.datetime.now():
+            self.update_ytdlp()
+
+            next_update_check = datetime.datetime.now() + datetime.timedelta(days=self.ytdlp_update_interval)
+            with open(path, "w") as f:
+                f.write(next_update_check.isoformat())
+
+    def update_ytdlp(self):
+        """Upgrade the installed `yt-dlp` package with pip and reload the `yt_dlp` module.
+
+        Any failure while running pip or reloading is logged and swallowed, so a failed
+        update does not prevent archiving with the currently installed version.
+        """
+        import sys
+        from importlib.metadata import version as get_version
+
+        logger.info("Checking and updating yt-dlp...")
+        logger.info(
+            f"Tip: change the 'ytdlp_update_interval' setting to control how often yt-dlp is updated. Set to -1 to disable or 0 to enable on every run. Current setting: {self.ytdlp_update_interval}"
+        )
+
+        old_version = get_version("yt-dlp")
+        try:
+            # run pip through the current interpreter so the upgrade lands in the same
+            # environment this process runs in (poetry environment or plain virtualenv),
+            # rather than in whichever `pip` happens to be first on PATH
+            result = subprocess.run(
+                [sys.executable, "-m", "pip", "install", "--upgrade", "yt-dlp"], check=True, capture_output=True
+            )
+
+            if "Successfully installed yt-dlp" in result.stdout.decode():
+                new_version = get_version("yt-dlp")
+                logger.info(f"yt-dlp successfully updated (from {old_version} to {new_version})")
+                # NOTE(review): importlib.reload() re-executes only the module it is given;
+                # confirm that already-imported yt_dlp submodules pick up the new code
+                importlib.reload(yt_dlp)
+            else:
+                logger.info("yt-dlp already up to date")
+
+        except Exception as e:
+            logger.error(f"Error updating yt-dlp: {e}")
+
    def suitable_extractors(self, url: str) -> Generator[str, None, None]:
        """
        Returns a list of valid extractors for the given URL"""
@@ -29,17 +79,17 @@ class GenericExtractor(Extractor):
            if info_extractor.suitable(url):
                yield info_extractor
                continue
-            

-        
    def suitable(self, url: str) -> bool:
        """
        Checks for valid URLs out of all ytdlp extractors.
        Returns False for the GenericIE, which as labelled by yt-dlp: 'Generic downloader that works on some sites'
        """
        return any(self.suitable_extractors(url))
-    
-    def download_additional_media(self, video_data: dict, info_extractor: InfoExtractor, metadata: Metadata) -> Metadata:
+
+    def download_additional_media(
+        self, video_data: dict, info_extractor: InfoExtractor, metadata: Metadata
+    ) -> Metadata:
        """
        Downloads additional media like images, comments, subtitles, etc.

@@ -48,7 +98,7 @@ class GenericExtractor(Extractor):

        # Just get the main thumbnail. More thumbnails are available in
        # video_data['thumbnails'] should they be required
-        thumbnail_url = video_data.get('thumbnail')
+        thumbnail_url = video_data.get("thumbnail")
        if thumbnail_url:
            try:
                cover_image_path = self.download_from_url(thumbnail_url)
@@ -71,15 +121,65 @@ class GenericExtractor(Extractor):
        Clean up the ytdlp generic video data to make it more readable and remove unnecessary keys that ytdlp adds
        """

-        base_keys = ['formats', 'thumbnail', 'display_id', 'epoch', 'requested_downloads',
-                     'duration_string', 'thumbnails', 'http_headers', 'webpage_url_basename', 'webpage_url_domain',
-                     'extractor', 'extractor_key', 'playlist', 'playlist_index', 'duration_string', 'protocol', 'requested_subtitles',
-                     'format_id', 'acodec', 'vcodec', 'ext', 'epoch', '_has_drm', 'filesize', 'audio_ext', 'video_ext', 'vbr', 'abr',
-                     'resolution', 'dynamic_range', 'aspect_ratio', 'cookies', 'format', 'quality', 'preference', 'artists',
-                     'channel_id', 'subtitles', 'tbr', 'url', 'original_url', 'automatic_captions', 'playable_in_embed', 'live_status',
-                     '_format_sort_fields', 'chapters', 'requested_formats', 'format_note',
-                     'audio_channels', 'asr', 'fps', 'was_live', 'is_live', 'heatmap', 'age_limit', 'stretched_ratio']
-        
+        base_keys = [
+            "formats",
+            "thumbnail",
+            "display_id",
+            "epoch",
+            "requested_downloads",
+            "duration_string",
+            "thumbnails",
+            "http_headers",
+            "webpage_url_basename",
+            "webpage_url_domain",
+            "extractor",
+            "extractor_key",
+            "playlist",
+            "playlist_index",
+            "duration_string",
+            "protocol",
+            "requested_subtitles",
+            "format_id",
+            "acodec",
+            "vcodec",
+            "ext",
+            "epoch",
+            "_has_drm",
+            "filesize",
+            "audio_ext",
+            "video_ext",
+            "vbr",
+            "abr",
+            "resolution",
+            "dynamic_range",
+            "aspect_ratio",
+            "cookies",
+            "format",
+            "quality",
+            "preference",
+            "artists",
+            "channel_id",
+            "subtitles",
+            "tbr",
+            "url",
+            "original_url",
+            "automatic_captions",
+            "playable_in_embed",
+            "live_status",
+            "_format_sort_fields",
+            "chapters",
+            "requested_formats",
+            "format_note",
+            "audio_channels",
+            "asr",
+            "fps",
+            "was_live",
+            "is_live",
+            "heatmap",
+            "age_limit",
+            "stretched_ratio",
+        ]
+
        dropin = self.dropin_for_name(info_extractor.ie_key())
        if dropin:
            try:
@@ -88,8 +188,8 @@ class GenericExtractor(Extractor):
                pass

        return base_keys
-    
-    def add_metadata(self, video_data: dict, info_extractor: InfoExtractor, url:str, result: Metadata) -> Metadata:
+
+    def add_metadata(self, video_data: dict, info_extractor: InfoExtractor, url: str, result: Metadata) -> Metadata:
        """
        Creates a Metadata object from the given video_data
        """
@@ -98,29 +198,36 @@ class GenericExtractor(Extractor):
        result = self.download_additional_media(video_data, info_extractor, result)

        # keep both 'title' and 'fulltitle', but prefer 'title', falling back to 'fulltitle' if it doesn't exist
-        result.set_title(video_data.pop('title', video_data.pop('fulltitle', "")))
+        result.set_title(video_data.pop("title", video_data.pop("fulltitle", "")))
        result.set_url(url)
-
+        if "description" in video_data:
+            result.set_content(video_data["description"])
        # extract comments if enabled
        if self.comments:
-            result.set("comments", [{
-                "text": c["text"],
-                "author": c["author"], 
-                "timestamp": datetime.datetime.fromtimestamp(c.get("timestamp"), tz = datetime.timezone.utc)
-            } for c in video_data.get("comments", [])])
+            result.set(
+                "comments",
+                [
+                    {
+                        "text": c["text"],
+                        "author": c["author"],
+                        "timestamp": datetime.datetime.fromtimestamp(c.get("timestamp"), tz=datetime.timezone.utc),
+                    }
+                    for c in video_data.get("comments", [])
+                ],
+            )

        # then add the common metadata
        if timestamp := video_data.pop("timestamp", None):
-            timestamp = datetime.datetime.fromtimestamp(timestamp, tz = datetime.timezone.utc).isoformat()
+            timestamp = datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc).isoformat()
            result.set_timestamp(timestamp)
        if upload_date := video_data.pop("upload_date", None):
-            upload_date = datetime.datetime.strptime(upload_date, '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
+            upload_date = datetime.datetime.strptime(upload_date, "%Y%m%d").replace(tzinfo=datetime.timezone.utc)
            result.set("upload_date", upload_date)
-        
+
        # then clean away any keys we don't want
        for clean_key in self.keys_to_clean(info_extractor, video_data):
            video_data.pop(clean_key, None)
-        
+
        # then add the rest of the video data
        for k, v in video_data.items():
            if v:
@@ -138,26 +245,28 @@ class GenericExtractor(Extractor):

        if not dropin:
            # TODO: add a proper link to 'how to create your own dropin'
-            logger.debug(f"""Could not find valid dropin for {info_extractor.IE_NAME}.
+            logger.debug(f"""Could not find valid dropin for {info_extractor.ie_key()}.
                     Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""")
            return False
-        
+
        post_data = dropin.extract_post(url, ie_instance)
        result = dropin.create_metadata(post_data, ie_instance, self, url)
        return self.add_metadata(post_data, info_extractor, url, result)

-    def get_metadata_for_video(self, data: dict, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
-
+    def get_metadata_for_video(
+        self, data: dict, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL
+    ) -> Metadata:
        # this time download
-        ydl.params['getcomments'] = self.comments
-        #TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
+        ydl.params["getcomments"] = self.comments
+        # TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
        data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=True)
        if "entries" in data:
            entries = data.get("entries", [])
            if not len(entries):
-                logger.warning('YoutubeDLArchiver could not find any video')
+                logger.warning("YoutubeDLArchiver could not find any video")
                return False
-        else: entries = [data]
+        else:
+            entries = [data]

        result = Metadata()

@@ -165,17 +274,18 @@ class GenericExtractor(Extractor):
            try:
                filename = ydl.prepare_filename(entry)
                if not os.path.exists(filename):
-                    filename = filename.split('.')[0] + '.mkv'
+                    filename = filename.split(".")[0] + ".mkv"

                new_media = Media(filename)
                for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
-                    if x in entry: new_media.set(x, entry[x])
+                    if x in entry:
+                        new_media.set(x, entry[x])

                # read text from subtitles if enabled
                if self.subtitles:
-                    for lang, val in (data.get('requested_subtitles') or {}).items():
-                        try:    
-                            subs = pysubs2.load(val.get('filepath'), encoding="utf-8")
+                    for lang, val in (data.get("requested_subtitles") or {}).items():
+                        try:
+                            subs = pysubs2.load(val.get("filepath"), encoding="utf-8")
                            text = " ".join([line.text for line in subs])
                            new_media.set(f"subtitles_{lang}", text)
                        except Exception as e:
@@ -185,8 +295,8 @@ class GenericExtractor(Extractor):
                logger.error(f"Error processing entry {entry}: {e}")

        return self.add_metadata(data, info_extractor, url, result)
-    
-    def dropin_for_name(self, dropin_name: str, additional_paths = [], package=__package__) -> Type[InfoExtractor]:
+
+    def dropin_for_name(self, dropin_name: str, additional_paths=[], package=__package__) -> Type[InfoExtractor]:
        dropin_name = dropin_name.lower()

        if dropin_name == "generic":
@@ -194,6 +304,7 @@ class GenericExtractor(Extractor):
            return None

        dropin_class_name = dropin_name.title()
+
        def _load_dropin(dropin):
            dropin_class = getattr(dropin, dropin_class_name)()
            dropin.extractor = self
@@ -218,7 +329,7 @@ class GenericExtractor(Extractor):
                return _load_dropin(dropin)
            except (FileNotFoundError, ModuleNotFoundError):
                pass
-        
+
        # fallback to loading the dropins within auto-archiver
        try:
            return _load_dropin(importlib.import_module(f".{dropin_name}", package=package))
@@ -230,46 +341,53 @@ class GenericExtractor(Extractor):
    def download_for_extractor(self, info_extractor: InfoExtractor, url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
        """
        Tries to download the given url using the specified extractor
-        
+
        It first tries to use ytdlp directly to download the video. If the post is not a video, it will then try to
        use the extractor's _extract_post method to get the post metadata if possible.
        """
        # when getting info without download, we also don't need the comments
-        ydl.params['getcomments'] = False
+        ydl.params["getcomments"] = False
        result = False

        dropin_submodule = self.dropin_for_name(info_extractor.ie_key())

        try:
-            if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor):
-                logger.debug(f"Skipping using ytdlp to download files for {info_extractor.ie_key()} (dropin override)")
-                raise Skip()
+            if dropin_submodule and dropin_submodule.skip_ytdlp_download(info_extractor, url):
+                logger.debug(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}")
+                raise SkipYtdlp()

            # don't download since it can be a live stream
            data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
-            if data.get('is_live', False) and not self.livestreams:
+            if data.get("is_live", False) and not self.livestreams:
                logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
                return False
            # it's a valid video, that the youtubdedl can download out of the box
            result = self.get_metadata_for_video(data, info_extractor, url, ydl)

        except Exception as e:
-            if info_extractor.ie_key() == "generic":
+            if info_extractor.IE_NAME == "generic":
                # don't clutter the logs with issues about the 'generic' extractor not having a dropin
                return False
-            
-            if not isinstance(e, Skip):
-                logger.debug(f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead')
+
+            if not isinstance(e, SkipYtdlp):
+                logger.debug(
+                    f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead'
+                )

            try:
                result = self.get_metadata_for_post(info_extractor, url, ydl)
            except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
-                logger.error(f'Error downloading metadata for post: {post_e}')
+                logger.error("Error downloading metadata for post: {error}", error=str(post_e))
                return False
            except Exception as generic_e:
-                logger.debug(f'Attempt to extract using ytdlp dropin for "{info_extractor.IE_NAME}" failed:  \n  {repr(generic_e)}', exc_info=True)
+                logger.debug(
+                    'Attempt to extract using ytdlp extractor "{name}" failed:  \n  {error}',
+                    name=info_extractor.IE_NAME,
+                    error=str(generic_e),
+                    exc_info=True,
+                )
                return False
-        
+
        if result:
            extractor_name = "yt-dlp"
            if info_extractor:
@@ -285,42 +403,65 @@ class GenericExtractor(Extractor):
     def download(self, item: Metadata) -> Metadata:
+        """
+        Archives the item's URL by trying each suitable yt-dlp extractor in turn.
+
+        Returns the first truthy result of `download_for_extractor`, or False if no extractor produces one.
+        """
         url = item.get_url()

-        #TODO: this is a temporary hack until this issue is closed: https://github.com/yt-dlp/yt-dlp/issues/11025
+        # TODO: this is a temporary hack until this issue is closed: https://github.com/yt-dlp/yt-dlp/issues/11025
         if url.startswith("https://ya.ru"):
             url = url.replace("https://ya.ru", "https://yandex.ru")
             item.set("replaced_url", url)

+        ydl_options = [
+            "-o",
+            os.path.join(self.tmp_dir, "%(id)s.%(ext)s"),
+            "--quiet",
+            "--no-playlist" if not self.allow_playlist else "--yes-playlist",
+            "--write-subs" if self.subtitles else "--no-write-subs",
+            "--write-auto-subs" if self.subtitles else "--no-write-auto-subs",
+            "--live-from-start" if self.live_from_start else "--no-live-from-start",
+        ]
+
+        # optional settings: only add a flag when it is set, and pass each flag and its value as
+        # separate argv items (a combined "--flag value" string is rejected as an unknown option)
+        if self.proxy:
+            ydl_options.extend(("--proxy", self.proxy))
+        if self.max_downloads != "inf":
+            ydl_options.extend(("--max-downloads", str(self.max_downloads)))
+            ydl_options.extend(("--playlist-end", str(self.max_downloads)))

-        ydl_options = {'outtmpl': os.path.join(self.tmp_dir, f'%(id)s.%(ext)s'), 
-                       'quiet': False, 'noplaylist': not self.allow_playlist ,
-                       'writesubtitles': self.subtitles,'writeautomaticsub': self.subtitles,
-                       "live_from_start": self.live_from_start, "proxy": self.proxy,
-                       "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
-        
         # set up auth
         auth = self.auth_for_site(url, extract_cookies=False)
+
         # order of importance: username/pasword -> api_key -> cookie -> cookies_from_browser -> cookies_file
         if auth:
-            if 'username' in auth and 'password' in auth:
-                logger.debug(f'Using provided auth username and password for {url}')
-                ydl_options['username'] = auth['username']
-                ydl_options['password'] = auth['password']
-            elif 'cookie' in auth:
-                logger.debug(f'Using provided auth cookie for {url}')
-                yt_dlp.utils.std_headers['cookie'] = auth['cookie']
-            elif 'cookies_from_browser' in auth:
-                logger.debug(f'Using extracted cookies from browser {self.cookies_from_browser} for {url}')
-                ydl_options['cookiesfrombrowser'] = auth['cookies_from_browser']
-            elif 'cookies_file' in auth:
-                logger.debug(f'Using cookies from file {self.cookie_file} for {url}')
-                ydl_options['cookiesfile'] = auth['cookies_file']
+            if "username" in auth and "password" in auth:
+                logger.debug(f"Using provided auth username and password for {url}")
+                ydl_options.extend(("--username", auth["username"]))
+                ydl_options.extend(("--password", auth["password"]))
+            elif "cookie" in auth:
+                logger.debug(f"Using provided auth cookie for {url}")
+                yt_dlp.utils.std_headers["cookie"] = auth["cookie"]
+            elif "cookies_from_browser" in auth:
+                logger.debug(f"Using extracted cookies from browser {auth['cookies_from_browser']} for {url}")
+                ydl_options.extend(("--cookies-from-browser", auth["cookies_from_browser"]))
+            elif "cookies_file" in auth:
+                logger.debug(f"Using cookies from file {auth['cookies_file']} for {url}")
+                ydl_options.extend(("--cookies", auth["cookies_file"]))

-        ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
+        if self.ytdlp_args:
+            logger.debug(f"Adding additional ytdlp arguments: {self.ytdlp_args}")
+            ydl_options += self.ytdlp_args.split()
+
+        *_, validated_options = yt_dlp.parse_options(ydl_options)
+        ydl = yt_dlp.YoutubeDL(
+            validated_options
+        )  # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"

         for info_extractor in self.suitable_extractors(url):
             result = self.download_for_extractor(info_extractor, url, ydl)
             if result:
                 return result

-
         return False
--- a/src/auto_archiver/modules/generic_extractor/tiktok.py
+++ b/src/auto_archiver/modules/generic_extractor/tiktok.py
@@ -0,0 +1,91 @@
+import requests
+from loguru import logger
+from auto_archiver.core import Metadata, Media
+from datetime import datetime, timezone
+from .dropin import GenericDropin
+
+
+class Tiktok(GenericDropin):
+    """
+    TikTok dropin for the Generic Extractor that uses an unofficial API if/when ytdlp fails.
+    It's useful for capturing content that requires a login, like sensitive content.
+    """
+
+    TIKWM_ENDPOINT = "https://www.tikwm.com/api/?url={url}"
+    # seconds to wait for tikwm.com before giving up, so a stalled connection cannot hang the archiver
+    REQUEST_TIMEOUT = 30
+
+    def extract_post(self, url: str, ie_instance):
+        """
+        Fetches the post data for `url` from the tikwm.com API.
+
+        Returns the API's `data` dict, with the `play`/`wmplay` keys removed and the chosen link stored as `video_url`.
+        Raises ValueError on a non-200 status, unparseable JSON, an unsuccessful response or a missing video link.
+        """
+        logger.debug(f"Using Tikwm API to attempt to download tiktok video from {url=}")
+
+        endpoint = self.TIKWM_ENDPOINT.format(url=url)
+
+        r = requests.get(endpoint, timeout=self.REQUEST_TIMEOUT)
+        if r.status_code != 200:
+            raise ValueError(f"unexpected status code '{r.status_code}' from tikwm.com for {url=}:")
+
+        try:
+            json_response = r.json()
+        except ValueError as e:
+            raise ValueError(f"failed to parse JSON response from tikwm.com for {url=}") from e
+
+        api_data = json_response.get("data")
+        if json_response.get("msg") != "success" or not api_data:
+            raise ValueError(f"failed to get a valid response from tikwm.com for {url=}: {repr(json_response)}")
+
+        # tries to get the non-watermarked version first; both keys are removed from the returned data
+        # and an empty "play" value falls back to the watermarked link
+        play_url = api_data.pop("play", None)
+        watermarked_url = api_data.pop("wmplay", None)
+        video_url = play_url or watermarked_url
+        if not video_url:
+            raise ValueError(f"no valid video URL found in response from tikwm.com for {url=}")
+
+        api_data["video_url"] = video_url
+        return api_data
+
+    def create_metadata(self, post: dict, ie_instance, archiver, url):
+        """
+        Builds a Metadata result from the dict returned by `extract_post`.
+
+        The cover image is optional, the video is required: returns False when the video download fails.
+        Keys used here are popped from `post`; whatever remains is stored under `api_data`.
+        """
+        # prepare result and take the video link out of the post data
+        result = Metadata()
+        video_url = post.pop("video_url")
+
+        # get the cover if possible: every candidate key is removed from `post`, the first non-empty one is used
+        cover_candidates = [post.pop(key, None) for key in ("origin_cover", "cover", "ai_dynamic_cover")]
+        cover_url = next((candidate for candidate in cover_candidates if candidate), None)
+        if cover_url and (cover_downloaded := archiver.download_from_url(cover_url)):
+            result.add_media(Media(cover_downloaded))
+
+        # get the video or fail
+        video_downloaded = archiver.download_from_url(video_url, f"vid_{post.get('id', '')}")
+        if not video_downloaded:
+            logger.error(f"failed to download video from {video_url}")
+            return False
+        video_media = Media(video_downloaded)
+        if duration := post.pop("duration", None):
+            video_media.set("duration", duration)
+        result.add_media(video_media)
+
+        # add remaining metadata
+        result.set_title(post.pop("title", ""))
+
+        if created_at := post.pop("create_time", None):
+            result.set_timestamp(datetime.fromtimestamp(created_at, tz=timezone.utc))
+
+        if author := post.pop("author", None):
+            result.set("author", author)
+
+        result.set("api_data", post)
+
+        return result
--- a/src/auto_archiver/modules/generic_extractor/truth.py
+++ b/src/auto_archiver/modules/generic_extractor/truth.py
@@ -9,11 +9,11 @@ from dateutil.parser import parse as parse_dt

 from .dropin import GenericDropin

-class Truth(GenericDropin):

+class Truth(GenericDropin):
    def extract_post(self, url, ie_instance: InfoExtractor) -> dict:
        video_id = ie_instance._match_id(url)
-        truthsocial_url = f'https://truthsocial.com/api/v1/statuses/{video_id}'
+        truthsocial_url = f"https://truthsocial.com/api/v1/statuses/{video_id}"
        return ie_instance._download_json(truthsocial_url, video_id)

    def skip_ytdlp_download(self, url, ie_instance: Type[InfoExtractor]) -> bool:
@@ -22,31 +22,42 @@ class Truth(GenericDropin):
    def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
        """
        Creates metadata from a truth social post
-        
+
        Only used for posts that contain no media. ytdlp.TruthIE extractor can handle posts with media
-        
+
        Format is:
-        
+
        {'id': '109598702184774628', 'created_at': '2022-12-29T19:51:18.161Z', 'in_reply_to_id': None, 'quote_id': None, 'in_reply_to_account_id': None, 'sensitive': False, 'spoiler_text': '', 'visibility': 'public', 'language': 'en', 'uri': 'https://truthsocial.com/@bbcnewa/109598702184774628', 'url': 'https://truthsocial.com/@bbcnewa/109598702184774628', 'content': '<p>Pele, regarded by many as football\'s greatest ever player, has died in Brazil at the age of 82. <a href="https://www.bbc.com/sport/football/42751517" rel="nofollow noopener noreferrer" target="_blank"><span class="invisible">https://www.</span><span class="ellipsis">bbc.com/sport/football/4275151</span><span class="invisible">7</span></a></p>', 'account': {'id': '107905163010312793', 'username': 'bbcnewa', 'acct': 'bbcnewa', 'display_name': 'BBC News', 'locked': False, 'bot': False, 'discoverable': True, 'group': False, 'created_at': '2022-03-05T17:42:01.159Z', 'note': '<p>News, features and analysis by the BBC</p>', 'url': 'https://truthsocial.com/@bbcnewa', 'avatar': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/avatars/107/905/163/010/312/793/original/e7c07550dc22c23a.jpeg', 'avatar_static': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/avatars/107/905/163/010/312/793/original/e7c07550dc22c23a.jpeg', 'header': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/headers/107/905/163/010/312/793/original/a00eeec2b57206c7.jpeg', 'header_static': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/headers/107/905/163/010/312/793/original/a00eeec2b57206c7.jpeg', 'followers_count': 1131, 'following_count': 3, 'statuses_count': 9, 'last_status_at': '2024-11-12', 'verified': False, 'location': '', 'website': 'https://www.bbc.com/news', 'unauth_visibility': True, 'chats_onboarded': True, 'feeds_onboarded': True, 'accepting_messages': False, 'show_nonmember_group_statuses': None, 'emojis': [], 'fields': [], 
'tv_onboarded': True, 'tv_account': False}, 'media_attachments': [], 'mentions': [], 'tags': [], 'card': None, 'group': None, 'quote': None, 'in_reply_to': None, 'reblog': None, 'sponsored': False, 'replies_count': 1, 'reblogs_count': 0, 'favourites_count': 2, 'favourited': False, 'reblogged': False, 'muted': False, 'pinned': False, 'bookmarked': False, 'poll': None, 'emojis': []}
        """

        result = Metadata()
        result.set_url(url)
-        timestamp = post['created_at'] # format is 2022-12-29T19:51:18.161Z
+        timestamp = post["created_at"]  # format is 2022-12-29T19:51:18.161Z
        result.set_timestamp(parse_dt(timestamp))
-        result.set('description', post['content'])
-        result.set('author', post['account']['username'])
+        result.set("description", post["content"])
+        result.set("author", post["account"]["username"])

-        for key in ['replies_count', 'reblogs_count', 'favourites_count', ('account', 'followers_count'), ('account', 'following_count'), ('account', 'statuses_count'), ('account', 'display_name'), 'language', 'in_reply_to_account', 'replies_count']:
+        for key in [
+            "replies_count",
+            "reblogs_count",
+            "favourites_count",
+            ("account", "followers_count"),
+            ("account", "following_count"),
+            ("account", "statuses_count"),
+            ("account", "display_name"),
+            "language",
+            "in_reply_to_account",
+            "replies_count",
+        ]:
            if isinstance(key, tuple):
                store_key = " ".join(key)
            else:
                store_key = key
            result.set(store_key, traverse_obj(post, key))
-        
-        # add the media
-        for media in post.get('media_attachments', []):
-            filename = archiver.download_from_url(media['url'])
-            result.add_media(Media(filename), id=media.get('id'))

-        return result
+        # add the media
+        for media in post.get("media_attachments", []):
+            filename = archiver.download_from_url(media["url"])
+            result.add_media(Media(filename), id=media.get("id"))
+
+        return result
--- a/src/auto_archiver/modules/generic_extractor/twitter.py
+++ b/src/auto_archiver/modules/generic_extractor/twitter.py
@@ -1,4 +1,6 @@
-import re, mimetypes, json
+import re
+import mimetypes
+import json
 from datetime import datetime

 from loguru import logger
@@ -10,9 +12,8 @@ from auto_archiver.core.extractor import Extractor

 from .dropin import GenericDropin, InfoExtractor

+
 class Twitter(GenericDropin):
-
-
    def choose_variant(self, variants):
        # choosing the highest quality possible
        variant, width, height = None, 0, 0
@@ -27,44 +28,43 @@ class Twitter(GenericDropin):
            else:
                variant = var if not variant else variant
        return variant
-    
+
    def extract_post(self, url: str, ie_instance: InfoExtractor):
-        twid = ie_instance._match_valid_url(url).group('id')
+        twid = ie_instance._match_valid_url(url).group("id")
        return ie_instance._extract_status(twid=twid)

    def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
        result = Metadata()
        try:
            if not tweet.get("user") or not tweet.get("created_at"):
-                raise ValueError(f"Error retreiving post. Are you sure it exists?")
+                raise ValueError("Error retreiving post. Are you sure it exists?")
            timestamp = datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
        except (ValueError, KeyError) as ex:
            logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
            return False
-                
-        result\
-            .set_title(tweet.get('full_text', ''))\
-            .set_content(json.dumps(tweet, ensure_ascii=False))\
-            .set_timestamp(timestamp)
+
+        result.set_title(tweet.get("full_text", "")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(
+            timestamp
+        )
        if not tweet.get("entities", {}).get("media"):
-            logger.debug('No media found, archiving tweet text only')
+            logger.debug("No media found, archiving tweet text only")
            result.status = "twitter-ytdl"
            return result
        for i, tw_media in enumerate(tweet["entities"]["media"]):
            media = Media(filename="")
            mimetype = ""
            if tw_media["type"] == "photo":
-                media.set("src", UrlUtil.twitter_best_quality_url(tw_media['media_url_https']))
+                media.set("src", UrlUtil.twitter_best_quality_url(tw_media["media_url_https"]))
                mimetype = "image/jpeg"
            elif tw_media["type"] == "video":
-                variant = self.choose_variant(tw_media['video_info']['variants'])
-                media.set("src", variant['url'])
-                mimetype = variant['content_type']
+                variant = self.choose_variant(tw_media["video_info"]["variants"])
+                media.set("src", variant["url"])
+                mimetype = variant["content_type"]
            elif tw_media["type"] == "animated_gif":
-                variant = tw_media['video_info']['variants'][0]
-                media.set("src", variant['url'])
-                mimetype = variant['content_type']
+                variant = tw_media["video_info"]["variants"][0]
+                media.set("src", variant["url"])
+                mimetype = variant["content_type"]
            ext = mimetypes.guess_extension(mimetype)
-            media.filename = archiver.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}')
+            media.filename = archiver.download_from_url(media.get("src"), f"{slugify(url)}_{i}{ext}")
            result.add_media(media)
-        return result
+        return result