Copy ytdlp code into AA project - seems like ytdlp won't be merged anytime soon

2026-06-08 03:18:28 +03:00 · 2025-03-17 09:57:05 +00:00
parent f8e846d59a
commit 7e360240bf
1 changed files with 79 additions and 3 deletions
--- a/src/auto_archiver/modules/generic_extractor/facebook.py
+++ b/src/auto_archiver/modules/generic_extractor/facebook.py
@@ -3,17 +3,92 @@ from .dropin import GenericDropin
 from auto_archiver.core.metadata import Metadata
 from auto_archiver.core.media import Media

+# TODO: Remove if / when  https://github.com/yt-dlp/yt-dlp/pull/12275 is merged
+from yt_dlp.utils import (
+    clean_html,
+    get_element_by_id,
+    traverse_obj,
+    get_first,
+    merge_dicts,
+    int_or_none,
+    parse_count,
+
+)
+
+def _extract_metadata(self, webpage, video_id):
+    post_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall(
+        r'data-sjs>({.*?ScheduledServerJS.*?})</script>', webpage)]
+    post = traverse_obj(post_data, (
+        ..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or []
+    media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: (
+        k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict)
+    title = get_first(media, ('title', 'text'))
+    description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text'))
+    page_title = title or self._html_search_regex((
+        r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>',
+        r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(?P<content>.*?)</span>',
+        self._meta_regex('og:title'), self._meta_regex('twitter:title'), r'<title>(?P<content>.+?)</title>',
+    ), webpage, 'title', default=None, group='content')
+    description = description or self._html_search_meta(
+        ['description', 'og:description', 'twitter:description'],
+        webpage, 'description', default=None)
+    uploader_data = (
+        get_first(media, ('owner', {dict}))
+        or get_first(post, ('video', 'creation_story', 'attachments', ..., 'media', lambda k, v: k == 'owner' and v['name']))
+        or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name']))
+        or get_first(post, ('node', 'actors', ..., {dict}))
+        or get_first(post, ('event', 'event_creator', {dict}))
+        or get_first(post, ('video', 'creation_story', 'short_form_video_context', 'video_owner', {dict})) or {})
+    uploader = uploader_data.get('name') or (
+        clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))
+        or self._search_regex(
+            (r'ownerName\s*:\s*"([^"]+)"', *self._og_regexes('title')), webpage, 'uploader', fatal=False))
+    timestamp = int_or_none(self._search_regex(
+        r'<abbr[^>]+data-utime=["\'](\d+)', webpage,
+        'timestamp', default=None))
+    thumbnail = self._html_search_meta(
+        ['og:image', 'twitter:image'], webpage, 'thumbnail', default=None)
+    # some webpages contain unretrievable thumbnail urls
+    # like https://lookaside.fbsbx.com/lookaside/crawler/media/?media_id=10155168902769113&get_thumbnail=1
+    # in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/
+    if thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail):
+        thumbnail = None
+    info_dict = {
+        'description': description,
+        'uploader': uploader,
+        'uploader_id': uploader_data.get('id'),
+        'timestamp': timestamp,
+        'thumbnail': thumbnail,
+        'view_count': parse_count(self._search_regex(
+            (r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)'),
+            webpage, 'view count', default=None)),
+        'concurrent_view_count': get_first(post, (
+            ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})),
+        **traverse_obj(post, (lambda _, v: video_id in v['url'], 'feedback', {
+            'like_count': ('likers', 'count', {int}),
+            'comment_count': ('total_comment_count', {int}),
+            'repost_count': ('share_count_reduced', {parse_count}),
+        }), get_all=False),
+    }
+
+    info_json_ld = self._search_json_ld(webpage, video_id, default={})
+    info_json_ld['title'] = (re.sub(r'\s*\|\s*Facebook$', '', title or info_json_ld.get('title') or page_title or '')
+                                or (description or '').replace('\n', ' ') or f'Facebook video #{video_id}')
+    return merge_dicts(info_json_ld, info_dict)
 class Facebook(GenericDropin):
    
    def extract_post(self, url: str, ie_instance):
+
        post_id_regex = r'(?P<id>pfbid[A-Za-z0-9]+|\d+|t\.(\d+\/\d+))'
        post_id = re.search(post_id_regex, url).group('id')
        webpage = ie_instance._download_webpage(
            url.replace('://m.facebook.com/', '://www.facebook.com/'), post_id)

-        # WARN: Will only work once https://github.com/yt-dlp/yt-dlp/pull/12275 is merged
        # TODO: For long posts, this _extract_metadata only seems to return the first 100 or so characters, followed by ...
-        post_data = ie_instance._extract_metadata(webpage, post_id)
+
+        # TODO: If/when https://github.com/yt-dlp/yt-dlp/pull/12275 is merged, uncomment next line and delete the one after
+        # post_data = ie_instance._extract_metadata(webpage, post_id)
+        post_data = _extract_metadata(ie_instance, webpage, post_id)
        return post_data

    def create_metadata(self, post: dict, ie_instance, archiver, url):
@@ -33,4 +108,5 @@ class Facebook(GenericDropin):
        Skip using the ytdlp download method for Facebook *photo* posts, they have a URL with an id of t.XXXXX/XXXXX
        """
        if re.search(r'/t.\d+/\d+', url):
-            return True
+            return True
+