from .dropin import GenericDropin
from auto_archiver.core.metadata import Metadata
from auto_archiver.core.media import Media

# TODO: Remove if / when https://github.com/yt-dlp/yt-dlp/pull/12275 is merged
from yt_dlp.utils import (
    clean_html,
    get_element_by_id,
    traverse_obj,
    get_first,
    merge_dicts,
    int_or_none,
    parse_count,
)


def _extract_metadata(self, webpage, video_id):
    """Collect post/video metadata from an already-downloaded Facebook page.

    This is a module-level stand-in for an extractor method, so ``self`` is
    passed explicitly by the caller.

    Args:
        self: Object providing the extractor helpers used below
            (``_parse_json``, ``_html_search_regex``, ``_html_search_meta``,
            ``_meta_regex``, ``_og_regexes``, ``_search_regex``,
            ``_search_json_ld``).
        webpage: HTML source of the page, as a string.
        video_id: Identifier of the post/video; compared as a string against
            media ids and searched for inside post URLs.

    Returns:
        The result of ``merge_dicts(info_json_ld, info_dict)``, where
        ``info_json_ld`` comes from the page's JSON-LD (with a computed
        ``title``) and ``info_dict`` holds the fields scraped here.
    """
    # Each <script data-sjs> element carries one ScheduledServerJS JSON blob;
    # the closing tag anchors the non-greedy match to the end of the blob.
    post_data = [
        self._parse_json(j, video_id, fatal=False)
        for j in re.findall(r'data-sjs>({.*?ScheduledServerJS.*?})</script>', webpage)
    ]
    post = traverse_obj(post_data, (
        ..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'),
        expected_type=dict) or []
    media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: (
        k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict)

    title = get_first(media, ('title', 'text'))
    description = get_first(
        media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text'))

    # Fall back to markup / meta tags when the structured data has no title.
    # Every pattern must expose a group named "content" (see group= below).
    page_title = title or self._html_search_regex((
        r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>',
        r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(?P<content>.*?)</span>',
        self._meta_regex('og:title'), self._meta_regex('twitter:title'), r'<title>(?P<content>.+?)</title>',
    ), webpage, 'title', default=None, group='content')
    description = description or self._html_search_meta(
        ['description', 'og:description', 'twitter:description'],
        webpage, 'description', default=None)

    # The owner record can sit in several places; take the first one found.
    uploader_data = (
        get_first(media, ('owner', {dict}))
        or get_first(post, ('video', 'creation_story', 'attachments', ..., 'media', lambda k, v: k == 'owner' and v['name']))
        or get_first(post, (..., 'video', lambda k, v: k == 'owner' and v['name']))
        or get_first(post, ('node', 'actors', ..., {dict}))
        or get_first(post, ('event', 'event_creator', {dict}))
        or get_first(post, ('video', 'creation_story', 'short_form_video_context', 'video_owner', {dict})) or {})
    uploader = uploader_data.get('name') or (
        clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))
        or self._search_regex(
            (r'ownerName\s*:\s*"([^"]+)"', *self._og_regexes('title')), webpage, 'uploader', fatal=False))

    timestamp = int_or_none(self._search_regex(
        r'<abbr[^>]+data-utime=["\'](\d+)', webpage,
        'timestamp', default=None))
    thumbnail = self._html_search_meta(
        ['og:image', 'twitter:image'], webpage, 'thumbnail', default=None)
    # some webpages contain unretrievable thumbnail urls
    # like https://lookaside.fbsbx.com/lookaside/crawler/media/?media_id=10155168902769113&get_thumbnail=1
    # in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/
    if thumbnail and not re.search(r'\.(?:jpg|png)', thumbnail):
        thumbnail = None

    info_dict = {
        'description': description,
        'uploader': uploader,
        'uploader_id': uploader_data.get('id'),
        'timestamp': timestamp,
        'thumbnail': thumbnail,
        'view_count': parse_count(self._search_regex(
            (r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)'),
            webpage, 'view count', default=None)),
        'concurrent_view_count': get_first(post, (
            ('video', (..., ..., 'attachments', ..., 'media')), 'liveViewerCount', {int_or_none})),
        **traverse_obj(post, (lambda _, v: video_id in v['url'], 'feedback', {
            'like_count': ('likers', 'count', {int}),
            'comment_count': ('total_comment_count', {int}),
            'repost_count': ('share_count_reduced', {parse_count}),
        }), get_all=False),
    }

    info_json_ld = self._search_json_ld(webpage, video_id, default={})
    # Title preference: structured title, JSON-LD title, page title (with a
    # trailing "| Facebook" removed), then the description flattened to one
    # line, then a generic placeholder.
    info_json_ld['title'] = (
        re.sub(r'\s*\|\s*Facebook$', '', title or info_json_ld.get('title') or page_title or '')
        or (description or '').replace('\n', ' ') or f'Facebook video #{video_id}')
    return merge_dicts(info_json_ld, info_dict)
class Facebook(GenericDropin): def extract_post(self, url: str, ie_instance): + post_id_regex = r'(?P<id>pfbid[A-Za-z0-9]+|\d+|t\.(\d+\/\d+))' post_id = re.search(post_id_regex, url).group('id') webpage = ie_instance._download_webpage( url.replace('://m.facebook.com/', '://www.facebook.com/'), post_id) - # WARN: Will only work once https://github.com/yt-dlp/yt-dlp/pull/12275 is merged # TODO: For long posts, this _extract_metadata only seems to return the first 100 or so characters, followed by ... - post_data = ie_instance._extract_metadata(webpage, post_id) + + # TODO: If/when https://github.com/yt-dlp/yt-dlp/pull/12275 is merged, uncomment next line and delete the one after + # post_data = ie_instance._extract_metadata(webpage, post_id) + post_data = _extract_metadata(ie_instance, webpage, post_id) return post_data def create_metadata(self, post: dict, ie_instance, archiver, url): @@ -33,4 +108,5 @@ class Facebook(GenericDropin): Skip using the ytdlp download method for Facebook *photo* posts, they have a URL with an id of t.XXXXX/XXXXX """ if re.search(r'/t.\d+/\d+', url): - return True \ No newline at end of file + return True +