Merge pull request #223 from bellingcat/facebook_extractor

Create facebook dropin - working for images + text.
This commit is contained in:
Patrick Robertson
2025-03-17 12:45:05 +00:00
committed by GitHub
6 changed files with 236 additions and 25 deletions

View File

@@ -1,3 +1,4 @@
from typing import Type
from yt_dlp.extractor.common import InfoExtractor
from auto_archiver.core.metadata import Metadata
from auto_archiver.core.extractor import Extractor
@@ -24,6 +25,8 @@ class GenericDropin:
"""
extractor: Type[Extractor] = None
def extract_post(self, url: str, ie_instance: InfoExtractor):
"""
This method should return the post data from the url.
@@ -55,3 +58,10 @@ class GenericDropin:
This method should download any additional media from the post.
"""
return metadata
def is_suitable(self, url, info_extractor: InfoExtractor):
    """
    Tell the caller whether this dropin wants to handle `url` itself.

    This is the dropin-side counterpart of the InfoExtractor's suitability check:
    a dropin that is able to parse URLs the InfoExtractor would not accept should
    override this method and return True for those URLs.

    The base implementation never claims a URL.
    """
    claims_url = False
    return claims_url

View File

@@ -1,17 +1,154 @@
import re
from .dropin import GenericDropin
from auto_archiver.core.metadata import Metadata
from yt_dlp.extractor.facebook import FacebookIE
# TODO: Remove if / when https://github.com/yt-dlp/yt-dlp/pull/12275 is merged
from yt_dlp.utils import (
clean_html,
get_element_by_id,
traverse_obj,
get_first,
merge_dicts,
int_or_none,
parse_count,
)
def _extract_metadata(self, webpage, video_id):
    """
    Assemble a yt-dlp style info dict for a Facebook post from its downloaded HTML.

    Local stand-in for the `_extract_metadata` method proposed in
    https://github.com/yt-dlp/yt-dlp/pull/12275. `self` is the yt-dlp extractor
    instance whose parsing helpers (`_parse_json`, `_html_search_regex`, ...) are used.

    :param webpage: HTML source of the post page.
    :param video_id: id of the post; used to select the matching entries in the
        embedded JSON and to build the last-resort title.
    :return: the page's JSON-LD info merged (via `merge_dicts`) with the fields
        scraped from the embedded JSON and the HTML.
    """
    # Parse every ScheduledServerJS <script> payload embedded in the page (fatal=False).
    server_js_payloads = []
    for raw_payload in re.findall(r"data-sjs>({.*?ScheduledServerJS.*?})</script>", webpage):
        server_js_payloads.append(self._parse_json(raw_payload, video_id, fatal=False))

    bbox_data_path = (..., "require", ..., ..., ..., "__bbox", "require", ..., ..., ..., "__bbox", "result", "data")
    post = traverse_obj(server_js_payloads, bbox_data_path, expected_type=dict) or []

    def _is_requested_video(key, value):
        # keep only the "media" attachment that is the video with the requested id
        return key == "media" and str(value["id"]) == video_id and value["__typename"] == "Video"

    media = traverse_obj(post, (..., "attachments", ..., _is_requested_video), expected_type=dict)

    title = get_first(media, ("title", "text"))
    description = get_first(media, ("creation_story", "comet_sections", "message", "story", "message", "text"))

    # Only scrape the HTML for a title when the embedded JSON did not provide one.
    if title:
        page_title = title
    else:
        page_title = self._html_search_regex(
            (
                r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>',
                r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(?P<content>.*?)</span>',
                self._meta_regex("og:title"),
                self._meta_regex("twitter:title"),
                r"<title>(?P<content>.+?)</title>",
            ),
            webpage,
            "title",
            default=None,
            group="content",
        )

    if not description:
        description = self._html_search_meta(
            ["description", "og:description", "twitter:description"], webpage, "description", default=None
        )

    # Candidate locations of the post owner, tried in order; the first non-empty hit wins.
    owner_lookups = (
        (media, ("owner", {dict})),
        (
            post,
            ("video", "creation_story", "attachments", ..., "media", lambda k, v: k == "owner" and v["name"]),
        ),
        (post, (..., "video", lambda k, v: k == "owner" and v["name"])),
        (post, ("node", "actors", ..., {dict})),
        (post, ("event", "event_creator", {dict})),
        (post, ("video", "creation_story", "short_form_video_context", "video_owner", {dict})),
    )
    uploader_data = {}
    for container, owner_path in owner_lookups:
        candidate = get_first(container, owner_path)
        if candidate:
            uploader_data = candidate
            break

    # Uploader name: embedded JSON first, then the photo-page author element, then regexes on the page.
    uploader = uploader_data.get("name")
    if not uploader:
        uploader = clean_html(get_element_by_id("fbPhotoPageAuthorName", webpage))
    if not uploader:
        uploader = self._search_regex(
            (r'ownerName\s*:\s*"([^"]+)"', *self._og_regexes("title")), webpage, "uploader", fatal=False
        )

    timestamp = int_or_none(
        self._search_regex(r'<abbr[^>]+data-utime=["\'](\d+)', webpage, "timestamp", default=None)
    )

    thumbnail = self._html_search_meta(["og:image", "twitter:image"], webpage, "thumbnail", default=None)
    # some webpages contain unretrievable thumbnail urls
    # like https://lookaside.fbsbx.com/lookaside/crawler/media/?media_id=10155168902769113&get_thumbnail=1
    # in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/
    if thumbnail and re.search(r"\.(?:jpg|png)", thumbnail) is None:
        thumbnail = None

    raw_view_count = self._search_regex(
        (r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)'),
        webpage,
        "view count",
        default=None,
    )
    live_viewer_count = get_first(
        post, (("video", (..., ..., "attachments", ..., "media")), "liveViewerCount", {int_or_none})
    )
    # like/comment/share counters of the first entry whose url contains the requested id
    engagement = traverse_obj(
        post,
        (
            lambda _, v: video_id in v["url"],
            "feedback",
            {
                "like_count": ("likers", "count", {int}),
                "comment_count": ("total_comment_count", {int}),
                "repost_count": ("share_count_reduced", {parse_count}),
            },
        ),
        get_all=False,
    )

    info_dict = {
        "description": description,
        "uploader": uploader,
        "uploader_id": uploader_data.get("id"),
        "timestamp": timestamp,
        "thumbnail": thumbnail,
        "view_count": parse_count(raw_view_count),
        "concurrent_view_count": live_viewer_count,
        **engagement,
    }

    info_json_ld = self._search_json_ld(webpage, video_id, default={})
    # Title preference: JSON title, JSON-LD title, page title (minus the "| Facebook" suffix),
    # then the description flattened to one line, then a generic placeholder.
    stripped_title = re.sub(r"\s*\|\s*Facebook$", "", title or info_json_ld.get("title") or page_title or "")
    info_json_ld["title"] = (
        stripped_title or (description or "").replace("\n", " ") or f"Facebook video #{video_id}"
    )
    return merge_dicts(info_json_ld, info_dict)
class Facebook(GenericDropin):
    """
    Dropin for Facebook posts, working on top of yt-dlp's FacebookIE.

    It downloads the post's web page, parses it with `_extract_metadata` and turns
    the result into a Metadata object (content, title, author and url).
    """

    def extract_post(self, url: str, ie_instance: FacebookIE):
        """
        Download the web page of the post at `url` and parse its metadata.

        :param url: URL of the Facebook post.
        :param ie_instance: yt-dlp extractor instance used to download and parse the page.
        :return: the dict produced by `_extract_metadata`.
        :raises ValueError: if no post id can be found in `url`.
        """
        post_id_regex = r"(?P<id>pfbid[A-Za-z0-9]+|\d+|t\.(\d+\/\d+))"
        id_match = re.search(post_id_regex, url)
        if id_match is None:
            # fail with an explicit message instead of an AttributeError on None
            raise ValueError(f"Could not find a Facebook post id in url: {url}")
        post_id = id_match.group("id")

        # m.facebook.com urls are rewritten so that the www page is the one downloaded
        webpage = ie_instance._download_webpage(url.replace("://m.facebook.com/", "://www.facebook.com/"), post_id)

        # TODO: For long posts, this _extract_metadata only seems to return the first 100 or so characters, followed by ...
        # TODO: If/when https://github.com/yt-dlp/yt-dlp/pull/12275 is merged, replace the call below
        #       with `ie_instance._extract_metadata(webpage, post_id)` and delete the module-level copy
        post_data = _extract_metadata(ie_instance, webpage, post_id)
        return post_data

    def create_metadata(self, post: dict, ie_instance: FacebookIE, archiver, url):
        """
        Build the Metadata for a post from the dict returned by `extract_post`.

        :param post: post data; the 'description', 'title' and 'uploader' keys are read,
            each falling back to an empty string when the key is absent.
        :param ie_instance: unused here.
        :param archiver: unused here.
        :param url: URL stored on the resulting Metadata.
        :return: a new Metadata with content, title, author and url set.
        """
        result = Metadata()
        result.set_content(post.get("description", ""))
        result.set_title(post.get("title", ""))
        result.set("author", post.get("uploader", ""))
        result.set_url(url)
        return result

    def is_suitable(self, url, info_extractor: FacebookIE):
        """
        Return True for any URL hosted on facebook.com (optionally with a subdomain)
        or on Facebook's onion address.
        """
        # NOTE: a single `|` separates the two hosts; a doubled `||` would add an empty
        # alternative and accept URLs with no Facebook host at all (e.g. "https:///x").
        regex = r"(?:https?://(?:[\w-]+\.)?(?:facebook\.com|facebookwkhpilnemxj7asaniu7vnjjbiltxjqhye3mhbshg7kx5tfyd\.onion)/)"
        return re.match(regex, url) is not None

    def skip_ytdlp_download(self, url: str, is_instance: FacebookIE):
        """
        Skip using the ytdlp download method for Facebook *photo* posts, they have a URL with an id of t.XXXXX/XXXXX

        :return: True when `url` contains such an id, False otherwise.
        """
        # the dot is escaped so that only a literal "t." prefix is accepted
        return re.search(r"/t\.\d+/\d+", url) is not None

View File

@@ -67,8 +67,18 @@ class GenericExtractor(Extractor):
"""
Returns a list of valid extractors for the given URL"""
for info_extractor in yt_dlp.YoutubeDL()._ies.values():
if info_extractor.suitable(url) and info_extractor.working():
if not info_extractor.working():
continue
# check if there's a dropin and see if that declares whether it's suitable
dropin = self.dropin_for_name(info_extractor.ie_key())
if dropin and dropin.is_suitable(url, info_extractor):
yield info_extractor
continue
if info_extractor.suitable(url):
yield info_extractor
continue
def suitable(self, url: str) -> bool:
"""
@@ -188,9 +198,13 @@ class GenericExtractor(Extractor):
result = self.download_additional_media(video_data, info_extractor, result)
# keep both 'title' and 'fulltitle', but prefer 'title', falling back to 'fulltitle' if it doesn't exist
result.set_title(video_data.pop("title", video_data.pop("fulltitle", "")))
result.set_url(url)
if "description" in video_data:
if not result.get_title():
result.set_title(video_data.pop("title", video_data.pop("fulltitle", "")))
if not result.get("url"):
result.set_url(url)
if "description" in video_data and not result.get_content():
result.set_content(video_data["description"])
# extract comments if enabled
if self.comments:
@@ -207,10 +221,10 @@ class GenericExtractor(Extractor):
)
# then add the common metadata
if timestamp := video_data.pop("timestamp", None):
# The walrus must be parenthesised: `:=` binds more loosely than `and`, so without the
# parentheses `timestamp` would be bound to the boolean result of the whole condition
# instead of the popped value. An already-set timestamp on `result` is kept.
if (timestamp := video_data.pop("timestamp", None)) and not result.get("timestamp"):
    timestamp = datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc).isoformat()
    result.set_timestamp(timestamp)
if upload_date := video_data.pop("upload_date", None):
# The walrus must be parenthesised: `:=` binds more loosely than `and`, so without the
# parentheses `upload_date` would be bound to a boolean and `strptime` would be handed
# True instead of the popped "%Y%m%d" string. An already-set upload_date on `result` is kept.
if (upload_date := video_data.pop("upload_date", None)) and not result.get("upload_date"):
    upload_date = datetime.datetime.strptime(upload_date, "%Y%m%d").replace(tzinfo=datetime.timezone.utc)
    result.set("upload_date", upload_date)
@@ -240,7 +254,8 @@ class GenericExtractor(Extractor):
return False
post_data = dropin.extract_post(url, ie_instance)
return dropin.create_metadata(post_data, ie_instance, self, url)
result = dropin.create_metadata(post_data, ie_instance, self, url)
return self.add_metadata(post_data, info_extractor, url, result)
def get_metadata_for_video(
self, data: dict, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL
@@ -296,6 +311,7 @@ class GenericExtractor(Extractor):
def _load_dropin(dropin):
    """
    Instantiate the dropin class found on `dropin` and register it under `dropin_name`.

    `dropin` is the object exposing the class as the attribute `dropin_class_name`
    (presumably the imported dropin module - confirm against the caller).
    Returns the registered instance; if one is already registered under
    `dropin_name`, that existing instance is returned instead.
    """
    dropin_instance = getattr(dropin, dropin_class_name)()
    # Give the dropin *instance* a handle on this extractor; setting the attribute on
    # `dropin` itself would leave the instance's `extractor` attribute untouched.
    dropin_instance.extractor = self
    return self._dropins.setdefault(dropin_name, dropin_instance)
try:
@@ -340,7 +356,7 @@ class GenericExtractor(Extractor):
dropin_submodule = self.dropin_for_name(info_extractor.ie_key())
try:
if dropin_submodule and dropin_submodule.skip_ytdlp_download(info_extractor, url):
if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor):
logger.debug(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}")
raise SkipYtdlp()
@@ -359,7 +375,7 @@ class GenericExtractor(Extractor):
if not isinstance(e, SkipYtdlp):
logger.debug(
f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use extractor to get post data instead'
f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead'
)
try:

View File

@@ -38,6 +38,9 @@ class Tiktok(GenericDropin):
api_data["video_url"] = video_url
return api_data
def keys_to_clean(self, video_data: dict, info_extractor):
    """
    Return the names of the post keys this dropin asks to have cleaned.

    The list is fixed: it does not depend on `video_data` or `info_extractor`.
    Presumably the caller strips these keys from the data it stores - confirm
    against the extractor that consumes this list.
    """
    handled_keys = (
        "video_url",
        "title",
        "create_time",
        "author",
        "cover",
        "origin_cover",
        "ai_dynamic_cover",
        "duration",
    )
    return list(handled_keys)
def create_metadata(self, post: dict, ie_instance, archiver, url):
# prepare result, start by downloading video
result = Metadata()
@@ -54,17 +57,17 @@ class Tiktok(GenericDropin):
logger.error(f"failed to download video from {video_url}")
return False
video_media = Media(video_downloaded)
if duration := post.pop("duration", None):
if duration := post.get("duration", None):
video_media.set("duration", duration)
result.add_media(video_media)
# add remaining metadata
result.set_title(post.pop("title", ""))
result.set_title(post.get("title", ""))
if created_at := post.pop("create_time", None):
if created_at := post.get("create_time", None):
result.set_timestamp(datetime.fromtimestamp(created_at, tz=timezone.utc))
if author := post.pop("author", None):
if author := post.get("author", None):
result.set("author", author)
result.set("api_data", post)