Create facebook dropin - working for images + text. CAVEAT: only gets the first ~100 chars of the post at the moment

2026-06-12 21:28:29 +03:00 · 2025-02-25 11:44:35 +00:00
parent 5211c5de18
commit f8e846d59a
5 changed files with 110 additions and 23 deletions
--- a/src/auto_archiver/modules/generic_extractor/dropin.py
+++ b/src/auto_archiver/modules/generic_extractor/dropin.py
@@ -1,3 +1,4 @@
+from typing import Type
 from yt_dlp.extractor.common import InfoExtractor
 from auto_archiver.core.metadata import Metadata
 from auto_archiver.core.extractor import Extractor
@@ -23,6 +24,8 @@ class GenericDropin:

    """

+    extractor: Type[Extractor] = None
+
    def extract_post(self, url: str, ie_instance: InfoExtractor):
        """
        This method should return the post data from the url.
@@ -55,4 +58,11 @@ class GenericDropin:
        """
        This method should download any additional media from the post.
        """
-        return metadata
+        return metadata
+    
+    def is_suitable(self, url, info_extractor: InfoExtractor):
+        """
+        Used to override the InfoExtractor's 'suitable' check. Dropins should override this method to return True if the url is suitable for the dropin,
+        even when the InfoExtractor's own 'suitable' would not accept it. The base implementation returns False.
+        """
+        return False
--- a/src/auto_archiver/modules/generic_extractor/facebook.py
+++ b/src/auto_archiver/modules/generic_extractor/facebook.py
@@ -1,18 +1,36 @@
+import re
 from .dropin import GenericDropin
-
+from auto_archiver.core.metadata import Metadata
+from auto_archiver.core.media import Media

 class Facebook(GenericDropin):
-    def extract_post(self, url: str, ie_instance):
-        video_id = ie_instance._match_valid_url(url).group('id')
-        ie_instance._download_webpage(
-            url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
-        webpage = ie_instance._download_webpage(url, ie_instance._match_valid_url(url).group('id'))
-
-        # TODO: fix once https://github.com/yt-dlp/yt-dlp/pull/12275 is merged
-        post_data = ie_instance._extract_metadata(webpage)
-        return post_data
    
+    def extract_post(self, url: str, ie_instance):
+        """Download the post's page and return what ie_instance._extract_metadata parses from it. Raises ValueError if the url has no post id."""
+        match = re.search(r'(?P<id>pfbid[A-Za-z0-9]+|\d+|t\.(\d+\/\d+))', url)
+        if match is None:  # fail with a clear message instead of AttributeError on None.group()
+            raise ValueError(f'Could not find a Facebook post id in URL: {url}')
+        post_id = match.group('id')
+        webpage = ie_instance._download_webpage(url.replace('://m.facebook.com/', '://www.facebook.com/'), post_id)
+        # WARN: Will only work once https://github.com/yt-dlp/yt-dlp/pull/12275 is merged
+        # TODO: For long posts, this _extract_metadata only seems to return the first 100 or so characters, followed by ...
+        return ie_instance._extract_metadata(webpage, post_id)
+
    def create_metadata(self, post: dict, ie_instance, archiver, url):
-        metadata = archiver.create_metadata(url)
-        metadata.set_title(post.get('title')).set_content(post.get('description')).set_post_data(post)
-        return metadata
+        result = Metadata()
+        result.set_content(post.get('description', ''))
+        result.set_title(post.get('title', ''))
+        result.set('author', post.get('uploader', ''))
+        result.set_url(url)
+        return result
+    
+    def is_suitable(self, url, info_extractor):
+        """Return True when url is on facebook.com (any subdomain) or the Facebook onion host; info_extractor is unused."""
+        return bool(re.match(r'https?://(?:[\w-]+\.)?(?:facebook\.com|facebookwkhpilnemxj7asaniu7vnjjbiltxjqhye3mhbshg7kx5tfyd\.onion)/', url))  # single '|': '||' added an empty alternative that let other hosts match
+    
+    def skip_ytdlp_download(self, url: str, ie_instance):
+        """
+        Skip using the ytdlp download method for Facebook *photo* posts, they have a URL with an id of t.XXXXX/XXXXX
+        """
+        # the dot is escaped so only a literal "t." segment matches; always returns a bool (never an implicit None)
+        return bool(re.search(r'/t\.\d+/\d+', url))
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -8,6 +8,8 @@ from loguru import logger
 from auto_archiver.core.extractor import Extractor
 from auto_archiver.core import Metadata, Media

+class Skip(Exception):  # raised in the download path when a dropin's skip_ytdlp_download() returns a truthy value
+    pass
 class GenericExtractor(Extractor):
    _dropins = {}

@@ -15,8 +17,20 @@ class GenericExtractor(Extractor):
        """
        Returns a list of valid extractors for the given URL"""
        for info_extractor in yt_dlp.YoutubeDL()._ies.values():
-            if info_extractor.suitable(url) and info_extractor.working():
+            if not info_extractor.working():
+                continue
+
+            # check if there's a dropin and see if that declares whether it's suitable
+            dropin = self.dropin_for_name(info_extractor.ie_key())
+            if dropin and dropin.is_suitable(url, info_extractor):
                yield info_extractor
+                continue
+
+            if info_extractor.suitable(url):
+                yield info_extractor
+                continue
+            
+
        
    def suitable(self, url: str) -> bool:
        """
@@ -129,7 +143,8 @@ class GenericExtractor(Extractor):
            return False
        
        post_data = dropin.extract_post(url, ie_instance)
-        return dropin.create_metadata(post_data, ie_instance, self, url)
+        result = dropin.create_metadata(post_data, ie_instance, self, url)
+        return self.add_metadata(post_data, info_extractor, url, result)

    def get_metadata_for_video(self, data: dict, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:

@@ -181,6 +196,7 @@ class GenericExtractor(Extractor):
        dropin_class_name = dropin_name.title()
        def _load_dropin(dropin):
            dropin_class = getattr(dropin, dropin_class_name)()
+            dropin_class.extractor = self  # set on the new dropin instance, not on `dropin` (the object the class was looked up on)
            return self._dropins.setdefault(dropin_name, dropin_class)

        try:
@@ -225,8 +241,9 @@ class GenericExtractor(Extractor):
        dropin_submodule = self.dropin_for_name(info_extractor.ie_key())

        try:
-            if dropin_submodule and dropin_submodule.skip_ytdlp_download(info_extractor, url):
-                raise Exception(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}")
+            if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor):
+                logger.debug(f"Skipping using ytdlp to download files for {info_extractor.ie_key()} (dropin override)")
+                raise Skip()

            # don't download since it can be a live stream
            data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
@@ -240,15 +257,17 @@ class GenericExtractor(Extractor):
            if info_extractor.ie_key() == "generic":
                # don't clutter the logs with issues about the 'generic' extractor not having a dropin
                return False
+            
+            if not isinstance(e, Skip):
+                logger.debug(f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead')

-            logger.debug(f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use extractor to get post data instead')
            try:
                result = self.get_metadata_for_post(info_extractor, url, ydl)
            except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
                logger.error(f'Error downloading metadata for post: {post_e}')
                return False
            except Exception as generic_e:
-                logger.debug(f'Attempt to extract using ytdlp extractor "{info_extractor.IE_NAME}" failed:  \n  {repr(generic_e)}', exc_info=True)
+                logger.debug(f'Attempt to extract using ytdlp dropin for "{info_extractor.IE_NAME}" failed:  \n  {repr(generic_e)}', exc_info=True)
                return False
        
        if result:
--- a/src/auto_archiver/modules/local_storage/manifest.py
+++ b/src/auto_archiver/modules/local_storage/manifest.py
@@ -17,7 +17,7 @@
            "choices": ["random", "static"],
        },
        "save_to": {"default": "./local_archive", "help": "folder where to save archived content"},
-        "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
+        "save_absolute": {"default": True, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (Warning: saving an absolute path will show your computer's file structure)"},
    },
    "description": """
    LocalStorage: A storage module for saving archived content locally on the filesystem.