From c3dd19f3092fe4aa88b31293aab85498e6802b2b Mon Sep 17 00:00:00 2001
From: Patrick Robertson <robertson.patrick@gmail.com>
Date: Wed, 15 Jan 2025 17:02:19 +0100
Subject: [PATCH 01/20] Sniff filetype of downloaded media and add extension

Also download in chunks - resolves two TODOs
---
 poetry.lock                             | 16 ++++++++++--
 pyproject.toml                          |  3 ++-
 src/auto_archiver/archivers/archiver.py | 33 +++++++++++++++++++------
 3 files changed, 41 insertions(+), 11 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 1b31740..97e1035 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.0.0 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand.
 
 [[package]]
 name = "aiohappyeyeballs"
@@ -889,6 +889,18 @@ future = "*"
 [package.extras]
 dev = ["Sphinx (==2.1.0)", "future (==0.17.1)", "numpy (==1.16.4)", "pytest (==4.6.1)", "pytest-mock (==1.10.4)", "tox (==3.12.1)"]
 
+[[package]]
+name = "filetype"
+version = "1.2.0"
+description = "Infer file type and MIME type of any file/buffer. No external dependencies."
+optional = false
+python-versions = "*"
+groups = ["main"]
+files = [
+    {file = "filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25"},
+    {file = "filetype-1.2.0.tar.gz", hash = "sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb"},
+]
+
 [[package]]
 name = "flask"
 version = "3.1.0"
@@ -3296,4 +3308,4 @@ test = ["pytest (>=8.1,<9.0)"]
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.10,<3.13"
-content-hash = "7c7dc6d26e5af1c9bb6e4393b4ac64b155049d20a9f5317baec48c964a2708ac"
+content-hash = "df1bd49271b2682b82da437c2e6ce3842d116aa0fc7769e9ab9958c91a8647b2"
diff --git a/pyproject.toml b/pyproject.toml
index 9fd4547..c5d2a9b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -59,7 +59,8 @@ dependencies = [
     "retrying (>=0.0.0)",
     "tsp-client (>=0.0.0)",
     "certvalidator (>=0.0.0)",
-    "toml (>=0.10.2,<0.11.0)"
+    "toml (>=0.10.2,<0.11.0)",
+    "filetype (>=1.2.0,<2.0.0)"
 ]
 
 [tool.poetry.group.dev.dependencies]
diff --git a/src/auto_archiver/archivers/archiver.py b/src/auto_archiver/archivers/archiver.py
index 25e08c3..24bb53c 100644
--- a/src/auto_archiver/archivers/archiver.py
+++ b/src/auto_archiver/archivers/archiver.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
+from pathlib import Path
 from abc import abstractmethod
 from dataclasses import dataclass
+import filetype
 import os
 import mimetypes, requests
 from loguru import logger
@@ -46,10 +48,8 @@ class Archiver(Step):
     @retry(wait_random_min=500, wait_random_max=3500, stop_max_attempt_number=5)
     def download_from_url(self, url: str, to_filename: str = None, verbose=True) -> str:
         """
-        downloads a URL to provided filename, or inferred from URL, returns local filename
+            downloads a URL to provided filename, or inferred from URL, returns local filename
         """
-        # TODO: should we refactor to use requests.get(url, stream=True) and write to file in chunks? compare approaches
-        # TODO: should we guess the extension?
         if not to_filename:
             to_filename = url.split('/')[-1].split('?')[0]
             if len(to_filename) > 64:
@@ -59,11 +59,28 @@ class Archiver(Step):
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
         }
-        d = requests.get(url, headers=headers)
-        assert d.status_code == 200, f"got response code {d.status_code} for {url=}"
-        with open(to_filename, 'wb') as f:
-            f.write(d.content)
-        return to_filename
+        try:
+            d = requests.get(url, stream=True, headers=headers)
+            d.raise_for_status()
+
+            # Peek at the first 256 bytes
+            first_256 = d.raw.read(256)
+
+            # Use filetype to guess the extension if there isn't already one
+            if not Path(to_filename).suffix:
+                guessed = filetype.guess(first_256)
+                extension = guessed.extension if guessed else None
+                if extension:
+                    to_filename += f".{extension}"
+
+            with open(to_filename, 'wb') as f:
+                f.write(first_256)
+                for chunk in d.iter_content(chunk_size=8192):
+                    f.write(chunk)
+            return to_filename
+        
+        except requests.RequestException as e:
+            logger.warning(f"Failed to fetch the Media URL: {e}")
 
     @abstractmethod
     def download(self, item: Metadata) -> Metadata: pass

From 4f2b9baa73ae047e5da6614fb70afe685e090842 Mon Sep 17 00:00:00 2001
From: Patrick Robertson <robertson.patrick@gmail.com>
Date: Wed, 15 Jan 2025 17:39:47 +0100
Subject: [PATCH 02/20] refactor youtubedlp archiver to work for all valid
 websites

1. Extract more metadata
2. Better extract thumbnail
3. Set up a framework for specific sites to provide more granular metadata processing
---
 src/auto_archiver/archivers/archiver.py       |   8 ++
 .../archivers/youtubedl_archiver.py           | 136 +++++++++++++++---
 tests/archivers/test_archiver_base.py         |   2 +-
 tests/archivers/test_youtubedl_archiver.py    |  57 ++++++++
 4 files changed, 182 insertions(+), 21 deletions(-)
 create mode 100644 tests/archivers/test_youtubedl_archiver.py

diff --git a/src/auto_archiver/archivers/archiver.py b/src/auto_archiver/archivers/archiver.py
index 24bb53c..6fed8b7 100644
--- a/src/auto_archiver/archivers/archiver.py
+++ b/src/auto_archiver/archivers/archiver.py
@@ -34,6 +34,14 @@ class Archiver(Step):
     def sanitize_url(self, url: str) -> str:
         # used to clean unnecessary URL parameters OR unfurl redirect links
         return url
+    
+    def suitable(self, url: str) -> bool:
+        """
+        Returns True if this archiver can handle the given URL
+        
+        Should be overridden by subclasses
+        """
+        return True
 
     def _guess_file_type(self, path: str) -> str:
         """
diff --git a/src/auto_archiver/archivers/youtubedl_archiver.py b/src/auto_archiver/archivers/youtubedl_archiver.py
index b13cceb..97ad569 100644
--- a/src/auto_archiver/archivers/youtubedl_archiver.py
+++ b/src/auto_archiver/archivers/youtubedl_archiver.py
@@ -33,7 +33,115 @@ class YoutubeDLArchiver(Archiver):
             "cookies_from_browser": {"default": None, "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"},
             "cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"},
         }
+    
+    def download_additional_media(self, ie: str, video_data: dict, metadata: Metadata) -> Metadata:
+        """
+        Downloads additional media like images, comments, subtitles, etc.
 
+        Creates a 'media' object and attaches it to the metadata object.
+        """
+
+        # TODO: should we download all thumbnails, or just the chosen thumbnail?
+
+        # Right now, just getting the single thumbnail
+        thumbnail_url = video_data.get('thumbnail')
+        if thumbnail_url:
+            try:
+                cover_image_path = self.download_from_url(thumbnail_url)
+                media = Media(cover_image_path)
+                metadata.add_media(media, id="cover")
+            except Exception as e:
+                logger.error(f"Error downloading cover image {thumbnail_url}: {e}")
+
+        return metadata
+
+    def keys_to_clean(self, ie: str, video_data: dict) -> dict:
+        """
+        Clean up the video data to make it more readable and remove unnecessary keys that ytdlp adds
+        """
+
+        base_keys = ['formats', 'thumbnail', 'display_id', 'epoch', 'requested_downloads',
+                     'duration_string', 'thumbnails', 'http_headers', 'webpage_url_basename', 'webpage_url_domain',
+                     'extractor', 'extractor_key', 'playlist', 'playlist_index', 'duration_string', 'protocol', 'requested_subtitles',
+                     'format_id', 'acodec', 'vcodec', 'ext', 'epoch', '_has_drm', 'filesize', 'audio_ext', 'video_ext', 'vbr', 'abr',
+                     'resolution', 'dynamic_range', 'aspect_ratio', 'cookies', 'format', 'quality', 'preference', 'artists',
+                     'channel_id', 'subtitles', 'tbr', 'url', 'original_url', 'automatic_captions', 'playable_in_embed', 'live_status',
+                     '_format_sort_fields', 'chapters', 'uploader_id', 'uploader_url', 'requested_formats', 'format_note',
+                     'audio_channels', 'asr', 'fps', 'was_live', 'is_live', 'heatmap', 'age_limit', 'stretched_ratio']
+        if ie == 'TikTok':
+            return base_keys + []
+        
+        return base_keys
+    
+    def add_metadata(self, ie: str, video_data: dict, url:str, result: Metadata) -> Metadata:
+        """
+        Creates a Metadata object from the give video_data
+        """
+
+        # first add the media
+        result = self.download_additional_media(ie, video_data, result)
+
+        # keep the full title, no need for the shortened title (?)
+        video_data['title'] = video_data.pop('fulltitle', video_data.get('title'))
+        result.set_title(video_data.pop('title', url))
+
+        # then add the platform specific additional metadata
+        for key, mapping in self.video_data_metadata_mapping(ie, video_data).items():
+            if isinstance(mapping, str):
+                result.set(key, eval(f"video_data{mapping}"))
+            elif callable(mapping):
+                result.set(key, mapping(video_data))
+        result.set_url(url)
+
+        # extract comments if enabled
+        if self.comments:
+            result.set("comments", [{
+                "text": c["text"],
+                "author": c["author"], 
+                "timestamp": datetime.datetime.fromtimestamp(c.get("timestamp"), tz = datetime.timezone.utc)
+            } for c in video_data.get("comments", [])])
+
+        # then add the common metadata
+        if (timestamp := video_data.pop("timestamp", None)):
+            timestamp = datetime.datetime.fromtimestamp(timestamp, tz = datetime.timezone.utc).isoformat()
+            result.set_timestamp(timestamp)
+        if (upload_date := video_data.pop("upload_date", None)):
+            upload_date = datetime.datetime.strptime(upload_date, '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
+            result.set("upload_date", upload_date)
+        
+        # then clean away any keys we don't want
+        for clean_key in self.keys_to_clean(ie, video_data):
+            video_data.pop(clean_key, None)
+        
+        # then add the rest of the video data
+        for k, v in video_data.items():
+            if v:
+                result.set(k, v)
+
+        return result
+
+    def video_data_metadata_mapping(self, ie: str, video_data: dict) -> dict:
+        """
+        Returns a key->value mapping to map from the yt-dlp produced 'video_data' to the Metadata object.
+        Can be either a string for direct mapping, or a function, or a lambda.
+        """
+        return {}
+
+    def suitable(self, item: Metadata) -> bool:
+        """
+        Checks for valid URLs out of all ytdlp extractors.
+        Returns False for the GenericIE, which as labelled by yt-dlp: 'Generic downloader that works on some sites'
+        """
+        url = item.get_url()
+        for ie_key, ie in yt_dlp.YoutubeDL()._ies.items():
+            # Note: this will return True for *all* URLs due to the 'generic' extractor from ytdlp (valid for all URLs).
+            # should we check for the 'GenericIE' extractor and return False?
+            # if ie.IE_NAME == 'generic'... - leaving it in for now, since we also want the ability to download from generic sites
+            # perhaps one solution is to return 'False' initially, and then if no other installed archivers work, we try again using the generic one
+            if ie.suitable(url) and ie.working():
+                return True
+        return False
+    
     def download(self, item: Metadata) -> Metadata:
         url = item.get_url()
 
@@ -70,7 +178,6 @@ class YoutubeDLArchiver(Archiver):
         ydl = yt_dlp.YoutubeDL({**ydl_options, "getcomments": self.comments})
         #TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
         info = ydl.extract_info(url, download=True)
-
         if "entries" in info:
             entries = info.get("entries", [])
             if not len(entries):
@@ -78,9 +185,9 @@ class YoutubeDLArchiver(Archiver):
                 return False
         else: entries = [info]
 
+        ie = info['extractor_key']
         result = Metadata()
-        result.set_title(info.get("title"))
-        if "description" in info: result.set_content(info["description"])
+
         for entry in entries:
             try:
                 filename = ydl.prepare_filename(entry)
@@ -104,22 +211,11 @@ class YoutubeDLArchiver(Archiver):
             except Exception as e:
                 logger.error(f"Error processing entry {entry}: {e}")
 
-        # extract comments if enabled
-        if self.comments:
-            result.set("comments", [{
-                "text": c["text"],
-                "author": c["author"], 
-                "timestamp": datetime.datetime.fromtimestamp(c.get("timestamp"), tz = datetime.timezone.utc)
-            } for c in info.get("comments", [])])
+        result = self.add_metadata(ie, info, url, result)
+        extractor_name = "yt-dlp"
+        if ie:
+            extractor_name += f"--{ie}IE"
 
-        if (timestamp := info.get("timestamp")):
-            #TODO: fix deprecated timestamp, 
-            timestamp = datetime.datetime.fromtimestamp(timestamp, tz = datetime.timezone.utc).isoformat()
-            result.set_timestamp(timestamp)
-        if (upload_date := info.get("upload_date")):
-            upload_date = datetime.datetime.strptime(upload_date, '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
-            result.set("upload_date", upload_date)
-
-        if self.end_means_success: result.success("yt-dlp")
-        else: result.status = "yt-dlp"
+        if self.end_means_success: result.success(extractor_name)
+        else: result.status = extractor_name
         return result
diff --git a/tests/archivers/test_archiver_base.py b/tests/archivers/test_archiver_base.py
index 3c9ffbd..ed77739 100644
--- a/tests/archivers/test_archiver_base.py
+++ b/tests/archivers/test_archiver_base.py
@@ -13,7 +13,7 @@ class TestArchiverBase(object):
     def setup_archiver(self):
         assert self.archiver_class is not None, "self.archiver_class must be set on the subclass"
         assert self.config is not None, "self.config must be a dict set on the subclass"
-        self.archiver = self.archiver_class(self.config)
+        self.archiver = self.archiver_class({self.archiver_class.name: self.config})
     
     def assertValidResponseMetadata(self, test_response: Metadata, title: str, timestamp: str, status: str = ""):
         assert test_response is not False
diff --git a/tests/archivers/test_youtubedl_archiver.py b/tests/archivers/test_youtubedl_archiver.py
new file mode 100644
index 0000000..f9f1d76
--- /dev/null
+++ b/tests/archivers/test_youtubedl_archiver.py
@@ -0,0 +1,57 @@
+import pytest
+from pathlib import Path
+
+from auto_archiver.archivers.youtubedl_archiver import YoutubeDLArchiver
+
+from .test_archiver_base import TestArchiverBase
+
+class TestYoutubeDLArchiver(TestArchiverBase):
+    """Tests YoutubeDL Archiver
+    """
+    archiver_class = YoutubeDLArchiver
+    config = {
+        'subtitles': False,
+        'comments': False,
+        'livestreams': False,
+        'live_from_start': False,
+        'end_means_success': True,
+        'allow_playlist': False,
+        'max_downloads': "inf",
+        'proxy': None,
+        'cookies_from_browser': False,
+        'cookie_file': None,
+        }
+
+    @pytest.mark.parametrize("url, is_suitable", [
+        ("https://www.youtube.com/watch?v=5qap5aO4i9A", True),
+        ("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", True),
+        ("https://www.instagram.com/p/CU1J9JYJ9Zz/", True),
+        ("https://www.facebook.com/nytimes/videos/10160796550110716", True),
+        ("https://www.twitch.tv/videos/1167226570", True),
+        ("https://bellingcat.com/news/2021/10/08/ukrainian-soldiers-are-being-killed-by-landmines-in-the-donbas/", True),
+        ("https://google.com", True)])
+    def test_suitable_urls(self, make_item, url, is_suitable):
+        """
+            Note: expected behaviour is to return True for all URLs, as YoutubeDLArchiver should be able to handle all URLs
+            This behaviour may be changed in the future (e.g. if we want the youtubedl archiver to just handle URLs it has extractors for,
+            and then if and only if all archivers fails, does it fall back to the generic archiver)
+        """
+        assert self.archiver.suitable(make_item(url)) == is_suitable
+
+    @pytest.mark.download
+    def test_download_tiktok(self, make_item):
+        item = make_item("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970")
+        result = self.archiver.download(item)
+        assert result.get_url() == "https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970"
+    
+    @pytest.mark.download
+    def test_download_youtube(self, make_item):
+        # url https://www.youtube.com/watch?v=5qap5aO4i9A
+        item = make_item("https://www.youtube.com/watch?v=J---aiyznGQ")
+        result = self.archiver.download(item)
+        assert result.get_url() == "https://www.youtube.com/watch?v=J---aiyznGQ"
+        assert result.get_title() == "Keyboard Cat! - THE ORIGINAL!"
+        assert result.get('description') == "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
+        assert len(result.media) == 2
+        assert Path(result.media[0].filename).name == "J---aiyznGQ.webm"
+        assert Path(result.media[1].filename).name == "hqdefault.jpg"
\ No newline at end of file

From 3ff7a9444dae760c1706b7b7cc3816fb39116dd1 Mon Sep 17 00:00:00 2001
From: Patrick Robertson <robertson.patrick@gmail.com>
Date: Wed, 15 Jan 2025 17:58:07 +0100
Subject: [PATCH 03/20] Update yt-dlp to latest version (2025.1.12) to add bsky
 support

---
 poetry.lock    | 73 ++++++++------------------------------------------
 pyproject.toml |  2 +-
 2 files changed, 12 insertions(+), 63 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index f733d5e..b7e811a 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -132,7 +132,7 @@ description = "Python bindings for the Brotli compression library"
 optional = false
 python-versions = "*"
 groups = ["main"]
-markers = "implementation_name == \"cpython\" or platform_python_implementation >= \"CPython\""
+markers = "platform_python_implementation >= \"CPython\""
 files = [
     {file = "Brotli-1.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e1140c64812cb9b06c922e77f1c26a75ec5e3f0fb2bf92cc8c58720dec276752"},
     {file = "Brotli-1.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c8fd5270e906eef71d4a8d19b7c6a43760c6abcfcc10c9101d14eb2357418de9"},
@@ -261,47 +261,6 @@ files = [
     {file = "Brotli-1.1.0.tar.gz", hash = "sha256:81de08ac11bcb85841e440c13611c00b67d3bf82698314928d0b676362546724"},
 ]
 
-[[package]]
-name = "brotlicffi"
-version = "1.1.0.0"
-description = "Python CFFI bindings to the Brotli library"
-optional = false
-python-versions = ">=3.7"
-groups = ["main"]
-markers = "implementation_name != \"cpython\""
-files = [
-    {file = "brotlicffi-1.1.0.0-cp37-abi3-macosx_10_9_x86_64.whl", hash = "sha256:9b7ae6bd1a3f0df532b6d67ff674099a96d22bc0948955cb338488c31bfb8851"},
-    {file = "brotlicffi-1.1.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:19ffc919fa4fc6ace69286e0a23b3789b4219058313cf9b45625016bf7ff996b"},
-    {file = "brotlicffi-1.1.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9feb210d932ffe7798ee62e6145d3a757eb6233aa9a4e7db78dd3690d7755814"},
-    {file = "brotlicffi-1.1.0.0-cp37-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:84763dbdef5dd5c24b75597a77e1b30c66604725707565188ba54bab4f114820"},
-    {file = "brotlicffi-1.1.0.0-cp37-abi3-win32.whl", hash = "sha256:1b12b50e07c3911e1efa3a8971543e7648100713d4e0971b13631cce22c587eb"},
-    {file = "brotlicffi-1.1.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:994a4f0681bb6c6c3b0925530a1926b7a189d878e6e5e38fae8efa47c5d9c613"},
-    {file = "brotlicffi-1.1.0.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:2e4aeb0bd2540cb91b069dbdd54d458da8c4334ceaf2d25df2f4af576d6766ca"},
-    {file = "brotlicffi-1.1.0.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b7b0033b0d37bb33009fb2fef73310e432e76f688af76c156b3594389d81391"},
-    {file = "brotlicffi-1.1.0.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54a07bb2374a1eba8ebb52b6fafffa2afd3c4df85ddd38fcc0511f2bb387c2a8"},
-    {file = "brotlicffi-1.1.0.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7901a7dc4b88f1c1475de59ae9be59799db1007b7d059817948d8e4f12e24e35"},
-    {file = "brotlicffi-1.1.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ce01c7316aebc7fce59da734286148b1d1b9455f89cf2c8a4dfce7d41db55c2d"},
-    {file = "brotlicffi-1.1.0.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:246f1d1a90279bb6069de3de8d75a8856e073b8ff0b09dcca18ccc14cec85979"},
-    {file = "brotlicffi-1.1.0.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc4bc5d82bc56ebd8b514fb8350cfac4627d6b0743382e46d033976a5f80fab6"},
-    {file = "brotlicffi-1.1.0.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:37c26ecb14386a44b118ce36e546ce307f4810bc9598a6e6cb4f7fca725ae7e6"},
-    {file = "brotlicffi-1.1.0.0-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca72968ae4eaf6470498d5c2887073f7efe3b1e7d7ec8be11a06a79cc810e990"},
-    {file = "brotlicffi-1.1.0.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:add0de5b9ad9e9aa293c3aa4e9deb2b61e99ad6c1634e01d01d98c03e6a354cc"},
-    {file = "brotlicffi-1.1.0.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:9b6068e0f3769992d6b622a1cd2e7835eae3cf8d9da123d7f51ca9c1e9c333e5"},
-    {file = "brotlicffi-1.1.0.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8557a8559509b61e65083f8782329188a250102372576093c88930c875a69838"},
-    {file = "brotlicffi-1.1.0.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a7ae37e5d79c5bdfb5b4b99f2715a6035e6c5bf538c3746abc8e26694f92f33"},
-    {file = "brotlicffi-1.1.0.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:391151ec86bb1c683835980f4816272a87eaddc46bb91cbf44f62228b84d8cca"},
-    {file = "brotlicffi-1.1.0.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:2f3711be9290f0453de8eed5275d93d286abe26b08ab4a35d7452caa1fef532f"},
-    {file = "brotlicffi-1.1.0.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:1a807d760763e398bbf2c6394ae9da5815901aa93ee0a37bca5efe78d4ee3171"},
-    {file = "brotlicffi-1.1.0.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fa8ca0623b26c94fccc3a1fdd895be1743b838f3917300506d04aa3346fd2a14"},
-    {file = "brotlicffi-1.1.0.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3de0cf28a53a3238b252aca9fed1593e9d36c1d116748013339f0949bfc84112"},
-    {file = "brotlicffi-1.1.0.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6be5ec0e88a4925c91f3dea2bb0013b3a2accda6f77238f76a34a1ea532a1cb0"},
-    {file = "brotlicffi-1.1.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:d9eb71bb1085d996244439154387266fd23d6ad37161f6f52f1cd41dd95a3808"},
-    {file = "brotlicffi-1.1.0.0.tar.gz", hash = "sha256:b77827a689905143f87915310b93b273ab17888fd43ef350d4832c4a71083c13"},
-]
-
-[package.dependencies]
-cffi = ">=1.0.0"
-
 [[package]]
 name = "bs4"
 version = "0.0.2"
@@ -2405,37 +2364,27 @@ h11 = ">=0.9.0,<1"
 
 [[package]]
 name = "yt-dlp"
-version = "2024.9.27"
+version = "2025.1.12"
 description = "A feature-rich command-line audio/video downloader"
 optional = false
-python-versions = ">=3.8"
+python-versions = ">=3.9"
 groups = ["main"]
 files = [
-    {file = "yt_dlp-2024.9.27-py3-none-any.whl", hash = "sha256:2717468dd697fcfcf9a89f493ba30a3830cdfb276c09750e5b561b08b9ef5f69"},
-    {file = "yt_dlp-2024.9.27.tar.gz", hash = "sha256:86605542e17e2e23ad23145b637ec308133762a15a5dedac4ae50b7973237026"},
+    {file = "yt_dlp-2025.1.12-py3-none-any.whl", hash = "sha256:f7ea19afb64f8e457a1b9598ddb67f8deaa313bf1d57abd5612db9272ab10795"},
+    {file = "yt_dlp-2025.1.12.tar.gz", hash = "sha256:8e7e246e2a5a2cff0a9c13db46844a37a547680702012058c94ec18fce0ca25a"},
 ]
 
-[package.dependencies]
-brotli = {version = "*", markers = "implementation_name == \"cpython\""}
-brotlicffi = {version = "*", markers = "implementation_name != \"cpython\""}
-certifi = "*"
-mutagen = "*"
-pycryptodomex = "*"
-requests = ">=2.32.2,<3"
-urllib3 = ">=1.26.17,<3"
-websockets = ">=13.0"
-
 [package.extras]
 build = ["build", "hatchling", "pip", "setuptools (>=71.0.2)", "wheel"]
 curl-cffi = ["curl-cffi (==0.5.10)", "curl-cffi (>=0.5.10,!=0.6.*,<0.7.2)"]
-dev = ["autopep8 (>=2.0,<3.0)", "pre-commit", "pytest (>=8.1,<9.0)", "ruff (>=0.6.0,<0.7.0)"]
-py2exe = ["py2exe (>=0.12)"]
-pyinstaller = ["pyinstaller (>=6.10.0)"]
+default = ["brotli", "brotlicffi", "certifi", "mutagen", "pycryptodomex", "requests (>=2.32.2,<3)", "urllib3 (>=1.26.17,<3)", "websockets (>=13.0)"]
+dev = ["autopep8 (>=2.0,<3.0)", "pre-commit", "pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)", "ruff (>=0.9.0,<0.10.0)"]
+pyinstaller = ["pyinstaller (>=6.11.1)"]
 secretstorage = ["cffi", "secretstorage"]
-static-analysis = ["autopep8 (>=2.0,<3.0)", "ruff (>=0.6.0,<0.7.0)"]
-test = ["pytest (>=8.1,<9.0)"]
+static-analysis = ["autopep8 (>=2.0,<3.0)", "ruff (>=0.9.0,<0.10.0)"]
+test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"]
 
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.10,<3.13"
-content-hash = "a39c87f2632c852d99bb0b684de80e4ef66994cdc73e49bec67790eb8c5fc847"
+content-hash = "1c421ff71f62bd25d3c25efd6c6b49d95446243e352a4111fd9e7462c4aeb704"
diff --git a/pyproject.toml b/pyproject.toml
index 245a47f..5a27cd6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,7 +46,7 @@ dependencies = [
     "cryptography (>=41.0.0,<42.0.0)",
     "boto3 (>=1.28.0,<2.0.0)",
     "dataclasses-json (>=0.0.0)",
-    "yt-dlp (==2024.09.27)",
+    "yt-dlp (==2025.1.12)",
     "numpy (==2.1.3)",
     "vk-url-scraper (>=0.0.0)",
     "requests[socks] (>=0.0.0)",

From 5626bba8159d8c645b5a7cb66c6f5df7538da514 Mon Sep 17 00:00:00 2001
From: Patrick Robertson <robertson.patrick@gmail.com>
Date: Wed, 15 Jan 2025 18:31:20 +0100
Subject: [PATCH 04/20] Add test on bluesky and note on why it doesn't work

---
 tests/archivers/test_youtubedl_archiver.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/archivers/test_youtubedl_archiver.py b/tests/archivers/test_youtubedl_archiver.py
index f9f1d76..35c791d 100644
--- a/tests/archivers/test_youtubedl_archiver.py
+++ b/tests/archivers/test_youtubedl_archiver.py
@@ -54,4 +54,10 @@ class TestYoutubeDLArchiver(TestArchiverBase):
         assert result.get('description') == "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
         assert len(result.media) == 2
         assert Path(result.media[0].filename).name == "J---aiyznGQ.webm"
-        assert Path(result.media[1].filename).name == "hqdefault.jpg"
\ No newline at end of file
+        assert Path(result.media[1].filename).name == "hqdefault.jpg"
+
+    @pytest.mark.skip("ytdlp supports bluesky, but there's currently no way to extract info from pages without videos")
+    @pytest.mark.download
+    def test_download_bluesky_with_images(self, make_item):
+        item = make_item("https://bsky.app/profile/colborne.bsky.social/post/3lec2bqjc5s2y")
+        result = self.archiver.download(item)
\ No newline at end of file

From 3168bed0d9ab6d085c5075370a3541b70da8a02d Mon Sep 17 00:00:00 2001
From: Patrick Robertson <robertson.patrick@gmail.com>
Date: Wed, 15 Jan 2025 19:00:57 +0100
Subject: [PATCH 05/20] Add (skipped) test for twitter extraction with
 youtubedlp

---
 tests/archivers/test_youtubedl_archiver.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/tests/archivers/test_youtubedl_archiver.py b/tests/archivers/test_youtubedl_archiver.py
index 35c791d..bb5a8d2 100644
--- a/tests/archivers/test_youtubedl_archiver.py
+++ b/tests/archivers/test_youtubedl_archiver.py
@@ -60,4 +60,12 @@ class TestYoutubeDLArchiver(TestArchiverBase):
     @pytest.mark.download
     def test_download_bluesky_with_images(self, make_item):
         item = make_item("https://bsky.app/profile/colborne.bsky.social/post/3lec2bqjc5s2y")
-        result = self.archiver.download(item)
\ No newline at end of file
+        result = self.archiver.download(item)
+        assert result is not False
+
+    @pytest.mark.skip("ytdlp supports twitter, but there's currently no way to extract info from pages without videos")
+    @pytest.mark.download
+    def test_download_twitter_textonly(self, make_item):
+        item = make_item("https://x.com/bellingcat/status/1874097816571961839")
+        result = self.archiver.download(item)
+        assert result is not False
\ No newline at end of file

From 394bcd8d47a3c04d68a34a0d6a06912ad635ae0e Mon Sep 17 00:00:00 2001
From: Patrick Robertson <robertson.patrick@gmail.com>
Date: Fri, 17 Jan 2025 11:56:08 +0100
Subject: [PATCH 06/20] Further refactoring of
 youtubedl_archiver->base_archiver

* Keep twitter_api_archiver
* Remove unit tests for obsolete archivers
* Guess filename of media using the 'Content-Type' header
* Add mechanism to run 'expensive' tests last (see conftest.py) and also flag expensive tests to fail straight off (pytest.mark.incremental)
---
 .github/workflows/tests-download.yaml         |   2 +-
 poetry.lock                                   |  14 +-
 pyproject.toml                                |   1 -
 src/auto_archiver/archivers/__init__.py       |   7 +-
 src/auto_archiver/archivers/archiver.py       |  16 +-
 .../archivers/base_archiver/__init__.py       |   1 +
 .../archivers/base_archiver/base_archiver.py  | 296 ++++++++++++++++++
 .../archivers/base_archiver/bluesky.py        |  88 ++++++
 .../archivers/base_archiver/twitter.py        |  62 ++++
 .../archivers/bluesky_archiver.py             | 119 -------
 .../archivers/tiktok_archiver.py              |  55 ----
 .../archivers/twitter_api_archiver.py         |  40 ++-
 .../archivers/twitter_archiver.py             | 209 -------------
 .../archivers/youtubedl_archiver.py           | 223 +------------
 tests/archivers/test_base_archiver.py         | 141 +++++++++
 tests/archivers/test_bluesky_archiver.py      |  73 -----
 tests/archivers/test_tiktok_archiver.py       |  17 -
 ...chiver.py => test_twitter_api_archiver.py} |  90 ++----
 tests/archivers/test_youtubedl_archiver.py    |  71 -----
 tests/conftest.py                             |  72 ++++-
 20 files changed, 735 insertions(+), 862 deletions(-)
 create mode 100644 src/auto_archiver/archivers/base_archiver/__init__.py
 create mode 100644 src/auto_archiver/archivers/base_archiver/base_archiver.py
 create mode 100644 src/auto_archiver/archivers/base_archiver/bluesky.py
 create mode 100644 src/auto_archiver/archivers/base_archiver/twitter.py
 delete mode 100644 src/auto_archiver/archivers/bluesky_archiver.py
 delete mode 100644 src/auto_archiver/archivers/tiktok_archiver.py
 delete mode 100644 src/auto_archiver/archivers/twitter_archiver.py
 create mode 100644 tests/archivers/test_base_archiver.py
 delete mode 100644 tests/archivers/test_bluesky_archiver.py
 delete mode 100644 tests/archivers/test_tiktok_archiver.py
 rename tests/archivers/{test_twitter_archiver.py => test_twitter_api_archiver.py} (55%)
 delete mode 100644 tests/archivers/test_youtubedl_archiver.py

diff --git a/.github/workflows/tests-download.yaml b/.github/workflows/tests-download.yaml
index 2a1de73..fc31f42 100644
--- a/.github/workflows/tests-download.yaml
+++ b/.github/workflows/tests-download.yaml
@@ -35,4 +35,4 @@ jobs:
         run: poetry install --no-interaction --with dev
 
       - name: Run Download Tests
-        run: poetry run pytest -ra -v -m "download"
+        run: poetry run pytest -ra -v -x -m "download"
diff --git a/poetry.lock b/poetry.lock
index b7e811a..adb2726 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -630,18 +630,6 @@ future = "*"
 [package.extras]
 dev = ["Sphinx (==2.1.0)", "future (==0.17.1)", "numpy (==1.16.4)", "pytest (==4.6.1)", "pytest-mock (==1.10.4)", "tox (==3.12.1)"]
 
-[[package]]
-name = "filetype"
-version = "1.2.0"
-description = "Infer file type and MIME type of any file/buffer. No external dependencies."
-optional = false
-python-versions = "*"
-groups = ["main"]
-files = [
-    {file = "filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25"},
-    {file = "filetype-1.2.0.tar.gz", hash = "sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb"},
-]
-
 [[package]]
 name = "future"
 version = "1.0.0"
@@ -2387,4 +2375,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"]
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.10,<3.13"
-content-hash = "1c421ff71f62bd25d3c25efd6c6b49d95446243e352a4111fd9e7462c4aeb704"
+content-hash = "99800b85fc1678ba4eca510a3c01ba273f229644b08c711a2e466845794abf38"
diff --git a/pyproject.toml b/pyproject.toml
index 5a27cd6..995024a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -56,7 +56,6 @@ dependencies = [
     "retrying (>=0.0.0)",
     "tsp-client (>=0.0.0)",
     "certvalidator (>=0.0.0)",
-    "filetype (>=1.2.0,<2.0.0)",
 ]
 
 [tool.poetry.group.dev.dependencies]
diff --git a/src/auto_archiver/archivers/__init__.py b/src/auto_archiver/archivers/__init__.py
index 996ca3b..24dde91 100644
--- a/src/auto_archiver/archivers/__init__.py
+++ b/src/auto_archiver/archivers/__init__.py
@@ -1,12 +1,9 @@
 from .archiver import Archiver
 from .telethon_archiver import TelethonArchiver
-from .twitter_archiver import TwitterArchiver
 from .twitter_api_archiver import TwitterApiArchiver
 from .instagram_archiver import InstagramArchiver
 from .instagram_tbot_archiver import InstagramTbotArchiver
-from .tiktok_archiver import TiktokArchiver
 from .telegram_archiver import TelegramArchiver
 from .vk_archiver import VkArchiver
-from .youtubedl_archiver import YoutubeDLArchiver
-from .instagram_api_archiver import InstagramAPIArchiver
-from .bluesky_archiver import BlueskyArchiver
\ No newline at end of file
+from .base_archiver.base_archiver import BaseArchiver as YoutubeDLArchiver
+from .instagram_api_archiver import InstagramAPIArchiver
\ No newline at end of file
diff --git a/src/auto_archiver/archivers/archiver.py b/src/auto_archiver/archivers/archiver.py
index 6fed8b7..911389a 100644
--- a/src/auto_archiver/archivers/archiver.py
+++ b/src/auto_archiver/archivers/archiver.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 from pathlib import Path
 from abc import abstractmethod
 from dataclasses import dataclass
-import filetype
+import mimetypes
 import os
 import mimetypes, requests
 from loguru import logger
@@ -68,21 +68,17 @@ class Archiver(Step):
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
         }
         try:
-            d = requests.get(url, stream=True, headers=headers)
+            d = requests.get(url, stream=True, headers=headers, timeout=30)
             d.raise_for_status()
 
-            # Peek at the first 256 bytes
-            first_256 = d.raw.read(256)
-
-            # Use filetype to guess the extension if there isn't already one
+            # get mimetype from the response headers
             if not Path(to_filename).suffix:
-                guessed = filetype.guess(first_256)
-                extension = guessed.extension if guessed else None
+                content_type = (d.headers.get('Content-Type') or '').split(';')[0].strip()  # header may be absent or carry "; charset=..." parameters
+                extension = mimetypes.guess_extension(content_type) if content_type else None
                 if extension:
-                    to_filename += f".{extension}"
+                    to_filename += extension
 
             with open(to_filename, 'wb') as f:
-                f.write(first_256)
                 for chunk in d.iter_content(chunk_size=8192):
                     f.write(chunk)
             return to_filename
diff --git a/src/auto_archiver/archivers/base_archiver/__init__.py b/src/auto_archiver/archivers/base_archiver/__init__.py
new file mode 100644
index 0000000..15ee4eb
--- /dev/null
+++ b/src/auto_archiver/archivers/base_archiver/__init__.py
@@ -0,0 +1 @@
+from .base_archiver import BaseArchiver
\ No newline at end of file
diff --git a/src/auto_archiver/archivers/base_archiver/base_archiver.py b/src/auto_archiver/archivers/base_archiver/base_archiver.py
new file mode 100644
index 0000000..b1cbabd
--- /dev/null
+++ b/src/auto_archiver/archivers/base_archiver/base_archiver.py
@@ -0,0 +1,296 @@
+import datetime, os, yt_dlp, pysubs2
+from typing import Type
+from yt_dlp.extractor.common import InfoExtractor
+
+from loguru import logger
+
+from . import bluesky, twitter
+from auto_archiver.archivers.archiver import Archiver
+from ...core import Metadata, Media, ArchivingContext
+
+
+class BaseArchiver(Archiver):
+    name = "youtubedl_archiver" #left as is for backwards compat
+
+    def __init__(self, config: dict) -> None:
+        super().__init__(config)
+        self.subtitles = bool(self.subtitles)
+        self.comments = bool(self.comments)
+        self.livestreams = bool(self.livestreams)
+        self.live_from_start = bool(self.live_from_start)
+        self.end_means_success = bool(self.end_means_success)
+        self.allow_playlist = bool(self.allow_playlist)
+        self.max_downloads = self.max_downloads
+
+    @staticmethod
+    def configs() -> dict:
+        return {
+            "facebook_cookie": {"default": None, "help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'"},
+            "subtitles": {"default": True, "help": "download subtitles if available"},
+            "comments": {"default": False, "help": "download all comments if available, may lead to large metadata"},
+            "livestreams": {"default": False, "help": "if set, will download live streams, otherwise will skip them; see --max-filesize for more control"},
+            "live_from_start": {"default": False, "help": "if set, will download live streams from their earliest available moment, otherwise starts now."},
+            "proxy": {"default": "", "help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port"},
+            "end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."},
+            'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."},
+            "max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
+            "cookies_from_browser": {"default": None, "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"},
+            "cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"},
+        }
+    
+    def download_additional_media(self, extractor_key: str, video_data: dict, metadata: Metadata) -> Metadata:
+        """
+        Downloads additional media like images, comments, subtitles, etc.
+
+        Creates a 'media' object and attaches it to the metadata object.
+        """
+
+        # Just get the main thumbnail. More thumbnails are available in
+        # video_data['thumbnails'] should they be required
+        thumbnail_url = video_data.get('thumbnail')
+        if thumbnail_url:
+            try:
+                cover_image_path = self.download_from_url(thumbnail_url)
+                media = Media(cover_image_path)
+                metadata.add_media(media, id="cover")
+            except Exception as e:
+                logger.error(f"Error downloading cover image {thumbnail_url}: {e}")
+
+        return metadata
+
+    def keys_to_clean(self, extractor_key: str, video_data: dict) -> dict:
+        """
+        Clean up the video data to make it more readable and remove unnecessary keys that ytdlp adds
+        """
+
+        base_keys = ['formats', 'thumbnail', 'display_id', 'epoch', 'requested_downloads',
+                     'duration_string', 'thumbnails', 'http_headers', 'webpage_url_basename', 'webpage_url_domain',
+                     'extractor', 'extractor_key', 'playlist', 'playlist_index', 'duration_string', 'protocol', 'requested_subtitles',
+                     'format_id', 'acodec', 'vcodec', 'ext', 'epoch', '_has_drm', 'filesize', 'audio_ext', 'video_ext', 'vbr', 'abr',
+                     'resolution', 'dynamic_range', 'aspect_ratio', 'cookies', 'format', 'quality', 'preference', 'artists',
+                     'channel_id', 'subtitles', 'tbr', 'url', 'original_url', 'automatic_captions', 'playable_in_embed', 'live_status',
+                     '_format_sort_fields', 'chapters', 'requested_formats', 'format_note',
+                     'audio_channels', 'asr', 'fps', 'was_live', 'is_live', 'heatmap', 'age_limit', 'stretched_ratio']
+        if extractor_key == 'TikTok':
+            # Tiktok: only has videos so a valid ytdlp `video_data` object is returned. Base keys are enough
+            return base_keys + [] 
+        elif extractor_key == "Bluesky":
+            # bluesky API response for non video URLs is already clean, nothing to add
+            return base_keys + [] 
+        
+        return base_keys
+    
+    def add_metadata(self, extractor_key: str, video_data: dict, url:str, result: Metadata) -> Metadata:
+        """
+        Populates `result` from the given video_data (consuming keys from it via pop) and returns it
+        """
+
+        # first add the media
+        result = self.download_additional_media(extractor_key, video_data, result)
+
+        # prefer 'title', falling back to 'fulltitle' if it doesn't exist (both keys are popped from video_data)
+        result.set_title(video_data.pop('title', video_data.pop('fulltitle', "")))
+
+        # then add the platform specific additional metadata
+        for key, mapping in self.video_data_metadata_mapping(extractor_key, video_data).items():
+            if isinstance(mapping, str):
+                result.set(key, eval(f"video_data{mapping}"))  # NOTE(review): eval - mappings must only ever come from trusted code
+            elif callable(mapping):
+                result.set(key, mapping(video_data))
+        result.set_url(url)
+
+        # extract comments if enabled
+        if self.comments:
+            result.set("comments", [{
+                "text": c["text"],
+                "author": c["author"],
+                "timestamp": datetime.datetime.fromtimestamp(c["timestamp"], tz = datetime.timezone.utc) if c.get("timestamp") is not None else None
+            } for c in video_data.get("comments") or []])
+
+        # then add the common metadata
+        if timestamp := video_data.pop("timestamp", None):
+            timestamp = datetime.datetime.fromtimestamp(timestamp, tz = datetime.timezone.utc).isoformat()
+            result.set_timestamp(timestamp)
+        if upload_date := video_data.pop("upload_date", None):
+            upload_date = datetime.datetime.strptime(upload_date, '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
+            result.set("upload_date", upload_date)
+
+        # then clean away any keys we don't want
+        for clean_key in self.keys_to_clean(extractor_key, video_data):
+            video_data.pop(clean_key, None)
+
+        # then add the rest of the video data
+        for k, v in video_data.items():
+            if v:
+                result.set(k, v)
+
+        return result
+
+    def video_data_metadata_mapping(self, extractor_key: str, video_data: dict) -> dict:
+        """
+        Returns a key->value mapping to map from the yt-dlp produced 'video_data' to the Metadata object.
+        Can be either a string for direct mapping, or a function, or a lambda.
+        """
+        return {}
+    
+    def suitable_extractors(self, url: str) -> list[str]:
+        """
+        Generator: yields each yt-dlp extractor that reports itself suitable for the given URL and working"""
+        for info_extractor in yt_dlp.YoutubeDL()._ies.values():
+            if info_extractor.suitable(url) and info_extractor.working():
+                yield info_extractor
+        
+    def suitable(self, url: str) -> bool:
+        """
+        Returns True if at least one working yt-dlp extractor reports itself suitable for the URL.
+        NOTE(review): nothing here filters out yt-dlp's GenericIE ('Generic downloader that works on some sites') - confirm whether it should be excluded.
+        """
+        return any(self.suitable_extractors(url))
+
+    def create_metadata_for_post(self, info_extractor: InfoExtractor, video_data: dict, url: str) -> Metadata:
+        """
+        Standardizes the output of the ytdlp InfoExtractor to a common format
+        """
+        if info_extractor.ie_key() == 'Bluesky':
+            return bluesky.create_metadata(video_data, self, url)
+        if info_extractor.ie_key() == 'Twitter':
+            return twitter.create_metadata(video_data, self, url)
+
+    def get_metatdata_for_post(self, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
+        """
+        Calls into the ytdlp InfoExtractor subclass and uses its private _extract_post method (_extract_status for Twitter) to get the post data; returns False if an unknown extractor does not support this.
+        """
+
+        ie_instance = info_extractor(downloader=ydl)
+        post_data = None
+
+        if info_extractor.ie_key() == 'Bluesky':
+            # bluesky kwargs are handle, video_id
+            handle, video_id = ie_instance._match_valid_url(url).group('handle', 'id')
+            post_data = ie_instance._extract_post(handle=handle, post_id=video_id)
+        elif info_extractor.ie_key() == 'Twitter':
+            # twitter kwargs are tweet_id
+            twid = ie_instance._match_valid_url(url).group('id')
+            # TODO: if ytdlp PR https://github.com/yt-dlp/yt-dlp/pull/12098 is merged, change to _extract_post
+            post_data = ie_instance._extract_status(twid=twid)
+
+        elif info_extractor.ie_key() == 'TikTok':
+            pass
+
+        else:
+            # lame attempt at trying to get data for an unknown extractor
+            # TODO: test some more video platforms and see if there's any improvement to be made
+            try:
+                post_data = ie_instance._extract_post(url)
+            except (NotImplementedError, AttributeError) as e:
+                logger.debug(f"Extractor {info_extractor.ie_key()} does not support extracting post info: {e}")
+                return False
+
+        return self.create_metadata_for_post(ie_instance, post_data, url)
+        
+    def get_metatdata_for_video(self, info: dict, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
+        """Downloads `url` with yt-dlp and returns a Metadata with one Media per entry, or False if a playlist has no entries; the passed `info` is replaced by the download result."""
+        # this time download
+        ydl.params['getcomments'] = self.comments
+        #TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
+        info = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=True)
+        if "entries" in info:
+            entries = info.get("entries", [])
+            if not len(entries):
+                logger.warning('YoutubeDLArchiver could not find any video')
+                return False
+        else: entries = [info]
+
+        extractor_key = info['extractor_key']
+        result = Metadata()
+
+        for entry in entries:
+            try:
+                filename = ydl.prepare_filename(entry)
+                if not os.path.exists(filename):
+                    filename = os.path.splitext(filename)[0] + '.mkv'  # swap only the final extension; dots elsewhere in the path are kept
+
+                new_media = Media(filename)
+                for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
+                    if x in entry: new_media.set(x, entry[x])
+
+                # read text from subtitles if enabled
+                if self.subtitles:
+                    for lang, val in (entry.get('requested_subtitles') or {}).items():  # per-entry, so playlist items get their own subtitles
+                        try:
+                            subs = pysubs2.load(val.get('filepath'), encoding="utf-8")
+                            text = " ".join([line.text for line in subs])
+                            new_media.set(f"subtitles_{lang}", text)
+                        except Exception as e:
+                            logger.error(f"Error loading subtitle file {val.get('filepath')}: {e}")
+                result.add_media(new_media)
+            except Exception as e:
+                logger.error(f"Error processing entry {entry}: {e}")
+
+        return self.add_metadata(extractor_key, info, url, result)
+
+    def download_for_extractor(self, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
+        """
+        Tries to download the given url using the specified extractor
+        
+        It first tries to use ytdlp directly to download the video. If the post is not a video, it will then try to
+        use the extractor's _extract_post method to get the post metadata if possible.
+        """
+        # when getting info without download, we also don't need the comments
+        ydl.params['getcomments'] = False
+        result = False
+
+        try:
+            # don't download since it can be a live stream
+            info = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
+            if info.get('is_live', False) and not self.livestreams:
+                logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
+                return False
+            # it's a valid video, that the youtubdedl can download out of the box
+            result = self.get_metatdata_for_video(info, info_extractor, url, ydl)
+
+        except yt_dlp.utils.DownloadError as e:
+            logger.debug(f'No video found, attempting to use extractor directly: {e}')
+            result = self.get_metatdata_for_post(info_extractor, url, ydl)
+        except Exception as e:
+            logger.debug(f'ytdlp exception which is normal for example a facebook page with images only will cause a IndexError: list index out of range. Exception is: \n  {e}')
+            return False
+        
+        if result:
+            extractor_name = "yt-dlp"
+            if info_extractor:
+                extractor_name += f"_{info_extractor.ie_key()}"
+
+            if self.end_means_success:
+                result.success(extractor_name)
+            else:
+                result.status = extractor_name
+
+        return result
+
+    def download(self, item: Metadata) -> Metadata:
+        url = item.get_url()
+
+        if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:
+            logger.debug('Using Facebook cookie')
+            yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
+        
+        ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
+
+        if item.netloc in ['youtube.com', 'www.youtube.com']:
+            if self.cookies_from_browser:
+                logger.debug(f'Extracting cookies from browser {self.cookies_from_browser} for Youtube')
+                ydl_options['cookiesfrombrowser'] = (self.cookies_from_browser,)
+            elif self.cookie_file:
+                logger.debug(f'Using cookies from file {self.cookie_file}')
+                ydl_options['cookiefile'] = self.cookie_file
+
+        ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
+
+        for info_extractor in self.suitable_extractors(url):
+            result = self.download_for_extractor(info_extractor, url, ydl)
+            if result:
+                return result
+       
+
+        return False
diff --git a/src/auto_archiver/archivers/base_archiver/bluesky.py b/src/auto_archiver/archivers/base_archiver/bluesky.py
new file mode 100644
index 0000000..176808b
--- /dev/null
+++ b/src/auto_archiver/archivers/base_archiver/bluesky.py
@@ -0,0 +1,88 @@
+import os
+import mimetypes
+
+import requests
+from loguru import logger
+
+from auto_archiver.core.context import ArchivingContext
+from auto_archiver.archivers.archiver import Archiver
+from auto_archiver.core.metadata import Metadata, Media
+
+
+def create_metadata(post: dict, archiver: Archiver, url: str) -> Metadata:
+    result = Metadata()
+    result.set_url(url)
+    result.set_title(post["record"]["text"])
+    result.set_timestamp(post["record"]["createdAt"])
+    for k, v in _get_post_data(post).items():
+        if v: result.set(k, v)
+
+    # download if embeds present (1 video XOR >=1 images)
+    for media in _download_bsky_embeds(post):
+        result.add_media(media)
+    logger.debug(f"Downloaded {len(result.media)} media files")
+
+    return result
+
+def _download_bsky_embeds(post: dict) -> list[Media]:
+    """
+    Iterates over image(s) or video in a Bluesky post and downloads them        
+    """
+    media = []
+    embed = post.get("record", {}).get("embed", {})
+    image_medias = embed.get("images", []) + embed.get("media", {}).get("images", [])
+    video_medias = [e for e in [embed.get("video"), embed.get("media", {}).get("video")] if e]
+
+    for image_media in image_medias:
+        image_media = _download_bsky_file_as_media(image_media["image"]["ref"]["$link"], post["author"]["did"])
+        media.append(image_media)
+    for video_media in video_medias:
+        video_media = _download_bsky_file_as_media(video_media["ref"]["$link"], post["author"]["did"])
+        media.append(video_media)
+    return media
+
+def _download_bsky_file_as_media(cid: str, did: str) -> Media:
+    """
+    Downloads the blob `cid` of repo `did` through the Bluesky getBlob endpoint into the tmp dir and returns it as a Media whose "src" is the blob URL.
+    """
+    # TODO: replace with self.download_from_url once that function has been cleaned-up
+    file_url = f"https://bsky.social/xrpc/com.atproto.sync.getBlob?cid={cid}&did={did}"
+    response = requests.get(file_url, stream=True, timeout=30)  # bounded wait instead of hanging forever on a stalled connection
+    response.raise_for_status()
+    ext = mimetypes.guess_extension(response.headers.get("Content-Type", "").split(";")[0].strip()) or ""  # unknown/missing type: no suffix rather than a literal "None"
+    filename = os.path.join(ArchivingContext.get_tmp_dir(), f"{cid}{ext}")
+    with open(filename, "wb") as f:
+        for chunk in response.iter_content(chunk_size=8192):
+            f.write(chunk)
+    media = Media(filename=filename)
+    media.set("src", file_url)
+    return media
+
+def _get_post_data(post: dict) -> dict:
+    """
+    Extracts relevant information returned by the .getPostThread api call (excluding text/created_at): author, mentions, tags, links.
+    """
+    author = post["author"]
+    if "labels" in author and not author["labels"]:
+        del author["labels"]
+    if "associated" in author:
+        del author["associated"]
+
+    mentions, tags, links = [], [], []
+    facets = post.get("record", {}).get("facets", [])
+    for f in facets:
+        for feature in f["features"]:
+            if feature["$type"] == "app.bsky.richtext.facet#mention":
+                mentions.append(feature["did"])
+            elif feature["$type"] == "app.bsky.richtext.facet#tag":
+                tags.append(feature["tag"])
+            elif feature["$type"] == "app.bsky.richtext.facet#link":
+                links.append(feature["uri"])
+    res = {"author": author}
+    if mentions:
+        res["mentions"] = mentions
+    if tags:
+        res["tags"] = tags
+    if links:
+        res["links"] = links
+    return res
\ No newline at end of file
diff --git a/src/auto_archiver/archivers/base_archiver/twitter.py b/src/auto_archiver/archivers/base_archiver/twitter.py
new file mode 100644
index 0000000..8cc323c
--- /dev/null
+++ b/src/auto_archiver/archivers/base_archiver/twitter.py
@@ -0,0 +1,62 @@
+import re, mimetypes, json
+from datetime import datetime
+
+from loguru import logger
+from slugify import slugify
+
+from auto_archiver.core.metadata import Metadata, Media
+from auto_archiver.utils import UrlUtil
+from auto_archiver.archivers.archiver import Archiver
+
+
+def choose_variant(variants):
+    # choosing the highest quality possible
+    variant, width, height = None, 0, 0
+    for var in variants:
+        if var.get("content_type", "") == "video/mp4":
+            width_height = re.search(r"\/(\d+)x(\d+)\/", var["url"])
+            if width_height:
+                w, h = int(width_height[1]), int(width_height[2])
+                if w > width or h > height:
+                    width, height = w, h
+                    variant = var
+        else:
+            variant = var if not variant else variant
+    return variant
+
+def create_metadata(tweet: dict, archiver: Archiver, url: str) -> Metadata:  # builds a Metadata from a tweet dict; returns False if the tweet cannot be parsed
+    result = Metadata()
+    try:
+        if not tweet.get("user") or not tweet.get("created_at"):
+            raise ValueError(f"Error retreiving post. Are you sure it exists?")
+        timestamp = datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
+    except (ValueError, KeyError) as ex:
+        logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
+        return False
+
+    result\
+        .set_title(tweet.get('full_text', ''))\
+        .set_content(json.dumps(tweet, ensure_ascii=False))\
+        .set_timestamp(timestamp)
+    if not tweet.get("entities", {}).get("media"):
+        logger.debug('No media found, archiving tweet text only')
+        result.status = "twitter-ytdl"
+        return result
+    for i, tw_media in enumerate(tweet["entities"]["media"]):
+        media = Media(filename="")
+        mimetype = ""
+        if tw_media["type"] == "photo":
+            media.set("src", UrlUtil.twitter_best_quality_url(tw_media['media_url_https']))
+            mimetype = "image/jpeg"
+        elif tw_media["type"] == "video":
+            variant = choose_variant(tw_media['video_info']['variants'])
+            media.set("src", variant['url'])
+            mimetype = variant['content_type']
+        elif tw_media["type"] == "animated_gif":
+            variant = tw_media['video_info']['variants'][0]
+            media.set("src", variant['url'])
+            mimetype = variant['content_type']
+        ext = mimetypes.guess_extension(mimetype) or ""  # unknown mimetype: no suffix rather than a literal "None" in the filename
+        media.filename = archiver.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}')
+        result.add_media(media)
+    return result
\ No newline at end of file
diff --git a/src/auto_archiver/archivers/bluesky_archiver.py b/src/auto_archiver/archivers/bluesky_archiver.py
deleted file mode 100644
index 534fba2..0000000
--- a/src/auto_archiver/archivers/bluesky_archiver.py
+++ /dev/null
@@ -1,119 +0,0 @@
-import os
-import re, requests, mimetypes
-from loguru import logger
-
-
-from . import Archiver
-from ..core import Metadata, Media, ArchivingContext
-
-
-class BlueskyArchiver(Archiver):
-    """
-    Uses an unauthenticated Bluesky API to archive posts including metadata, images and videos. Relies on `public.api.bsky.app/xrpc` and `bsky.social/xrpc`. Avoids ATProto to avoid auth.
-
-    Some inspiration from https://github.com/yt-dlp/yt-dlp/blob/master/yt_dlp/extractor/bluesky.py
-    """
-    name = "bluesky_archiver"
-    BSKY_POST = re.compile(r"/profile/([^/]+)/post/([a-zA-Z0-9]+)")
-
-    def __init__(self, config: dict) -> None:
-        super().__init__(config)
-
-    @staticmethod
-    def configs() -> dict:
-        return {}
-
-    def download(self, item: Metadata) -> Metadata:
-        url = item.get_url()
-        if not re.search(self.BSKY_POST, url):
-            return False
-
-        logger.debug(f"Identified a Bluesky post: {url}, archiving...")
-        result = Metadata()
-
-        # fetch post info and update result
-        post = self._get_post_from_uri(url)
-        logger.debug(f"Extracted post info: {post['record']['text']}")
-        result.set_title(post["record"]["text"])
-        result.set_timestamp(post["record"]["createdAt"])
-        for k, v in self._get_post_data(post).items():
-            if v: result.set(k, v)
-
-        # download if embeds present (1 video XOR >=1 images)
-        for media in self._download_bsky_embeds(post):
-            result.add_media(media)
-        logger.debug(f"Downloaded {len(result.media)} media files")
-
-        return result.success("bluesky")
-
-    def _get_post_from_uri(self, post_uri: str) -> dict:
-        """
-        Calls a public (no auth needed) Bluesky API to get a post from its uri, uses .getPostThread as it brings author info as well (unlike .getPost).
-        """
-        post_match = re.search(self.BSKY_POST, post_uri)
-        username = post_match.group(1)
-        post_id = post_match.group(2)
-        at_uri = f'at://{username}/app.bsky.feed.post/{post_id}'
-        r = requests.get(f"https://public.api.bsky.app/xrpc/app.bsky.feed.getPostThread?uri={at_uri}&depth=0&parent_height=0")
-        r.raise_for_status()
-        thread = r.json()
-        assert thread["thread"]["$type"] == "app.bsky.feed.defs#threadViewPost"
-        return thread["thread"]["post"]
-
-    def _download_bsky_embeds(self, post: dict) -> list[Media]:
-        """
-        Iterates over image(s) or video in a Bluesky post and downloads them        
-        """
-        media = []
-        embed = post.get("record", {}).get("embed", {})
-        image_medias = embed.get("images", []) + embed.get("media", {}).get("images", [])
-        video_medias = [e for e in [embed.get("video"), embed.get("media", {}).get("video")] if e]
-
-        for image_media in image_medias:
-                image_media = self._download_bsky_file_as_media(image_media["image"]["ref"]["$link"], post["author"]["did"])
-                media.append(image_media)
-        for video_media in video_medias:
-            video_media = self._download_bsky_file_as_media(video_media["ref"]["$link"], post["author"]["did"])
-            media.append(video_media)
-        return media
-
-    def _download_bsky_file_as_media(self, cid: str, did: str) -> Media:
-        """
-        Uses the Bluesky API to download a file by its `cid` and `did`.
-        """
-        # TODO: replace with self.download_from_url once that function has been cleaned-up
-        file_url = f"https://bsky.social/xrpc/com.atproto.sync.getBlob?cid={cid}&did={did}"
-        response = requests.get(file_url, stream=True)
-        response.raise_for_status()
-        ext = mimetypes.guess_extension(response.headers["Content-Type"])
-        filename = os.path.join(ArchivingContext.get_tmp_dir(), f"{cid}{ext}")
-        with open(filename, "wb") as f:
-            for chunk in response.iter_content(chunk_size=8192):
-                f.write(chunk)
-        media = Media(filename=filename)
-        media.set("src", file_url)
-        return media
-
-    def _get_post_data(self, post: dict) -> dict:
-        """
-        Extracts relevant information returned by the .getPostThread api call (excluding text/created_at): author, mentions, tags, links.
-        """
-        author = post["author"]
-        if "labels" in author and not author["labels"]: del author["labels"]
-        if "associated" in author: del author["associated"]
-
-        mentions, tags, links = [], [], []
-        facets = post.get("record", {}).get("facets", [])
-        for f in facets:
-            for feature in f["features"]:
-                if feature["$type"] == "app.bsky.richtext.facet#mention":
-                    mentions.append(feature["did"])
-                elif feature["$type"] == "app.bsky.richtext.facet#tag":
-                    tags.append(feature["tag"])
-                elif feature["$type"] == "app.bsky.richtext.facet#link":
-                    links.append(feature["uri"])
-        res = {"author": author}
-        if mentions: res["mentions"] = mentions
-        if tags: res["tags"] = tags
-        if links: res["links"] = links
-        return res
diff --git a/src/auto_archiver/archivers/tiktok_archiver.py b/src/auto_archiver/archivers/tiktok_archiver.py
deleted file mode 100644
index fac67d1..0000000
--- a/src/auto_archiver/archivers/tiktok_archiver.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import json, os, traceback
-from loguru import logger
-
-
-from . import Archiver
-from ..core import Metadata, Media, ArchivingContext
-from ..utils.misc import random_str
-
-
-class TiktokArchiver(Archiver):
-    name = "tiktok_archiver"
-
-    def __init__(self, config: dict) -> None:
-        super().__init__(config)
-
-    @staticmethod
-    def configs() -> dict:
-        return {}
-
-    def download(self, item: Metadata) -> Metadata:
-        url = item.get_url()
-        if 'tiktok.com' not in url:
-            return False
-
-        result = Metadata()
-        try:
-            info = tiktok_downloader.info_post(url)
-            result.set_title(info.desc)
-            result.set_timestamp(info.create_time)
-            result.set_content(json.dumps({
-                "cover": info.cover,
-                "author": info.author,
-                "music_title": info.author,
-                "caption": getattr(info, "caption", info.desc),
-            }, ensure_ascii=False, indent=4))
-        except:
-            error = traceback.format_exc()
-            logger.warning(f'Other Tiktok error {error}')
-
-        try:
-            filename = os.path.join(ArchivingContext.get_tmp_dir(), f'{random_str(8)}.mp4')
-            tiktok_media = tiktok_downloader.snaptik(url).get_media()
-
-            if len(tiktok_media) <= 0:
-                logger.debug(f"TikTok: could not get media from {url=}")
-                return False
-
-            logger.info(f'downloading video {filename=}')
-            tiktok_media[0].download(filename)
-
-            result.add_media(Media(filename))
-            return result.success("tiktok")
-        except:
-            error = traceback.format_exc()
-            logger.warning(f'Other Tiktok error {error}')
diff --git a/src/auto_archiver/archivers/twitter_api_archiver.py b/src/auto_archiver/archivers/twitter_api_archiver.py
index a8c4673..d1e4dee 100644
--- a/src/auto_archiver/archivers/twitter_api_archiver.py
+++ b/src/auto_archiver/archivers/twitter_api_archiver.py
@@ -1,17 +1,19 @@
-
-import json, mimetypes
+import json
+import re
+import mimetypes
+import requests
 from datetime import datetime
+
 from loguru import logger
 from pytwitter import Api
 from slugify import slugify
 
 from . import Archiver
-from .twitter_archiver import TwitterArchiver
 from ..core import Metadata,Media
 
-
-class TwitterApiArchiver(TwitterArchiver, Archiver):
+class TwitterApiArchiver(Archiver):
     name = "twitter_api_archiver"
+    link_pattern = re.compile(r"(?:twitter|x)\.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")  # dot escaped so only a literal ".com" matches
 
     def __init__(self, config: dict) -> None:
         super().__init__(config)
@@ -47,6 +49,17 @@ class TwitterApiArchiver(TwitterArchiver, Archiver):
     def api_client(self) -> str:
         return self.apis[self.api_index]
     
+    def sanitize_url(self, url: str) -> str:
+        # expand t.co short links to their final destination (best effort); any other URL is returned unchanged
+        if 'https://t.co/' in url:
+            try:
+                r = requests.get(url, timeout=30)
+                logger.debug(f'Expanded url {url} to {r.url}')
+                url = r.url
+            except Exception as e:  # keep best-effort behaviour, but let KeyboardInterrupt/SystemExit propagate
+                logger.error(f'Failed to expand url {url}: {e}')
+        return url
+
 
     def download(self, item: Metadata) -> Metadata:
         # call download retry until success or no more apis
@@ -56,6 +69,16 @@ class TwitterApiArchiver(TwitterArchiver, Archiver):
         self.api_index = 0
         return False
 
+    def get_username_tweet_id(self, url):
+        # returns (username, tweet_id) parsed from the URL, or (False, False) when it is not a tweet link
+        found = self.link_pattern.findall(url)
+        if not found: return False, False
+
+        username, tweet_id = found[0]  # findall may yield several hits; only the first is used
+        logger.debug(f"Found {username=} and {tweet_id=} in {url=}")
+
+        return username, tweet_id
+
     def download_retry(self, item: Metadata) -> Metadata:
         url = item.get_url()
         # detect URLs that we definitely cannot handle
@@ -102,10 +125,13 @@ class TwitterApiArchiver(TwitterArchiver, Archiver):
             "lang": tweet.data.lang,
             "media": urls
         }, ensure_ascii=False, indent=4))
-        return result.success("twitter")
+        return result.success("twitter-api")
 
     def choose_variant(self, variants):
-        # choosing the highest quality possible
+
+        """
+        Chooses the highest quality variant possible out of a list of variants
+        """
         variant, bit_rate = None, -1
         for var in variants:
             if var.content_type == "video/mp4":
diff --git a/src/auto_archiver/archivers/twitter_archiver.py b/src/auto_archiver/archivers/twitter_archiver.py
deleted file mode 100644
index 995910b..0000000
--- a/src/auto_archiver/archivers/twitter_archiver.py
+++ /dev/null
@@ -1,209 +0,0 @@
-import re, requests, mimetypes, json, math
-from typing import Union
-from datetime import datetime
-from loguru import logger
-from yt_dlp import YoutubeDL
-from yt_dlp.extractor.twitter import TwitterIE
-from slugify import slugify
-
-from . import Archiver
-from ..core import Metadata, Media
-from ..utils import UrlUtil
-
-
-class TwitterArchiver(Archiver):
-    """
-    This Twitter Archiver uses unofficial scraping methods.
-    """
-
-    name = "twitter_archiver"
-    link_pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
-    link_clean_pattern = re.compile(r"(.+(?:twitter|x)\.com\/.+\/\d+)(\?)*.*")
-
-    def __init__(self, config: dict) -> None:
-        super().__init__(config)
-
-    @staticmethod
-    def configs() -> dict:
-        return {}
-
-    def sanitize_url(self, url: str) -> str:
-        # expand URL if t.co and clean tracker GET params
-        if 'https://t.co/' in url:
-            try:
-                r = requests.get(url, timeout=30)
-                logger.debug(f'Expanded url {url} to {r.url}')
-                url = r.url
-            except:
-                logger.error(f'Failed to expand url {url}')
-        # https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w
-        return self.link_clean_pattern.sub("\\1", url)
-
-    def download(self, item: Metadata) -> Metadata:
-        """
-        if this url is archivable will download post info and look for other posts from the same group with media.
-        can handle private/public channels
-        """
-        url = item.get_url()
-        username, tweet_id = self.get_username_tweet_id(url)
-        if not username: return False
-
-        strategies = [self.download_yt_dlp, self.download_syndication]
-        for strategy in strategies:
-            logger.debug(f"Trying {strategy.__name__} for {url=}")
-            try:
-                result = strategy(item, url, tweet_id)
-                if result: return result
-            except Exception as ex:
-                logger.error(f"Failed to download {url} with {strategy.__name__}: {type(ex).__name__} occurred. args: {ex.args}")
-        
-        logger.warning(f"No free strategy worked for {url}")
-        return False
-    
-
-    def generate_token(self, tweet_id: str) -> str:
-        """Generates the syndication token for a tweet ID.
-        
-        Taken from https://github.com/JustAnotherArchivist/snscrape/issues/996#issuecomment-2211358215
-        And Vercel's code: https://github.com/vercel/react-tweet/blob/main/packages/react-tweet/src/api/fetch-tweet.ts#L27
-        """
-
-        # Perform the division and multiplication by π
-        result = (int(tweet_id) / 1e15) * math.pi
-        fractional_part = result % 1
-
-        # Convert to base 36
-        base_36 = ''
-        while result >= 1:
-            base_36 = "0123456789abcdefghijklmnopqrstuvwxyz"[int(result % 36)] + base_36
-            result = math.floor(result / 36)
-
-        # Append fractional part in base 36
-        while fractional_part > 0 and len(base_36) < 11:  # Limit to avoid infinite loop
-            fractional_part *= 36
-            digit = int(fractional_part)
-            base_36 += "0123456789abcdefghijklmnopqrstuvwxyz"[digit]
-            fractional_part -= digit
-        
-        # Remove leading zeros and dots
-        return base_36.replace('0', '').replace('.', '')
-
-
-    
-    def download_syndication(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
-        """
-        Downloads tweets using Twitter's own embed API (Hack).
-
-        Background on method can be found at:
-        https://github.com/JustAnotherArchivist/snscrape/issues/996#issuecomment-1615937362
-        https://github.com/JustAnotherArchivist/snscrape/issues/996#issuecomment-2211358215
-        next to test: https://cdn.embedly.com/widgets/media.html?&schema=twitter&url=https://twitter.com/bellingcat/status/1674700676612386816
-        """
-
-        hack_url = "https://cdn.syndication.twimg.com/tweet-result"
-        params = {
-            'id': tweet_id,
-            'token': self.generate_token(tweet_id)
-        }
-
-        r = requests.get(hack_url, params=params, timeout=10)
-        if r.status_code != 200 or r.json()=={}: 
-            logger.warning(f"SyndicationHack: Failed to get tweet information from {hack_url}.")
-            return False
-        
-        result = Metadata()
-        tweet = r.json()
-
-        if tweet.get('__typename') == 'TweetTombstone':
-            logger.error(f"Failed to get tweet {tweet_id}: {tweet['tombstone']['text']['text']}")
-            return False
-
-        urls = []
-        for p in tweet.get("photos", []):
-            urls.append(p["url"])
-
-        # 1 tweet has 1 video max
-        if "video" in tweet:
-            v = tweet["video"]
-            urls.append(self.choose_variant(v.get("variants", []))['url'])
-
-        logger.debug(f"Twitter hack got media {urls=}")
-
-        for i, u in enumerate(urls):
-            media = Media(filename="")
-            u = UrlUtil.twitter_best_quality_url(u)
-            media.set("src", u)
-            ext = ""
-            if (mtype := mimetypes.guess_type(UrlUtil.remove_get_parameters(u))[0]):
-                ext = mimetypes.guess_extension(mtype)
-
-            media.filename = self.download_from_url(u, f'{slugify(url)}_{i}{ext}')
-            result.add_media(media)
-        
-        result.set_title(tweet.get("text")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
-        return result.success("twitter-syndication")
-
-    def download_yt_dlp(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
-        downloader = YoutubeDL()
-        tie = TwitterIE(downloader)
-        tweet = tie._extract_status(tweet_id)
-        result = Metadata()
-        try:
-            if not tweet.get("user") or not tweet.get("created_at"):
-                raise ValueError(f"Error retreiving post with id {tweet_id}. Are you sure it exists?")
-            timestamp = datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
-        except (ValueError, KeyError) as ex:
-            logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
-            return False
-                
-        result\
-            .set_title(tweet.get('full_text', ''))\
-            .set_content(json.dumps(tweet, ensure_ascii=False))\
-            .set_timestamp(timestamp)
-        if not tweet.get("entities", {}).get("media"):
-            logger.debug('No media found, archiving tweet text only')
-            result.status = "twitter-ytdl"
-            return result
-        for i, tw_media in enumerate(tweet["entities"]["media"]):
-            media = Media(filename="")
-            mimetype = ""
-            if tw_media["type"] == "photo":
-                media.set("src", UrlUtil.twitter_best_quality_url(tw_media['media_url_https']))
-                mimetype = "image/jpeg"
-            elif tw_media["type"] == "video":
-                variant = self.choose_variant(tw_media['video_info']['variants'])
-                media.set("src", variant['url'])
-                mimetype = variant['content_type']
-            elif tw_media["type"] == "animated_gif":
-                variant = tw_media['video_info']['variants'][0]
-                media.set("src", variant['url'])
-                mimetype = variant['content_type']
-            ext = mimetypes.guess_extension(mimetype)
-            media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item)
-            result.add_media(media)
-        return result.success("twitter-ytdl")
-
-    def get_username_tweet_id(self, url):
-        # detect URLs that we definitely cannot handle
-        matches = self.link_pattern.findall(url)
-        if not len(matches): return False, False
-
-        username, tweet_id = matches[0]  # only one URL supported
-        logger.debug(f"Found {username=} and {tweet_id=} in {url=}")
-
-        return username, tweet_id
-
-    def choose_variant(self, variants):
-        # choosing the highest quality possible
-        variant, width, height = None, 0, 0
-        for var in variants:
-            if var.get("content_type", "") == "video/mp4":
-                width_height = re.search(r"\/(\d+)x(\d+)\/", var["url"])
-                if width_height:
-                    w, h = int(width_height[1]), int(width_height[2])
-                    if w > width or h > height:
-                        width, height = w, h
-                        variant = var
-            else:
-                variant = var if not variant else variant
-        return variant
diff --git a/src/auto_archiver/archivers/youtubedl_archiver.py b/src/auto_archiver/archivers/youtubedl_archiver.py
index 97ad569..1bc8966 100644
--- a/src/auto_archiver/archivers/youtubedl_archiver.py
+++ b/src/auto_archiver/archivers/youtubedl_archiver.py
@@ -1,221 +1,2 @@
-import datetime, os, yt_dlp, pysubs2
-from loguru import logger
-
-from . import Archiver
-from ..core import Metadata, Media, ArchivingContext
-
-
-class YoutubeDLArchiver(Archiver):
-    name = "youtubedl_archiver"
-
-    def __init__(self, config: dict) -> None:
-        super().__init__(config)
-        self.subtitles = bool(self.subtitles)
-        self.comments = bool(self.comments)
-        self.livestreams = bool(self.livestreams)
-        self.live_from_start = bool(self.live_from_start)
-        self.end_means_success = bool(self.end_means_success)
-        self.allow_playlist = bool(self.allow_playlist)
-        self.max_downloads = self.max_downloads
-
-    @staticmethod
-    def configs() -> dict:
-        return {
-            "facebook_cookie": {"default": None, "help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'"},
-            "subtitles": {"default": True, "help": "download subtitles if available"},
-            "comments": {"default": False, "help": "download all comments if available, may lead to large metadata"},
-            "livestreams": {"default": False, "help": "if set, will download live streams, otherwise will skip them; see --max-filesize for more control"},
-            "live_from_start": {"default": False, "help": "if set, will download live streams from their earliest available moment, otherwise starts now."},
-            "proxy": {"default": "", "help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port"},
-            "end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."},
-            'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."},
-            "max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
-            "cookies_from_browser": {"default": None, "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"},
-            "cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"},
-        }
-    
-    def download_additional_media(self, ie: str, video_data: dict, metadata: Metadata) -> Metadata:
-        """
-        Downloads additional media like images, comments, subtitles, etc.
-
-        Creates a 'media' object and attaches it to the metadata object.
-        """
-
-        # TODO: should we download all thumbnails, or just the chosen thumbnail?
-
-        # Right now, just getting the single thumbnail
-        thumbnail_url = video_data.get('thumbnail')
-        if thumbnail_url:
-            try:
-                cover_image_path = self.download_from_url(thumbnail_url)
-                media = Media(cover_image_path)
-                metadata.add_media(media, id="cover")
-            except Exception as e:
-                logger.error(f"Error downloading cover image {thumbnail_url}: {e}")
-
-        return metadata
-
-    def keys_to_clean(self, ie: str, video_data: dict) -> dict:
-        """
-        Clean up the video data to make it more readable and remove unnecessary keys that ytdlp adds
-        """
-
-        base_keys = ['formats', 'thumbnail', 'display_id', 'epoch', 'requested_downloads',
-                     'duration_string', 'thumbnails', 'http_headers', 'webpage_url_basename', 'webpage_url_domain',
-                     'extractor', 'extractor_key', 'playlist', 'playlist_index', 'duration_string', 'protocol', 'requested_subtitles',
-                     'format_id', 'acodec', 'vcodec', 'ext', 'epoch', '_has_drm', 'filesize', 'audio_ext', 'video_ext', 'vbr', 'abr',
-                     'resolution', 'dynamic_range', 'aspect_ratio', 'cookies', 'format', 'quality', 'preference', 'artists',
-                     'channel_id', 'subtitles', 'tbr', 'url', 'original_url', 'automatic_captions', 'playable_in_embed', 'live_status',
-                     '_format_sort_fields', 'chapters', 'uploader_id', 'uploader_url', 'requested_formats', 'format_note',
-                     'audio_channels', 'asr', 'fps', 'was_live', 'is_live', 'heatmap', 'age_limit', 'stretched_ratio']
-        if ie == 'TikTok':
-            return base_keys + []
-        
-        return base_keys
-    
-    def add_metadata(self, ie: str, video_data: dict, url:str, result: Metadata) -> Metadata:
-        """
-        Creates a Metadata object from the give video_data
-        """
-
-        # first add the media
-        result = self.download_additional_media(ie, video_data, result)
-
-        # keep the full title, no need for the shortened title (?)
-        video_data['title'] = video_data.pop('fulltitle', video_data.get('title'))
-        result.set_title(video_data.pop('title', url))
-
-        # then add the platform specific additional metadata
-        for key, mapping in self.video_data_metadata_mapping(ie, video_data).items():
-            if isinstance(mapping, str):
-                result.set(key, eval(f"video_data{mapping}"))
-            elif callable(mapping):
-                result.set(key, mapping(video_data))
-        result.set_url(url)
-
-        # extract comments if enabled
-        if self.comments:
-            result.set("comments", [{
-                "text": c["text"],
-                "author": c["author"], 
-                "timestamp": datetime.datetime.fromtimestamp(c.get("timestamp"), tz = datetime.timezone.utc)
-            } for c in video_data.get("comments", [])])
-
-        # then add the common metadata
-        if (timestamp := video_data.pop("timestamp", None)):
-            timestamp = datetime.datetime.fromtimestamp(timestamp, tz = datetime.timezone.utc).isoformat()
-            result.set_timestamp(timestamp)
-        if (upload_date := video_data.pop("upload_date", None)):
-            upload_date = datetime.datetime.strptime(upload_date, '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
-            result.set("upload_date", upload_date)
-        
-        # then clean away any keys we don't want
-        for clean_key in self.keys_to_clean(ie, video_data):
-            video_data.pop(clean_key, None)
-        
-        # then add the rest of the video data
-        for k, v in video_data.items():
-            if v:
-                result.set(k, v)
-
-        return result
-
-    def video_data_metadata_mapping(self, ie: str, video_data: dict) -> dict:
-        """
-        Returns a key->value mapping to map from the yt-dlp produced 'video_data' to the Metadata object.
-        Can be either a string for direct mapping, or a function, or a lambda.
-        """
-        return {}
-
-    def suitable(self, item: Metadata) -> bool:
-        """
-        Checks for valid URLs out of all ytdlp extractors.
-        Returns False for the GenericIE, which as labelled by yt-dlp: 'Generic downloader that works on some sites'
-        """
-        url = item.get_url()
-        for ie_key, ie in yt_dlp.YoutubeDL()._ies.items():
-            # Note: this will return True for *all* URLs due to the 'generic' extractor from ytdlp (valid for all URLs).
-            # should we check for the 'GenericIE' extractor and return False?
-            # if ie.IE_NAME == 'generic'... - leaving it in for now, since we also want the ability to download from generic sites
-            # perhaps one solution is to return 'False' initially, and then if no other installed archivers work, we try again using the generic one
-            if ie.suitable(url) and ie.working():
-                return True
-        return False
-    
-    def download(self, item: Metadata) -> Metadata:
-        url = item.get_url()
-
-        if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:
-            logger.debug('Using Facebook cookie')
-            yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
-        
-        ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
-
-        if item.netloc in ['youtube.com', 'www.youtube.com']:
-            if self.cookies_from_browser:
-                logger.debug(f'Extracting cookies from browser {self.cookies_from_browser} for Youtube')
-                ydl_options['cookiesfrombrowser'] = (self.cookies_from_browser,)
-            elif self.cookie_file:
-                logger.debug(f'Using cookies from file {self.cookie_file}')
-                ydl_options['cookiefile'] = self.cookie_file
-
-        ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
-
-        try:
-            # don't download since it can be a live stream
-            info = ydl.extract_info(url, download=False)
-            if info.get('is_live', False) and not self.livestreams:
-                logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
-                return False
-        except yt_dlp.utils.DownloadError as e:
-            logger.debug(f'No video - Youtube normal control flow: {e}')
-            return False
-        except Exception as e:
-            logger.debug(f'ytdlp exception which is normal for example a facebook page with images only will cause a IndexError: list index out of range. Exception is: \n  {e}')
-            return False
-
-        # this time download
-        ydl = yt_dlp.YoutubeDL({**ydl_options, "getcomments": self.comments})
-        #TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
-        info = ydl.extract_info(url, download=True)
-        if "entries" in info:
-            entries = info.get("entries", [])
-            if not len(entries):
-                logger.warning('YoutubeDLArchiver could not find any video')
-                return False
-        else: entries = [info]
-
-        ie = info['extractor_key']
-        result = Metadata()
-
-        for entry in entries:
-            try:
-                filename = ydl.prepare_filename(entry)
-                if not os.path.exists(filename):
-                    filename = filename.split('.')[0] + '.mkv'
-
-                new_media = Media(filename)
-                for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
-                    if x in entry: new_media.set(x, entry[x])
-
-                # read text from subtitles if enabled
-                if self.subtitles:
-                    for lang, val in (info.get('requested_subtitles') or {}).items():
-                        try:    
-                            subs = pysubs2.load(val.get('filepath'), encoding="utf-8")
-                            text = " ".join([line.text for line in subs])
-                            new_media.set(f"subtitles_{lang}", text)
-                        except Exception as e:
-                            logger.error(f"Error loading subtitle file {val.get('filepath')}: {e}")
-                result.add_media(new_media)
-            except Exception as e:
-                logger.error(f"Error processing entry {entry}: {e}")
-
-        result = self.add_metadata(ie, info, url, result)
-        extractor_name = "yt-dlp"
-        if ie:
-            extractor_name += f"--{ie}IE"
-
-        if self.end_means_success: result.success(extractor_name)
-        else: result.status = extractor_name
-        return result
+# temporary hack: re-exports the names from .youtubedl_archiver while the module implementation is in progress
+from .youtubedl_archiver import *
diff --git a/tests/archivers/test_base_archiver.py b/tests/archivers/test_base_archiver.py
new file mode 100644
index 0000000..e58ca30
--- /dev/null
+++ b/tests/archivers/test_base_archiver.py
@@ -0,0 +1,141 @@
+import pytest
+from pathlib import Path
+import datetime 
+
+from auto_archiver.archivers.base_archiver import BaseArchiver
+
+from .test_archiver_base import TestArchiverBase
+
+class TestBaseArchiver(TestArchiverBase):
+    """Tests Base Archiver
+    """
+    archiver_class = BaseArchiver
+    config = {
+        'subtitles': False,
+        'comments': False,
+        'livestreams': False,
+        'live_from_start': False,
+        'end_means_success': True,
+        'allow_playlist': False,
+        'max_downloads': "inf",
+        'proxy': None,
+        'cookies_from_browser': False,
+        'cookie_file': None,
+        }
+
+    @pytest.mark.parametrize("url, is_suitable", [
+        ("https://www.youtube.com/watch?v=5qap5aO4i9A", True),
+        ("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", True),
+        ("https://www.instagram.com/p/CU1J9JYJ9Zz/", True),
+        ("https://www.facebook.com/nytimes/videos/10160796550110716", True),
+        ("https://www.twitch.tv/videos/1167226570", True),
+        ("https://bellingcat.com/news/2021/10/08/ukrainian-soldiers-are-being-killed-by-landmines-in-the-donbas/", True),
+        ("https://google.com", True)])
+    def test_suitable_urls(self, make_item, url, is_suitable):
+        """
+            Note: expected behaviour is to return True for all URLs, as YoutubeDLArchiver should be able to handle all URLs.
+            This behaviour may change in the future (e.g. if we want the youtubedl archiver to handle only URLs it has extractors for,
+            and to fall back to the generic archiver if and only if all other archivers fail).
+        """
+        assert self.archiver.suitable(url) == is_suitable
+
+    @pytest.mark.download
+    def test_download_tiktok(self, make_item):
+        item = make_item("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970")
+        result = self.archiver.download(item)
+        assert result.get_url() == "https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970"
+    
+    @pytest.mark.download
+    def test_youtube_download(self, make_item):
+        # url https://www.youtube.com/watch?v=J---aiyznGQ
+        item = make_item("https://www.youtube.com/watch?v=J---aiyznGQ")
+        result = self.archiver.download(item)
+        assert result.get_url() == "https://www.youtube.com/watch?v=J---aiyznGQ"
+        assert result.get_title() == "Keyboard Cat! - THE ORIGINAL!"
+        assert result.get('description') == "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
+        assert len(result.media) == 2
+        assert Path(result.media[0].filename).name == "J---aiyznGQ.webm"
+        assert Path(result.media[1].filename).name == "hqdefault.jpg"
+
+    @pytest.mark.download
+    def test_bluesky_download_multiple_images(self, make_item):
+        item = make_item("https://bsky.app/profile/colborne.bsky.social/post/3lec2bqjc5s2y")
+        result = self.archiver.download(item)
+        assert result is not False
+
+    @pytest.mark.skip("ytdlp supports bluesky, but there's currently no way to extract info from pages without videos")
+    @pytest.mark.download
+    def test_bluesky_download_single_image(self, make_item):
+        item = make_item("https://bsky.app/profile/colborne.bsky.social/post/3lcxcpgt6j42l")
+        result = self.archiver.download(item)
+        assert result is not False
+    
+    @pytest.mark.download
+    def test_bluesky_download_no_media(self, make_item):
+        item = make_item("https://bsky.app/profile/bellingcat.com/post/3lfphwmcs4c2z")
+        result = self.archiver.download(item)
+        assert result is not False
+
+    @pytest.mark.download
+    def test_bluesky_download_video(self, make_item):
+        item = make_item("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")
+        result = self.archiver.download(item)
+        assert result is not False
+
+    @pytest.mark.download
+    def test_twitter_download_nonexistend_tweet(self, make_item):
+        # this tweet does not exist
+        url = "https://x.com/Bellingcat/status/17197025860711058"
+        response = self.archiver.download(make_item(url))
+        assert not response
+    
+    @pytest.mark.download
+    def test_twitter_download_malformed_tweetid(self, make_item):
+        # the tweet id is malformed (it contains a non-digit character)
+        url = "https://x.com/Bellingcat/status/1719702a586071100058"
+        response = self.archiver.download(make_item(url))
+        assert not response
+
+    @pytest.mark.download
+    def test_twitter_download_tweet_no_media(self, make_item):
+        
+        item = make_item("https://twitter.com/MeCookieMonster/status/1617921633456640001?s=20&t=3d0g4ZQis7dCbSDg-mE7-w")
+        post = self.archiver.download(item)
+
+        self.assertValidResponseMetadata(
+            post,
+            "Onion rings are just vegetable donuts.",
+            datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
+            "yt-dlp_Twitter: success"
+        )
+    
+    @pytest.mark.download
+    def test_twitter_download_video(self, make_item):
+        url = "https://x.com/bellingcat/status/1871552600346415571"
+        post = self.archiver.download(make_item(url))
+        self.assertValidResponseMetadata(
+            post,
+            "Bellingcat - This month's Bellingchat Premium is with @KolinaKoltai. She reveals how she investigated a platform allowing users to create AI-generated child sexual abuse material and explains why it's crucial to investigate the people behind these services",
+            datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc)
+        )
+
+    @pytest.mark.xfail(reason="Currently failing, sensitive content requires logged in users/cookies - not yet implemented")
+    @pytest.mark.download
+    @pytest.mark.parametrize("url, title, timestamp, image_hash", [
+            ("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
+            ("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
+            ("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
+            ("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
+        ])
+    def test_twitter_download_sensitive_media(self, url, title, timestamp, image_hash, make_item):
+
+        """Download tweets with sensitive media"""
+
+        post = self.archiver.download(make_item(url))
+        self.assertValidResponseMetadata(
+            post,
+            title,
+            timestamp
+        )
+        assert len(post.media) == 1
+        assert post.media[0].hash == image_hash
\ No newline at end of file
diff --git a/tests/archivers/test_bluesky_archiver.py b/tests/archivers/test_bluesky_archiver.py
deleted file mode 100644
index c9e1c81..0000000
--- a/tests/archivers/test_bluesky_archiver.py
+++ /dev/null
@@ -1,73 +0,0 @@
-import pytest
-
-from auto_archiver.archivers.bluesky_archiver import BlueskyArchiver
-from .test_archiver_base import TestArchiverBase
-
-class TestBlueskyArchiver(TestArchiverBase):
-    """Tests Bluesky Archiver
-    
-    Note that these tests will download API responses from the bluesky API, so they may be slow.
-    This is an intended feature, as we want to test to ensure the bluesky API format hasn't changed, 
-    and also test the archiver's ability to download media.
-    """
-
-    archiver_class = BlueskyArchiver
-    config = {}
-
-    @pytest.mark.download
-    def test_download_media_with_images(self):
-        # url https://bsky.app/profile/colborne.bsky.social/post/3lec2bqjc5s2y
-        post = self.archiver._get_post_from_uri("https://bsky.app/profile/colborne.bsky.social/post/3lec2bqjc5s2y")
-
-        # just make sure bsky haven't changed their format, images should be under "record/embed/media/images"
-        # there should be 2 images
-        assert "record" in post
-        assert "embed" in post["record"]
-        assert "media" in post["record"]["embed"]
-        assert "images" in post["record"]["embed"]["media"]
-        assert len(post["record"]["embed"]["media"]["images"]) == 2
-
-        # try downloading the media files
-        media = self.archiver._download_bsky_embeds(post)
-        assert len(media) == 2
-
-        # check the IDs
-        assert "bafkreiflrkfihcvwlhka5tb2opw2qog6gfvywsdzdlibveys2acozh75tq" in media[0].get('src')
-        assert "bafkreibsprmwchf7r6xcstqkdvvuj3ijw7efciw7l3y4crxr4cmynseo7u" in media[1].get('src')
-
-    @pytest.mark.download
-    def test_download_post_with_single_image(self):
-        # url https://bsky.app/profile/bellingcat.com/post/3lcxcpgt6j42l
-        post = self.archiver._get_post_from_uri("https://bsky.app/profile/bellingcat.com/post/3lcxcpgt6j42l")
-
-        # just make sure bsky haven't changed their format, images should be under "record/embed/images"
-        # there should be 1 image
-        assert "record" in post
-        assert "embed" in post["record"]
-        assert "images" in post["record"]["embed"]
-        assert len(post["record"]["embed"]["images"]) == 1
-
-        media = self.archiver._download_bsky_embeds(post)
-        assert len(media) == 1
-
-        # check the ID 
-        assert "bafkreihljdtomy4yulx4nfxuqdatlgvdg45vxdmjzzhclsd4ludk7zfma4" in media[0].get('src')
-                        
-
-    @pytest.mark.download
-    def test_download_post_with_video(self):
-        # url https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i
-        post = self.archiver._get_post_from_uri("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")
-
-        # just make sure bsky haven't changed their format, video should be under "record/embed/video"
-        assert "record" in post
-        assert "embed" in post["record"]
-        assert "video" in post["record"]["embed"]
-
-        media = self.archiver._download_bsky_embeds(post)
-        assert len(media) == 1
-
-        # check the ID
-        assert "bafkreiaiskn2nt5cxjnxbgcqqcrnurvkr2ni3unekn6zvhvgr5nrqg6u2q" in media[0].get('src')
-
-        
diff --git a/tests/archivers/test_tiktok_archiver.py b/tests/archivers/test_tiktok_archiver.py
deleted file mode 100644
index 8905c75..0000000
--- a/tests/archivers/test_tiktok_archiver.py
+++ /dev/null
@@ -1,17 +0,0 @@
-import pytest
-
-from .test_archiver_base import TestArchiverBase
-from auto_archiver.archivers.tiktok_archiver import TiktokArchiver
-
-class TestBlueskyArchiver(TestArchiverBase):
-
-    archiver_class = TiktokArchiver
-    config = {}
-
-    @pytest.mark.xfail(reason="Tiktok API is not working")
-    @pytest.mark.download
-    def test_download_video(self, make_item):
-        # cat video
-        url = "https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en"
-        item = self.archiver.download(make_item(url))
-        assert item.success
\ No newline at end of file
diff --git a/tests/archivers/test_twitter_archiver.py b/tests/archivers/test_twitter_api_archiver.py
similarity index 55%
rename from tests/archivers/test_twitter_archiver.py
rename to tests/archivers/test_twitter_api_archiver.py
index 17af2f2..fae1780 100644
--- a/tests/archivers/test_twitter_archiver.py
+++ b/tests/archivers/test_twitter_api_archiver.py
@@ -1,19 +1,31 @@
+import os
 import datetime
+
 import pytest
 
-from auto_archiver.archivers.twitter_archiver import TwitterArchiver
-
+from pytwitter.models.media import MediaVariant
 from .test_archiver_base import TestArchiverBase
+from auto_archiver.archivers import TwitterApiArchiver
 
-class TestTwitterArchiver(TestArchiverBase):
 
-    archiver_class = TwitterArchiver
-    config = {}
+@pytest.mark.incremental
+class TestTwitterApiArchiver(TestArchiverBase):
+
+    archiver_class = TwitterApiArchiver
+    config = {
+        "bearer_tokens": [],
+        "bearer_token": os.environ.get("TWITTER_BEARER_TOKEN"),
+        "consumer_key": os.environ.get("TWITTER_CONSUMER_KEY"),
+        "consumer_secret": os.environ.get("TWITTER_CONSUMER_SECRET"),
+        "access_token": os.environ.get("TWITTER_ACCESS_TOKEN"),
+        "access_secret": os.environ.get("TWITTER_ACCESS_SECRET"),
+    }
+
     @pytest.mark.parametrize("url, expected", [
         ("https://t.co/yl3oOJatFp", "https://www.bellingcat.com/category/resources/"),  # t.co URL
         ("https://x.com/bellingcat/status/1874097816571961839", "https://x.com/bellingcat/status/1874097816571961839"), # x.com urls unchanged
         ("https://twitter.com/bellingcat/status/1874097816571961839", "https://twitter.com/bellingcat/status/1874097816571961839"), # twitter urls unchanged
-        ("https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w", "https://twitter.com/bellingcat/status/1874097816571961839"), # strip tracking params
+        ("https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w", "https://twitter.com/bellingcat/status/1874097816571961839?s=20&t=3d0g4ZQis7dCbSDg-mE7-w"), # don't strip params from twitter urls (changed Jan 2025)
         ("https://www.bellingcat.com/category/resources/", "https://www.bellingcat.com/category/resources/"), # non-twitter/x urls unchanged
         ("https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w", "https://www.bellingcat.com/category/resources/?s=20&t=3d0g4ZQis7dCbSDg-mE7-w"), # shouldn't strip params from non-twitter/x URLs
     ])
@@ -25,64 +37,25 @@ class TestTwitterArchiver(TestArchiverBase):
         ("https://x.com/bellingcat/status/1874097816571961839", "bellingcat", "1874097816571961839"),
         ("https://www.bellingcat.com/category/resources/", False, False)
         ])
-
     def test_get_username_tweet_id_from_url(self, url, exptected_username, exptected_tweetid):
     
         username, tweet_id = self.archiver.get_username_tweet_id(url)
         assert exptected_username == username
         assert exptected_tweetid == tweet_id
-    
+
     def test_choose_variants(self):
         # taken from the response for url https://x.com/bellingcat/status/1871552600346415571
-        variant_list = [{'content_type': 'application/x-mpegURL', 'url': 'https://video.twimg.com/ext_tw_video/1871551993677852672/pu/pl/ovWo7ux-bKROwYIC.m3u8?tag=12&v=e1b'},
-                        {'bitrate': 256000, 'content_type': 'video/mp4', 'url': 'https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/480x270/OqZIrKV0LFswMvxS.mp4?tag=12'},
-                        {'bitrate': 832000, 'content_type': 'video/mp4', 'url': 'https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/640x360/uiDZDSmZ8MZn9hsi.mp4?tag=12'},
-                        {'bitrate': 2176000, 'content_type': 'video/mp4', 'url': 'https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/1280x720/6Y340Esh568WZnRZ.mp4?tag=12'}
+        variant_list = [MediaVariant(content_type='application/x-mpegURL', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/pl/ovWo7ux-bKROwYIC.m3u8?tag=12&v=e1b'),
+                        MediaVariant(bit_rate=256000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/480x270/OqZIrKV0LFswMvxS.mp4?tag=12'),
+                        MediaVariant(bit_rate=832000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/640x360/uiDZDSmZ8MZn9hsi.mp4?tag=12'),
+                        MediaVariant(bit_rate=2176000, content_type='video/mp4', url='https://video.twimg.com/ext_tw_video/1871551993677852672/pu/vid/avc1/1280x720/6Y340Esh568WZnRZ.mp4?tag=12')
                         ]
         chosen_variant = self.archiver.choose_variant(variant_list)
         assert chosen_variant == variant_list[3]
     
-    @pytest.mark.parametrize("tweet_id, expected_token", [
-        ("1874097816571961839", "4jjngwkifa"),
-        ("1674700676612386816", "42586mwa3uv"),
-        ("1877747914073620506", "4jv4aahw36n"),
-        ("1876710769913450647", "4jruzjz5lux"),
-        ("1346554693649113090", "39ibqxei7mo")
-        ])
-    def test_reverse_engineer_token(self, tweet_id, expected_token):
-        # see Vercel's implementation here: https://github.com/vercel/react-tweet/blob/main/packages/react-tweet/src/api/fetch-tweet.ts#L27C1-L31C2
-        # and the discussion here: https://github.com/JustAnotherArchivist/snscrape/issues/996#issuecomment-2211358215
-
-        generated_token = self.archiver.generate_token(tweet_id)
-        assert expected_token == generated_token
 
     @pytest.mark.download
-    def test_youtube_dlp_archiver(self, make_item):
-
-        url = "https://x.com/bellingcat/status/1874097816571961839"
-        post = self.archiver.download_yt_dlp(make_item(url), url, "1874097816571961839")
-        assert post
-        self.assertValidResponseMetadata(
-            post,
-            "As 2024 comes to a close, here’s some examples of what Bellingcat investigated per month in our 10th year! 🧵",
-            datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc),
-            "twitter-ytdl"
-        )
-            
-    @pytest.mark.download
-    def test_syndication_archiver(self, make_item):
-
-        url = "https://x.com/bellingcat/status/1874097816571961839"
-        post = self.archiver.download_syndication(make_item(url), url, "1874097816571961839")
-        assert post
-        self.assertValidResponseMetadata(
-            post,
-            "As 2024 comes to a close, here’s some examples of what Bellingcat investigated per month in our 10th year! 🧵",
-            datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)
-        )
-
-    @pytest.mark.download
-    def test_download_nonexistend_tweet(self, make_item):
+    def test_download_nonexistent_tweet(self, make_item):
         # this tweet does not exist
         url = "https://x.com/Bellingcat/status/17197025860711058"
         response = self.archiver.download(make_item(url))
@@ -105,9 +78,9 @@ class TestTwitterArchiver(TestArchiverBase):
             post,
             "Onion rings are just vegetable donuts.",
             datetime.datetime(2023, 1, 24, 16, 25, 51, tzinfo=datetime.timezone.utc),
-            "twitter-ytdl"
+            "twitter-api: success"
         )
-    
+
     @pytest.mark.download
     def test_download_video(self, make_item):
         url = "https://x.com/bellingcat/status/1871552600346415571"
@@ -118,14 +91,13 @@ class TestTwitterArchiver(TestArchiverBase):
             datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc)
         )
 
-    @pytest.mark.xfail(reason="Currently failing, sensitive content requires logged in users/cookies - not yet implemented")
-    @pytest.mark.download
     @pytest.mark.parametrize("url, title, timestamp, image_hash", [
-            ("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
-            ("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
-            ("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
-            ("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
+            ("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity https://t.co/t3u0hQsSB1", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
+            ("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence https://t.co/syYDSkpjZD", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
+            ("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive https://t.co/XE7cRdjzYq", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
+            ("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity https://t.co/YxCFbbhYE3", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
         ])
+    @pytest.mark.download
     def test_download_sensitive_media(self, url, title, timestamp, image_hash, make_item):
 
         """Download tweets with sensitive media"""
diff --git a/tests/archivers/test_youtubedl_archiver.py b/tests/archivers/test_youtubedl_archiver.py
deleted file mode 100644
index bb5a8d2..0000000
--- a/tests/archivers/test_youtubedl_archiver.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import pytest
-from pathlib import Path
-
-from auto_archiver.archivers.youtubedl_archiver import YoutubeDLArchiver
-
-from .test_archiver_base import TestArchiverBase
-
-class TestYoutubeDLArchiver(TestArchiverBase):
-    """Tests YoutubeDL Archiver
-    """
-    archiver_class = YoutubeDLArchiver
-    config = {
-        'subtitles': False,
-        'comments': False,
-        'livestreams': False,
-        'live_from_start': False,
-        'end_means_success': True,
-        'allow_playlist': False,
-        'max_downloads': "inf",
-        'proxy': None,
-        'cookies_from_browser': False,
-        'cookie_file': None,
-        }
-
-    @pytest.mark.parametrize("url, is_suitable", [
-        ("https://www.youtube.com/watch?v=5qap5aO4i9A", True),
-        ("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", True),
-        ("https://www.instagram.com/p/CU1J9JYJ9Zz/", True),
-        ("https://www.facebook.com/nytimes/videos/10160796550110716", True),
-        ("https://www.twitch.tv/videos/1167226570", True),
-        ("https://bellingcat.com/news/2021/10/08/ukrainian-soldiers-are-being-killed-by-landmines-in-the-donbas/", True),
-        ("https://google.com", True)])
-    def test_suitable_urls(self, make_item, url, is_suitable):
-        """
-            Note: expected behaviour is to return True for all URLs, as YoutubeDLArchiver should be able to handle all URLs
-            This behaviour may be changed in the future (e.g. if we want the youtubedl archiver to just handle URLs it has extractors for,
-            and then if and only if all archivers fails, does it fall back to the generic archiver)
-        """
-        assert self.archiver.suitable(make_item(url)) == is_suitable
-
-    @pytest.mark.download
-    def test_download_tiktok(self, make_item):
-        item = make_item("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970")
-        result = self.archiver.download(item)
-        assert result.get_url() == "https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970"
-    
-    @pytest.mark.download
-    def test_download_youtube(self, make_item):
-        # url https://www.youtube.com/watch?v=5qap5aO4i9A
-        item = make_item("https://www.youtube.com/watch?v=J---aiyznGQ")
-        result = self.archiver.download(item)
-        assert result.get_url() == "https://www.youtube.com/watch?v=J---aiyznGQ"
-        assert result.get_title() == "Keyboard Cat! - THE ORIGINAL!"
-        assert result.get('description') == "Buy NEW Keyboard Cat Merch! https://keyboardcat.creator-spring.com\n\nxo Keyboard Cat memes make your day better!\nhttp://www.keyboardcatstore.com/\nhttps://www.facebook.com/thekeyboardcat\nhttp://www.charlieschmidt.com/"
-        assert len(result.media) == 2
-        assert Path(result.media[0].filename).name == "J---aiyznGQ.webm"
-        assert Path(result.media[1].filename).name == "hqdefault.jpg"
-
-    @pytest.mark.skip("ytdlp supports bluesky, but there's currently no way to extract info from pages without videos")
-    @pytest.mark.download
-    def test_download_bluesky_with_images(self, make_item):
-        item = make_item("https://bsky.app/profile/colborne.bsky.social/post/3lec2bqjc5s2y")
-        result = self.archiver.download(item)
-        assert result is not False
-
-    @pytest.mark.skip("ytdlp supports twitter, but there's currently no way to extract info from pages without videos")
-    @pytest.mark.download
-    def test_download_twitter_textonly(self, make_item):
-        item = make_item("https://x.com/bellingcat/status/1874097816571961839")
-        result = self.archiver.download(item)
-        assert result is not False
\ No newline at end of file
diff --git a/tests/conftest.py b/tests/conftest.py
index 87d4ac0..553b573 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,6 +1,19 @@
+"""
+pytest conftest file, for shared fixtures and configuration
+"""
+
+from typing import Dict, Tuple
+
 import pytest
 from auto_archiver.core.metadata import Metadata
 
+# Test modules listed here will be run last. This is useful for expensive/costly tests
+# that you only want to run if everything else succeeds (e.g. API calls). The order here matters:
+# what comes first will be run first (at the end of all other tests not mentioned).
+# Each entry is the name of the module (python file) without the .py extension.
+TESTS_TO_RUN_LAST = ['test_twitter_api_archiver']
+
+
 @pytest.fixture
 def make_item():
     def _make_item(url: str, **kwargs) -> Metadata:
@@ -9,4 +22,61 @@ def make_item():
             item.set(key, value)
         return item
 
-    return _make_item
\ No newline at end of file
+    return _make_item
+
+
+
+def pytest_collection_modifyitems(items):
+    """Reorder collected tests in place so modules in TESTS_TO_RUN_LAST run after all others."""
+    module_mapping = {item: item.module.__name__.split(".")[-1] for item in items}
+
+    sorted_items = items.copy()
+    # Move the tests of each listed module to the end of the queue, keeping their relative order.
+    # A stable partition is used: removing from a list while iterating over it skips elements.
+    for module in TESTS_TO_RUN_LAST:
+        kept = [item for item in sorted_items if module_mapping[item] != module]
+        moved = [item for item in sorted_items if module_mapping[item] == module]
+        sorted_items = kept + moved
+
+    items[:] = sorted_items
+
+
+
+# Incremental testing - xfail the remaining tests in a class once a previous test has failed
+# taken from https://docs.pytest.org/en/latest/example/simple.html#incremental-testing-test-steps
+
+# store history of failures per test class name and per index in parametrize (if parametrize used)
+_test_failed_incremental: Dict[str, Dict[Tuple[int, ...], str]] = {}
+
+def pytest_runtest_makereport(item, call):
+    if "incremental" in item.keywords:
+        # incremental marker is used
+        if call.excinfo is not None:
+            # the test has failed
+            # retrieve the class name of the test
+            cls_name = str(item.cls)
+            # retrieve the index of the test (if parametrize is used in combination with incremental)
+            parametrize_index = (
+                tuple(item.callspec.indices.values())
+                if hasattr(item, "callspec")
+                else ()
+            )
+            # retrieve the name of the test function
+            test_name = item.originalname or item.name
+            # store in _test_failed_incremental the original name of the failed test
+            _test_failed_incremental.setdefault(cls_name, {}).setdefault(
+                parametrize_index, test_name
+            )
+
+
+def pytest_runtest_setup(item):
+    if "incremental" in item.keywords:
+        # retrieve the class name of the test
+        cls_name = str(item.cls)
+        # check if a previous test has failed for this class
+        if cls_name in _test_failed_incremental:
+            # retrieve the name of the first test function to fail for this class name and index
+            test_name = _test_failed_incremental[cls_name].get((), None)
+            # if name found, test has failed for the combination of class name & test name
+            if test_name is not None:
+                pytest.xfail(f"previous test failed ({test_name})")
\ No newline at end of file

From 17c1c9c3605e5b639efb2499c6edcf22847c1c23 Mon Sep 17 00:00:00 2001
From: Patrick Robertson <robertson.patrick@gmail.com>
Date: Fri, 17 Jan 2025 12:02:38 +0100
Subject: [PATCH 07/20] Fix up core unit tests when a twitter api key isn't
 provided

---
 pyproject.toml                               | 1 +
 tests/archivers/test_twitter_api_archiver.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 995024a..0cd5f8e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -74,4 +74,5 @@ documentation = "https://github.com/bellingcat/auto-archiver"
 [tool.pytest.ini_options]
 markers = [
     "download: marks tests that download content from the network",
+    "incremental: marks a class to run tests incrementally. If a test fails in the class, the remaining tests will be marked as xfail",
 ]
\ No newline at end of file
diff --git a/tests/archivers/test_twitter_api_archiver.py b/tests/archivers/test_twitter_api_archiver.py
index fae1780..c8009f1 100644
--- a/tests/archivers/test_twitter_api_archiver.py
+++ b/tests/archivers/test_twitter_api_archiver.py
@@ -14,7 +14,7 @@ class TestTwitterApiArchiver(TestArchiverBase):
     archiver_class = TwitterApiArchiver
     config = {
         "bearer_tokens": [],
-        "bearer_token": os.environ.get("TWITTER_BEARER_TOKEN"),
+        "bearer_token": os.environ.get("TWITTER_BEARER_TOKEN", "TEST_KEY"),
         "consumer_key": os.environ.get("TWITTER_CONSUMER_KEY"),
         "consumer_secret": os.environ.get("TWITTER_CONSUMER_SECRET"),
         "access_token": os.environ.get("TWITTER_ACCESS_TOKEN"),

From 59eb8f752081208757743e0f622bd50682dd5d97 Mon Sep 17 00:00:00 2001
From: Patrick Robertson <robertson.patrick@gmail.com>
Date: Fri, 17 Jan 2025 12:04:40 +0100
Subject: [PATCH 08/20] Add TWITTER_BEARER_TOKEN to env for running download
 tests

---
 .github/workflows/tests-download.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/tests-download.yaml b/.github/workflows/tests-download.yaml
index fc31f42..a68231f 100644
--- a/.github/workflows/tests-download.yaml
+++ b/.github/workflows/tests-download.yaml
@@ -36,3 +36,5 @@ jobs:
 
       - name: Run Download Tests
         run: poetry run pytest -ra -v -x -m "download"
+        env:
+          TWITTER_BEARER_TOKEN: ${{ secrets.TWITTER_BEARER_TOKEN }}

From 5b20288d06a86e6a5e17eb2a6b0fc74d7a39cb33 Mon Sep 17 00:00:00 2001
From: Patrick Robertson <robertson.patrick@gmail.com>
Date: Fri, 17 Jan 2025 16:59:57 +0100
Subject: [PATCH 09/20] Add a 'version' arg to get the current running version

---
 src/auto_archiver/core/config.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py
index 9bce88f..380319c 100644
--- a/src/auto_archiver/core/config.py
+++ b/src/auto_archiver/core/config.py
@@ -1,6 +1,6 @@
-
-
-import argparse, yaml
+import importlib.metadata
+import argparse
+import yaml
 from dataclasses import dataclass, field
 from typing import List
 from collections import defaultdict
@@ -54,6 +54,7 @@ class Config:
             )
 
             parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml')
+            parser.add_argument('--version', action='version', version=importlib.metadata.version('auto_archiver'))
 
         for configurable in self.configurable_parents:
             child: Step

From 5aa717452e077d4287aad76553b7e1161a9f09e0 Mon Sep 17 00:00:00 2001
From: Patrick Robertson <robertson.patrick@gmail.com>
Date: Fri, 17 Jan 2025 17:02:54 +0100
Subject: [PATCH 10/20] Quick test that the app actually runs in core tests

---
 .github/workflows/tests-core.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/tests-core.yaml b/.github/workflows/tests-core.yaml
index 06c67cb..08219f9 100644
--- a/.github/workflows/tests-core.yaml
+++ b/.github/workflows/tests-core.yaml
@@ -35,4 +35,6 @@ jobs:
         run: poetry install --no-interaction --with dev
 
       - name: Run Core Tests
-        run: poetry run pytest -ra -v -m "not download"
+        run: |
+          poetry run auto-archiver --version || true
+          poetry run pytest -ra -v -m "not download"

From 9c5a9e1bcd440be197b70576483e38b03318493c Mon Sep 17 00:00:00 2001
From: Patrick Robertson <robertson.patrick@gmail.com>
Date: Fri, 17 Jan 2025 17:06:04 +0100
Subject: [PATCH 11/20] Rename BaseArchiver to GenericArchiver + some other
 tidyups

---
 src/auto_archiver/archivers/__init__.py                      | 2 +-
 src/auto_archiver/archivers/base_archiver/__init__.py        | 1 -
 src/auto_archiver/archivers/generic_archiver/__init__.py     | 1 +
 .../archivers/{base_archiver => generic_archiver}/bluesky.py | 0
 .../generic_archiver.py}                                     | 5 +++--
 .../archivers/{base_archiver => generic_archiver}/twitter.py | 0
 src/auto_archiver/archivers/youtubedl_archiver.py            | 2 +-
 7 files changed, 6 insertions(+), 5 deletions(-)
 delete mode 100644 src/auto_archiver/archivers/base_archiver/__init__.py
 create mode 100644 src/auto_archiver/archivers/generic_archiver/__init__.py
 rename src/auto_archiver/archivers/{base_archiver => generic_archiver}/bluesky.py (100%)
 rename src/auto_archiver/archivers/{base_archiver/base_archiver.py => generic_archiver/generic_archiver.py} (99%)
 rename src/auto_archiver/archivers/{base_archiver => generic_archiver}/twitter.py (100%)

diff --git a/src/auto_archiver/archivers/__init__.py b/src/auto_archiver/archivers/__init__.py
index 24dde91..dc28269 100644
--- a/src/auto_archiver/archivers/__init__.py
+++ b/src/auto_archiver/archivers/__init__.py
@@ -5,5 +5,5 @@ from .instagram_archiver import InstagramArchiver
 from .instagram_tbot_archiver import InstagramTbotArchiver
 from .telegram_archiver import TelegramArchiver
 from .vk_archiver import VkArchiver
-from .base_archiver.base_archiver import BaseArchiver as YoutubeDLArchiver
+from .generic_archiver.generic_archiver import GenericArchiver as YoutubeDLArchiver
 from .instagram_api_archiver import InstagramAPIArchiver
\ No newline at end of file
diff --git a/src/auto_archiver/archivers/base_archiver/__init__.py b/src/auto_archiver/archivers/base_archiver/__init__.py
deleted file mode 100644
index 15ee4eb..0000000
--- a/src/auto_archiver/archivers/base_archiver/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .base_archiver import BaseArchiver
\ No newline at end of file
diff --git a/src/auto_archiver/archivers/generic_archiver/__init__.py b/src/auto_archiver/archivers/generic_archiver/__init__.py
new file mode 100644
index 0000000..0788ae0
--- /dev/null
+++ b/src/auto_archiver/archivers/generic_archiver/__init__.py
@@ -0,0 +1 @@
+from .generic_archiver import GenericArchiver
\ No newline at end of file
diff --git a/src/auto_archiver/archivers/base_archiver/bluesky.py b/src/auto_archiver/archivers/generic_archiver/bluesky.py
similarity index 100%
rename from src/auto_archiver/archivers/base_archiver/bluesky.py
rename to src/auto_archiver/archivers/generic_archiver/bluesky.py
diff --git a/src/auto_archiver/archivers/base_archiver/base_archiver.py b/src/auto_archiver/archivers/generic_archiver/generic_archiver.py
similarity index 99%
rename from src/auto_archiver/archivers/base_archiver/base_archiver.py
rename to src/auto_archiver/archivers/generic_archiver/generic_archiver.py
index b1cbabd..573f47f 100644
--- a/src/auto_archiver/archivers/base_archiver/base_archiver.py
+++ b/src/auto_archiver/archivers/generic_archiver/generic_archiver.py
@@ -9,7 +9,7 @@ from auto_archiver.archivers.archiver import Archiver
 from ...core import Metadata, Media, ArchivingContext
 
 
-class BaseArchiver(Archiver):
+class GenericArchiver(Archiver):
     name = "youtubedl_archiver" #left as is for backwards compat
 
     def __init__(self, config: dict) -> None:
@@ -76,7 +76,8 @@ class BaseArchiver(Archiver):
             return base_keys + [] 
         elif extractor_key == "Bluesky":
             # bluesky API response for non video URLs is already clean, nothing to add
-            return base_keys + [] 
+            return base_keys + []
+        
         
         return base_keys
     
diff --git a/src/auto_archiver/archivers/base_archiver/twitter.py b/src/auto_archiver/archivers/generic_archiver/twitter.py
similarity index 100%
rename from src/auto_archiver/archivers/base_archiver/twitter.py
rename to src/auto_archiver/archivers/generic_archiver/twitter.py
diff --git a/src/auto_archiver/archivers/youtubedl_archiver.py b/src/auto_archiver/archivers/youtubedl_archiver.py
index 1bc8966..8b61974 100644
--- a/src/auto_archiver/archivers/youtubedl_archiver.py
+++ b/src/auto_archiver/archivers/youtubedl_archiver.py
@@ -1,2 +1,2 @@
 # temporary hack, as we implement module
-from .youtubedl_archiver import *
+from .generic_archiver.generic_archiver import GenericArchiver as YoutubeDLArchiver

From d4893ee05ebf3ee4e433127cf4a7ff7cd194c220 Mon Sep 17 00:00:00 2001
From: Patrick Robertson <robertson.patrick@gmail.com>
Date: Fri, 17 Jan 2025 17:08:00 +0100
Subject: [PATCH 12/20] Fix unit tests for base_archiver->generic_archiver
 rename

---
 tests/archivers/test_base_archiver.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/archivers/test_base_archiver.py b/tests/archivers/test_base_archiver.py
index e58ca30..03a7a9b 100644
--- a/tests/archivers/test_base_archiver.py
+++ b/tests/archivers/test_base_archiver.py
@@ -2,14 +2,14 @@ import pytest
 from pathlib import Path
 import datetime 
 
-from auto_archiver.archivers.base_archiver import BaseArchiver
+from auto_archiver.archivers.generic_archiver import GenericArchiver
 
 from .test_archiver_base import TestArchiverBase
 
-class TestBaseArchiver(TestArchiverBase):
+class TestGenericArchiver(TestArchiverBase):
     """Tests Base Archiver
     """
-    archiver_class = BaseArchiver
+    archiver_class = GenericArchiver
     config = {
         'subtitles': False,
         'comments': False,

From befc92deb46651d6c86e9c45b44cf010acb19c82 Mon Sep 17 00:00:00 2001
From: Patrick Robertson <robertson.patrick@gmail.com>
Date: Fri, 17 Jan 2025 17:29:13 +0100
Subject: [PATCH 13/20] Further unit test tidy ups

---
 ...e_archiver.py => test_generic_archiver.py} |  3 ++-
 tests/archivers/test_twitter_api_archiver.py  | 22 +++++++++++--------
 2 files changed, 15 insertions(+), 10 deletions(-)
 rename tests/archivers/{test_base_archiver.py => test_generic_archiver.py} (99%)

diff --git a/tests/archivers/test_base_archiver.py b/tests/archivers/test_generic_archiver.py
similarity index 99%
rename from tests/archivers/test_base_archiver.py
rename to tests/archivers/test_generic_archiver.py
index 03a7a9b..b6f460e 100644
--- a/tests/archivers/test_base_archiver.py
+++ b/tests/archivers/test_generic_archiver.py
@@ -1,6 +1,7 @@
 import pytest
 from pathlib import Path
 import datetime 
+import os
 
 from auto_archiver.archivers.generic_archiver import GenericArchiver
 
@@ -44,7 +45,7 @@ class TestGenericArchiver(TestArchiverBase):
         item = make_item("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970")
         result = self.archiver.download(item)
         assert result.get_url() == "https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970"
-    
+
     @pytest.mark.download
     def test_youtube_download(self, make_item):
         # url https://www.youtube.com/watch?v=5qap5aO4i9A
diff --git a/tests/archivers/test_twitter_api_archiver.py b/tests/archivers/test_twitter_api_archiver.py
index c8009f1..a95f2c7 100644
--- a/tests/archivers/test_twitter_api_archiver.py
+++ b/tests/archivers/test_twitter_api_archiver.py
@@ -53,14 +53,15 @@ class TestTwitterApiArchiver(TestArchiverBase):
         chosen_variant = self.archiver.choose_variant(variant_list)
         assert chosen_variant == variant_list[3]
     
-
+    @pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
     @pytest.mark.download
     def test_download_nonexistent_tweet(self, make_item):
         # this tweet does not exist
         url = "https://x.com/Bellingcat/status/17197025860711058"
         response = self.archiver.download(make_item(url))
         assert not response
-    
+
+    @pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
     @pytest.mark.download
     def test_download_malformed_tweetid(self, make_item):
         # this tweet does not exist
@@ -68,6 +69,7 @@ class TestTwitterApiArchiver(TestArchiverBase):
         response = self.archiver.download(make_item(url))
         assert not response
 
+    @pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
     @pytest.mark.download
     def test_download_tweet_no_media(self, make_item):
         
@@ -81,6 +83,7 @@ class TestTwitterApiArchiver(TestArchiverBase):
             "twitter-api: success"
         )
 
+    @pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
     @pytest.mark.download
     def test_download_video(self, make_item):
         url = "https://x.com/bellingcat/status/1871552600346415571"
@@ -91,14 +94,15 @@ class TestTwitterApiArchiver(TestArchiverBase):
             datetime.datetime(2024, 12, 24, 13, 44, 46, tzinfo=datetime.timezone.utc)
         )
 
-    @pytest.mark.parametrize("url, title, timestamp, image_hash", [
-            ("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity https://t.co/t3u0hQsSB1", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
-            ("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence https://t.co/syYDSkpjZD", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
-            ("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive https://t.co/XE7cRdjzYq", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
-            ("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity https://t.co/YxCFbbhYE3", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "image_hash"),
+    @pytest.mark.skipif(not os.environ.get("TWITTER_BEARER_TOKEN"), reason="No Twitter bearer token provided")
+    @pytest.mark.parametrize("url, title, timestamp, image_src", [
+            ("https://x.com/SozinhoRamalho/status/1876710769913450647", "ignore tweet, testing sensitivity warning nudity https://t.co/t3u0hQsSB1", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "https://pbs.twimg.com/media/GgtqkomWkAAHUUl.jpg"),
+            ("https://x.com/SozinhoRamalho/status/1876710875475681357", "ignore tweet, testing sensitivity warning violence https://t.co/syYDSkpjZD", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "https://pbs.twimg.com/media/GgtqkomWkAAHUUl.jpg"),
+            ("https://x.com/SozinhoRamalho/status/1876711053813227618", "ignore tweet, testing sensitivity warning sensitive https://t.co/XE7cRdjzYq", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "https://pbs.twimg.com/media/GgtqkomWkAAHUUl.jpg"),
+            ("https://x.com/SozinhoRamalho/status/1876711141314801937", "ignore tweet, testing sensitivity warning nudity, violence, sensitivity https://t.co/YxCFbbhYE3", datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc), "https://pbs.twimg.com/media/GgtqkomWkAAHUUl.jpg"),
         ])
     @pytest.mark.download
-    def test_download_sensitive_media(self, url, title, timestamp, image_hash, make_item):
+    def test_download_sensitive_media(self, url, title, timestamp, image_src, make_item):
 
         """Download tweets with sensitive media"""
 
@@ -109,4 +113,4 @@ class TestTwitterApiArchiver(TestArchiverBase):
             timestamp
         )
         assert len(post.media) == 1
-        assert post.media[0].hash == image_hash
\ No newline at end of file
+        assert post.media[0].get('src') == image_src
\ No newline at end of file

From fd2e7f973b75d7bc5fe8cf8956464da283a34ff3 Mon Sep 17 00:00:00 2001
From: Patrick Robertson <robertson.patrick@gmail.com>
Date: Mon, 20 Jan 2025 16:17:57 +0100
Subject: [PATCH 14/20] Further tidy-ups, also adds some ytdlp utils to 'utils'

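---
NOTE(review), to confirm before merging:
* bluesky.py: the return value of archiver.download_from_url() is appended
  as-is to a list annotated list[Media] (PATCH 15 wraps the same call in
  Media(...)), and the "src" the removed helper recorded is no longer set.
* truth.py: the key list names 'replies_count' twice and looks up
  'in_reply_to_account', which the sample payload in the docstring does not
  contain (it has 'in_reply_to_account_id').
* generic_archiver.py: exc_info=True is a stdlib-logging keyword; with loguru
  the traceback is normally requested via logger.opt(exception=True).
* test_archiver_base.py: archiver_class is annotated str but is called and
  read for .name, i.e. it holds an Archiver subclass.
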
 .../archivers/generic_archiver/bluesky.py     | 27 +++------
 .../generic_archiver/generic_archiver.py      | 55 +++++++++----------
 .../archivers/generic_archiver/truth.py       | 31 +++++++++++
 src/auto_archiver/utils/__init__.py           |  5 +-
 tests/archivers/test_archiver_base.py         |  8 +--
 tests/archivers/test_generic_archiver.py      | 55 ++++++++++++++++++-
 6 files changed, 123 insertions(+), 58 deletions(-)
 create mode 100644 src/auto_archiver/archivers/generic_archiver/truth.py

diff --git a/src/auto_archiver/archivers/generic_archiver/bluesky.py b/src/auto_archiver/archivers/generic_archiver/bluesky.py
index 176808b..684124b 100644
--- a/src/auto_archiver/archivers/generic_archiver/bluesky.py
+++ b/src/auto_archiver/archivers/generic_archiver/bluesky.py
@@ -18,13 +18,13 @@ def create_metadata(post: dict, archiver: Archiver, url: str) -> Metadata:
         if v: result.set(k, v)
 
     # download if embeds present (1 video XOR >=1 images)
-    for media in _download_bsky_embeds(post):
+    for media in _download_bsky_embeds(post, archiver):
         result.add_media(media)
     logger.debug(f"Downloaded {len(result.media)} media files")
 
     return result
 
-def _download_bsky_embeds(post: dict) -> list[Media]:
+def _download_bsky_embeds(post: dict, archiver: Archiver) -> list[Media]:
     """
     Iterates over image(s) or video in a Bluesky post and downloads them        
     """
@@ -33,30 +33,17 @@ def _download_bsky_embeds(post: dict) -> list[Media]:
     image_medias = embed.get("images", []) + embed.get("media", {}).get("images", [])
     video_medias = [e for e in [embed.get("video"), embed.get("media", {}).get("video")] if e]
 
+    media_url = "https://bsky.social/xrpc/com.atproto.sync.getBlob?cid={}&did={}"
     for image_media in image_medias:
-        image_media = _download_bsky_file_as_media(image_media["image"]["ref"]["$link"], post["author"]["did"])
+        url = media_url.format(image_media['image']['ref']['$link'], post['author']['did'])
+        image_media = archiver.download_from_url(url)
         media.append(image_media)
     for video_media in video_medias:
-        video_media = _download_bsky_file_as_media(video_media["ref"]["$link"], post["author"]["did"])
+        url = media_url.format(video_media['ref']['$link'], post['author']['did'])
+        video_media = archiver.download_from_url(url)
         media.append(video_media)
     return media
 
-def _download_bsky_file_as_media(cid: str, did: str) -> Media:
-    """
-    Uses the Bluesky API to download a file by its `cid` and `did`.
-    """
-    # TODO: replace with self.download_from_url once that function has been cleaned-up
-    file_url = f"https://bsky.social/xrpc/com.atproto.sync.getBlob?cid={cid}&did={did}"
-    response = requests.get(file_url, stream=True)
-    response.raise_for_status()
-    ext = mimetypes.guess_extension(response.headers["Content-Type"])
-    filename = os.path.join(ArchivingContext.get_tmp_dir(), f"{cid}{ext}")
-    with open(filename, "wb") as f:
-        for chunk in response.iter_content(chunk_size=8192):
-            f.write(chunk)
-    media = Media(filename=filename)
-    media.set("src", file_url)
-    return media
 
 def _get_post_data(post: dict) -> dict:
     """
diff --git a/src/auto_archiver/archivers/generic_archiver/generic_archiver.py b/src/auto_archiver/archivers/generic_archiver/generic_archiver.py
index 573f47f..00119f7 100644
--- a/src/auto_archiver/archivers/generic_archiver/generic_archiver.py
+++ b/src/auto_archiver/archivers/generic_archiver/generic_archiver.py
@@ -4,7 +4,7 @@ from yt_dlp.extractor.common import InfoExtractor
 
 from loguru import logger
 
-from . import bluesky, twitter
+from . import bluesky, twitter, truth
 from auto_archiver.archivers.archiver import Archiver
 from ...core import Metadata, Media, ArchivingContext
 
@@ -91,13 +91,6 @@ class GenericArchiver(Archiver):
 
         # keep both 'title' and 'fulltitle', but prefer 'title', falling back to 'fulltitle' if it doesn't exist
         result.set_title(video_data.pop('title', video_data.pop('fulltitle', "")))
-
-        # then add the platform specific additional metadata
-        for key, mapping in self.video_data_metadata_mapping(extractor_key, video_data).items():
-            if isinstance(mapping, str):
-                result.set(key, eval(f"video_data{mapping}"))
-            elif callable(mapping):
-                result.set(key, mapping(video_data))
         result.set_url(url)
 
         # extract comments if enabled
@@ -126,13 +119,6 @@ class GenericArchiver(Archiver):
                 result.set(k, v)
 
         return result
-
-    def video_data_metadata_mapping(self, extractor_key: str, video_data: dict) -> dict:
-        """
-        Returns a key->value mapping to map from the yt-dlp produced 'video_data' to the Metadata object.
-        Can be either a string for direct mapping, or a function, or a lambda.
-        """
-        return {}
     
     def suitable_extractors(self, url: str) -> list[str]:
         """
@@ -148,14 +134,20 @@ class GenericArchiver(Archiver):
         """
         return any(self.suitable_extractors(url))
 
-    def create_metadata_for_post(self, info_extractor: InfoExtractor, video_data: dict, url: str) -> Metadata:
+    def create_metadata_for_post(self, info_extractor: InfoExtractor, post_data: dict, url: str) -> Metadata:
         """
-        Standardizes the output of the ytdlp InfoExtractor to a common format
+        Standardizes the output of the 'post' data from a ytdlp InfoExtractor to Metadata object.
+
+        This is only required for platforms that don't have videos, and therefore cannot be converted into ytdlp valid 'video_data'.
+        In these instances, we need to use the extractor's _extract_post (or similar) method to get the post metadata, and then convert
+        it into a Metadata object via a platform-specific function.
         """
         if info_extractor.ie_key() == 'Bluesky':
-            return bluesky.create_metadata(video_data, self, url)
+            return bluesky.create_metadata(post_data, self, url)
         if info_extractor.ie_key() == 'Twitter':
-            return twitter.create_metadata(video_data, self, url)
+            return twitter.create_metadata(post_data, self, url)
+        if info_extractor.ie_key() == 'Truth':
+            return truth.create_metadata(post_data, self, url)
 
     def get_metatdata_for_post(self, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
         """
@@ -174,23 +166,22 @@ class GenericArchiver(Archiver):
             twid = ie_instance._match_valid_url(url).group('id')
             # TODO: if ytdlp PR https://github.com/yt-dlp/yt-dlp/pull/12098 is merged, change to _extract_post
             post_data = ie_instance._extract_status(twid=twid)
-
-        elif info_extractor.ie_key() == 'TikTok':
-            pass
-
+        elif info_extractor.ie_key() == 'Truth':
+            video_id = ie_instance._match_id(url)
+            truthsocial_url = f'https://truthsocial.com/api/v1/statuses/{video_id}'
+            post_data = ie_instance._download_json(truthsocial_url, video_id)
         else:
             # lame attempt at trying to get data for an unknown extractor
             # TODO: test some more video platforms and see if there's any improvement to be made
             try:
                 post_data = ie_instance._extract_post(url)
             except (NotImplementedError, AttributeError) as e:
-                logger.debug(f"Extractor {info_extractor.ie_key()} does not support extracting post info: {e}")
+                logger.debug(f"Extractor {info_extractor.ie_key()} does not support extracting post info from non-video URLs: {e}")
                 return False
 
         return self.create_metadata_for_post(ie_instance, post_data, url)
         
     def get_metatdata_for_video(self, info: dict, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
-
         # this time download
         ydl.params['getcomments'] = self.comments
         #TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
@@ -250,12 +241,16 @@ class GenericArchiver(Archiver):
             # it's a valid video, that the youtubdedl can download out of the box
             result = self.get_metatdata_for_video(info, info_extractor, url, ydl)
 
-        except yt_dlp.utils.DownloadError as e:
-            logger.debug(f'No video found, attempting to use extractor directly: {e}')
-            result = self.get_metatdata_for_post(info_extractor, url, ydl)
         except Exception as e:
-            logger.debug(f'ytdlp exception which is normal for example a facebook page with images only will cause a IndexError: list index out of range. Exception is: \n  {e}')
-            return False
+            logger.debug(f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use extractor to get post data instead')
+            try:
+                result = self.get_metatdata_for_post(info_extractor, url, ydl)
+            except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
+                logger.error(f'Error downloading metadata for post: {post_e}')
+                return False
+            except Exception as generic_e:
+                logger.debug(f'Attempt to extract using ytdlp extractor "{info_extractor.IE_NAME}" failed:  \n  {repr(generic_e)}', exc_info=True)
+                return False
         
         if result:
             extractor_name = "yt-dlp"
diff --git a/src/auto_archiver/archivers/generic_archiver/truth.py b/src/auto_archiver/archivers/generic_archiver/truth.py
new file mode 100644
index 0000000..780a56e
--- /dev/null
+++ b/src/auto_archiver/archivers/generic_archiver/truth.py
@@ -0,0 +1,31 @@
+import datetime
+
+from auto_archiver.utils import clean_html, traverse_obj
+from auto_archiver.core.metadata import Metadata
+from auto_archiver.archivers.archiver import Archiver
+
+def create_metadata(post: dict, archiver: Archiver, url: str) -> Metadata:
+    """
+    Creates metaata from a truth social post
+    
+    Only used for posts that contains no media. ytdlp.TruthIE extractor can handle posts with media
+    
+    Format is:
+    
+    {'id': '109598702184774628', 'created_at': '2022-12-29T19:51:18.161Z', 'in_reply_to_id': None, 'quote_id': None, 'in_reply_to_account_id': None, 'sensitive': False, 'spoiler_text': '', 'visibility': 'public', 'language': 'en', 'uri': 'https://truthsocial.com/@bbcnewa/109598702184774628', 'url': 'https://truthsocial.com/@bbcnewa/109598702184774628', 'content': '<p>Pele, regarded by many as football\'s greatest ever player, has died in Brazil at the age of 82. <a href="https://www.bbc.com/sport/football/42751517" rel="nofollow noopener noreferrer" target="_blank"><span class="invisible">https://www.</span><span class="ellipsis">bbc.com/sport/football/4275151</span><span class="invisible">7</span></a></p>', 'account': {'id': '107905163010312793', 'username': 'bbcnewa', 'acct': 'bbcnewa', 'display_name': 'BBC News', 'locked': False, 'bot': False, 'discoverable': True, 'group': False, 'created_at': '2022-03-05T17:42:01.159Z', 'note': '<p>News, features and analysis by the BBC</p>', 'url': 'https://truthsocial.com/@bbcnewa', 'avatar': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/avatars/107/905/163/010/312/793/original/e7c07550dc22c23a.jpeg', 'avatar_static': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/avatars/107/905/163/010/312/793/original/e7c07550dc22c23a.jpeg', 'header': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/headers/107/905/163/010/312/793/original/a00eeec2b57206c7.jpeg', 'header_static': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/headers/107/905/163/010/312/793/original/a00eeec2b57206c7.jpeg', 'followers_count': 1131, 'following_count': 3, 'statuses_count': 9, 'last_status_at': '2024-11-12', 'verified': False, 'location': '', 'website': 'https://www.bbc.com/news', 'unauth_visibility': True, 'chats_onboarded': True, 'feeds_onboarded': True, 'accepting_messages': False, 'show_nonmember_group_statuses': None, 'emojis': [], 'fields': [], 
'tv_onboarded': True, 'tv_account': False}, 'media_attachments': [], 'mentions': [], 'tags': [], 'card': None, 'group': None, 'quote': None, 'in_reply_to': None, 'reblog': None, 'sponsored': False, 'replies_count': 1, 'reblogs_count': 0, 'favourites_count': 2, 'favourited': False, 'reblogged': False, 'muted': False, 'pinned': False, 'bookmarked': False, 'poll': None, 'emojis': []}
+    """
+    result = Metadata()
+    result.set_url(url)
+    timestamp = post['created_at'] # format is 2022-12-29T19:51:18.161Z
+    result.set_timestamp(datetime.datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%fZ"))
+    result.set('description', post['content'])
+    result.set('author', post['account']['username'])
+
+    for key in ['replies_count', 'reblogs_count', 'favourites_count', ('account', 'followers_count'), ('account', 'following_count'), ('account', 'statuses_count'), ('account', 'display_name'), 'language', 'in_reply_to_account', 'replies_count']:
+        if isinstance(key, tuple):
+            store_key = u" ".join(key)
+        else:
+            store_key = key
+        result.set(store_key, traverse_obj(post, key))
+
+    return result
\ No newline at end of file
diff --git a/src/auto_archiver/utils/__init__.py b/src/auto_archiver/utils/__init__.py
index fe5cb58..50bddca 100644
--- a/src/auto_archiver/utils/__init__.py
+++ b/src/auto_archiver/utils/__init__.py
@@ -4,4 +4,7 @@ from .misc import *
 from .webdriver import Webdriver
 from .gsheet import Gsheets
 from .url import UrlUtil
-from .atlos import get_atlos_config_options
\ No newline at end of file
+from .atlos import get_atlos_config_options
+
+# handy utils from ytdlp
+from yt_dlp.utils import (clean_html, traverse_obj, strip_or_none)
\ No newline at end of file
diff --git a/tests/archivers/test_archiver_base.py b/tests/archivers/test_archiver_base.py
index ed77739..d793706 100644
--- a/tests/archivers/test_archiver_base.py
+++ b/tests/archivers/test_archiver_base.py
@@ -3,17 +3,17 @@ import pytest
 from auto_archiver.core import Metadata
 from auto_archiver.core import Step
 from auto_archiver.core.metadata import Metadata
-
+from auto_archiver.archivers.archiver import Archiver
 class TestArchiverBase(object):
 
-    archiver_class = None
-    config = None
+    archiver_class: str = None
+    config: dict = None
 
     @pytest.fixture(autouse=True)
     def setup_archiver(self):
         assert self.archiver_class is not None, "self.archiver_class must be set on the subclass"
         assert self.config is not None, "self.config must be a dict set on the subclass"
-        self.archiver = self.archiver_class({self.archiver_class.name: self.config})
+        self.archiver: Archiver = self.archiver_class({self.archiver_class.name: self.config})
     
     def assertValidResponseMetadata(self, test_response: Metadata, title: str, timestamp: str, status: str = ""):
         assert test_response is not False
diff --git a/tests/archivers/test_generic_archiver.py b/tests/archivers/test_generic_archiver.py
index b6f460e..a35d28d 100644
--- a/tests/archivers/test_generic_archiver.py
+++ b/tests/archivers/test_generic_archiver.py
@@ -46,6 +46,23 @@ class TestGenericArchiver(TestArchiverBase):
         result = self.archiver.download(item)
         assert result.get_url() == "https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970"
 
+    @pytest.mark.download
+    @pytest.mark.parametrize("url", [
+        "https://bsky.app/profile/colborne.bsky.social/post/3lcxcpgt6j42l",
+        "twitter.com/bellingcat/status/123",
+        "https://www.youtube.com/watch?v=1"
+    ])
+    def test_download_nonexistend_media(self, make_item, url):
+        """
+        Test to make sure that the extractor doesn't break on non-existend posts/media
+
+        It should return 'False'
+        """
+        item = make_item(url)
+        result = self.archiver.download(item)
+        assert not result
+
+
     @pytest.mark.download
     def test_youtube_download(self, make_item):
         # url https://www.youtube.com/watch?v=5qap5aO4i9A
@@ -60,14 +77,13 @@ class TestGenericArchiver(TestArchiverBase):
 
     @pytest.mark.download
     def test_bluesky_download_multiple_images(self, make_item):
-        item = make_item("https://bsky.app/profile/colborne.bsky.social/post/3lec2bqjc5s2y")
+        item = make_item("https://bsky.app/profile/bellingcat.com/post/3lffjoxcu7k2w")
         result = self.archiver.download(item)
         assert result is not False
 
-    @pytest.mark.skip("ytdlp supports bluesky, but there's currently no way to extract info from pages without videos")
     @pytest.mark.download
     def test_bluesky_download_single_image(self, make_item):
-        item = make_item("https://bsky.app/profile/colborne.bsky.social/post/3lcxcpgt6j42l")
+        item = make_item("https://bsky.app/profile/bellingcat.com/post/3lfn3hbcxgc2q")
         result = self.archiver.download(item)
         assert result is not False
     
@@ -82,6 +98,39 @@ class TestGenericArchiver(TestArchiverBase):
         item = make_item("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")
         result = self.archiver.download(item)
         assert result is not False
+    
+    @pytest.mark.download
+    def test_truthsocial_download_video(self, make_item):
+        item = make_item("https://truthsocial.com/@DaynaTrueman/posts/110602446619561579")
+        result = self.archiver.download(item)
+        assert len(result.media) == 1
+        assert result is not False
+
+    @pytest.mark.download
+    def test_truthsocial_download_no_media(self, make_item):
+        item = make_item("https://truthsocial.com/@bbcnewa/posts/109598702184774628")
+        result = self.archiver.download(item)
+        assert result is not False
+    
+    @pytest.mark.download
+    def test_truthsocial_download_poll(self, make_item):
+        item = make_item("https://truthsocial.com/@CNN_US/posts/113724326568555098")
+        result = self.archiver.download(item)
+        assert result is not False
+    
+    @pytest.mark.download
+    def test_truthsocial_download_single_image(self, make_item):
+        item = make_item("https://truthsocial.com/@mariabartiromo/posts/113861116433335006")
+        result = self.archiver.download(item)
+        assert len(result.media) == 1
+        assert result is not False
+
+    @pytest.mark.skip("Currently failing, multiple images are not being downloaded - this is due to an issue with ytdlp extractor")
+    @pytest.mark.download
+    def test_truthsocial_download_multiple_images(self, make_item):
+        item = make_item("https://truthsocial.com/@trrth/posts/113861302149349135")
+        result = self.archiver.download(item)
+        assert len(result.media) == 3
 
     @pytest.mark.download
     def test_twitter_download_nonexistend_tweet(self, make_item):

From dff01056594e0c145ed73d823eddb21e1bbcfff9 Mon Sep 17 00:00:00 2001
From: Patrick Robertson <robertson.patrick@gmail.com>
Date: Mon, 20 Jan 2025 18:40:46 +0100
Subject: [PATCH 15/20] Small fixups + implement Truth code for posts with
 multiple media

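---
NOTE(review): the truth.py hunk below adds a bare `breakpoint()` as the first
statement of create_metadata(). It looks like a debugging leftover: unless
PYTHONBREAKPOINT=0 is set, every Truth post will stop in the debugger.
The docstring in the same hunk still says the function is only used for posts
without media, although this commit sends every Truth URL down the post-data
path and downloads each entry of media_attachments.
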
 .../generic_archiver/generic_archiver.py      | 10 ++++++++--
 .../archivers/generic_archiver/truth.py       | 20 +++++++++++++------
 tests/archivers/test_generic_archiver.py      |  1 -
 3 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/src/auto_archiver/archivers/generic_archiver/generic_archiver.py b/src/auto_archiver/archivers/generic_archiver/generic_archiver.py
index 00119f7..41f1314 100644
--- a/src/auto_archiver/archivers/generic_archiver/generic_archiver.py
+++ b/src/auto_archiver/archivers/generic_archiver/generic_archiver.py
@@ -181,7 +181,8 @@ class GenericArchiver(Archiver):
 
         return self.create_metadata_for_post(ie_instance, post_data, url)
         
-    def get_metatdata_for_video(self, info: dict, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
+    def get_metadata_for_video(self, info: dict, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
+
         # this time download
         ydl.params['getcomments'] = self.comments
         #TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
@@ -233,13 +234,18 @@ class GenericArchiver(Archiver):
         result = False
 
         try:
+            if info_extractor.ie_key() == "Truth":
+                # the ytdlp truth extractor currently only gets the first image/video in the 'media' section, as opposed to all of them
+                # we don't want this
+                raise yt_dlp.utils.ExtractorError("Use the 'post data' method for Truth posts")
+
             # don't download since it can be a live stream
             info = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
             if info.get('is_live', False) and not self.livestreams:
                 logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
                 return False
             # it's a valid video, that the youtubdedl can download out of the box
-            result = self.get_metatdata_for_video(info, info_extractor, url, ydl)
+            result = self.get_metadata_for_video(info, info_extractor, url, ydl)
 
         except Exception as e:
             logger.debug(f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use extractor to get post data instead')
diff --git a/src/auto_archiver/archivers/generic_archiver/truth.py b/src/auto_archiver/archivers/generic_archiver/truth.py
index 780a56e..00551f3 100644
--- a/src/auto_archiver/archivers/generic_archiver/truth.py
+++ b/src/auto_archiver/archivers/generic_archiver/truth.py
@@ -1,31 +1,39 @@
 import datetime
 
-from auto_archiver.utils import clean_html, traverse_obj
-from auto_archiver.core.metadata import Metadata
+from auto_archiver.utils import traverse_obj
+from auto_archiver.core.metadata import Metadata, Media
 from auto_archiver.archivers.archiver import Archiver
 
+from dateutil.parser import parse as parse_dt
+
 def create_metadata(post: dict, archiver: Archiver, url: str) -> Metadata:
     """
-    Creates metaata from a truth social post
+    Creates metadata from a truth social post
     
-    Only used for posts that contains no media. ytdlp.TruthIE extractor can handle posts with media
+    Only used for posts that contain no media. ytdlp.TruthIE extractor can handle posts with media
     
     Format is:
     
     {'id': '109598702184774628', 'created_at': '2022-12-29T19:51:18.161Z', 'in_reply_to_id': None, 'quote_id': None, 'in_reply_to_account_id': None, 'sensitive': False, 'spoiler_text': '', 'visibility': 'public', 'language': 'en', 'uri': 'https://truthsocial.com/@bbcnewa/109598702184774628', 'url': 'https://truthsocial.com/@bbcnewa/109598702184774628', 'content': '<p>Pele, regarded by many as football\'s greatest ever player, has died in Brazil at the age of 82. <a href="https://www.bbc.com/sport/football/42751517" rel="nofollow noopener noreferrer" target="_blank"><span class="invisible">https://www.</span><span class="ellipsis">bbc.com/sport/football/4275151</span><span class="invisible">7</span></a></p>', 'account': {'id': '107905163010312793', 'username': 'bbcnewa', 'acct': 'bbcnewa', 'display_name': 'BBC News', 'locked': False, 'bot': False, 'discoverable': True, 'group': False, 'created_at': '2022-03-05T17:42:01.159Z', 'note': '<p>News, features and analysis by the BBC</p>', 'url': 'https://truthsocial.com/@bbcnewa', 'avatar': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/avatars/107/905/163/010/312/793/original/e7c07550dc22c23a.jpeg', 'avatar_static': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/avatars/107/905/163/010/312/793/original/e7c07550dc22c23a.jpeg', 'header': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/headers/107/905/163/010/312/793/original/a00eeec2b57206c7.jpeg', 'header_static': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/headers/107/905/163/010/312/793/original/a00eeec2b57206c7.jpeg', 'followers_count': 1131, 'following_count': 3, 'statuses_count': 9, 'last_status_at': '2024-11-12', 'verified': False, 'location': '', 'website': 'https://www.bbc.com/news', 'unauth_visibility': True, 'chats_onboarded': True, 'feeds_onboarded': True, 'accepting_messages': False, 'show_nonmember_group_statuses': None, 'emojis': [], 'fields': [], 
'tv_onboarded': True, 'tv_account': False}, 'media_attachments': [], 'mentions': [], 'tags': [], 'card': None, 'group': None, 'quote': None, 'in_reply_to': None, 'reblog': None, 'sponsored': False, 'replies_count': 1, 'reblogs_count': 0, 'favourites_count': 2, 'favourited': False, 'reblogged': False, 'muted': False, 'pinned': False, 'bookmarked': False, 'poll': None, 'emojis': []}
     """
+    breakpoint()
     result = Metadata()
     result.set_url(url)
     timestamp = post['created_at'] # format is 2022-12-29T19:51:18.161Z
-    result.set_timestamp(datetime.datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%fZ"))
+    result.set_timestamp(parse_dt(timestamp))
     result.set('description', post['content'])
     result.set('author', post['account']['username'])
 
     for key in ['replies_count', 'reblogs_count', 'favourites_count', ('account', 'followers_count'), ('account', 'following_count'), ('account', 'statuses_count'), ('account', 'display_name'), 'language', 'in_reply_to_account', 'replies_count']:
         if isinstance(key, tuple):
-            store_key = u" ".join(key)
+            store_key = " ".join(key)
         else:
             store_key = key
         result.set(store_key, traverse_obj(post, key))
+    
+    # add the media
+    for media in post.get('media_attachments', []):
+        filename = archiver.download_from_url(media['url'])
+        result.add_media(Media(filename), id=media.get('id'))
 
     return result
\ No newline at end of file
diff --git a/tests/archivers/test_generic_archiver.py b/tests/archivers/test_generic_archiver.py
index a35d28d..d493437 100644
--- a/tests/archivers/test_generic_archiver.py
+++ b/tests/archivers/test_generic_archiver.py
@@ -125,7 +125,6 @@ class TestGenericArchiver(TestArchiverBase):
         assert len(result.media) == 1
         assert result is not False
 
-    @pytest.mark.skip("Currently failing, multiple images are not being downloaded - this is due to an issue with ytdlp extractor")
     @pytest.mark.download
     def test_truthsocial_download_multiple_images(self, make_item):
         item = make_item("https://truthsocial.com/@trrth/posts/113861302149349135")

From 4bb4ebdf823819ea05bcb976dd5adf32ed88efeb Mon Sep 17 00:00:00 2001
From: Patrick Robertson <robertson.patrick@gmail.com>
Date: Tue, 21 Jan 2025 16:36:45 +0100
Subject: [PATCH 16/20] Further cleanup, abstracts 'dropins' out into generic
 files

---
 .../archivers/generic_archiver/bluesky.py     | 132 ++++++------
 .../archivers/generic_archiver/dropin.py      |  58 +++++
 .../generic_archiver/generic_archiver.py      | 201 +++++++++---------
 .../archivers/generic_archiver/truth.py       |  73 ++++---
 .../archivers/generic_archiver/twitter.py     | 108 +++++-----
 src/auto_archiver/feeders/csv_feeder.py       |  41 ++++
 src/auto_archiver/utils/__init__.py           |   2 +-
 7 files changed, 371 insertions(+), 244 deletions(-)
 create mode 100644 src/auto_archiver/archivers/generic_archiver/dropin.py
 create mode 100644 src/auto_archiver/feeders/csv_feeder.py

diff --git a/src/auto_archiver/archivers/generic_archiver/bluesky.py b/src/auto_archiver/archivers/generic_archiver/bluesky.py
index 684124b..821d777 100644
--- a/src/auto_archiver/archivers/generic_archiver/bluesky.py
+++ b/src/auto_archiver/archivers/generic_archiver/bluesky.py
@@ -7,69 +7,75 @@ from loguru import logger
 from auto_archiver.core.context import ArchivingContext
 from auto_archiver.archivers.archiver import Archiver
 from auto_archiver.core.metadata import Metadata, Media
+from .dropin import GenericDropin, InfoExtractor
+
+class Bluesky(GenericDropin):
+
+    def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
+        result = Metadata()
+        result.set_url(url)
+        result.set_title(post["record"]["text"])
+        result.set_timestamp(post["record"]["createdAt"])
+        for k, v in self._get_post_data(post).items():
+            if v: result.set(k, v)
+
+        # download if embeds present (1 video XOR >=1 images)
+        for media in self._download_bsky_embeds(post, archiver):
+            result.add_media(media)
+        logger.debug(f"Downloaded {len(result.media)} media files")
+
+        return result
+
+    def extract_post(self, url: str, ie_instance: InfoExtractor) -> dict:
+        handle, video_id = ie_instance._match_valid_url(url).group('handle', 'id')
+        return ie_instance._extract_post(handle=handle, post_id=video_id)
+
+    def _download_bsky_embeds(self, post: dict, archiver: Archiver) -> list[Media]:
+        """
+        Iterates over image(s) or video in a Bluesky post and downloads them        
+        """
+        media = []
+        embed = post.get("record", {}).get("embed", {})
+        image_medias = embed.get("images", []) + embed.get("media", {}).get("images", [])
+        video_medias = [e for e in [embed.get("video"), embed.get("media", {}).get("video")] if e]
+
+        media_url = "https://bsky.social/xrpc/com.atproto.sync.getBlob?cid={}&did={}"
+        for image_media in image_medias:
+            url = media_url.format(image_media['image']['ref']['$link'], post['author']['did'])
+            image_media = archiver.download_from_url(url)
+            media.append(image_media)
+        for video_media in video_medias:
+            url = media_url.format(video_media['ref']['$link'], post['author']['did'])
+            video_media = archiver.download_from_url(url)
+            media.append(video_media)
+        return media
 
 
-def create_metadata(post: dict, archiver: Archiver, url: str) -> Metadata:
-    result = Metadata()
-    result.set_url(url)
-    result.set_title(post["record"]["text"])
-    result.set_timestamp(post["record"]["createdAt"])
-    for k, v in _get_post_data(post).items():
-        if v: result.set(k, v)
+    def _get_post_data(self, post: dict) -> dict:
+        """
+        Extracts relevant information returned by the .getPostThread api call (excluding text/created_at): author, mentions, tags, links.
+        """
+        author = post["author"]
+        if "labels" in author and not author["labels"]:
+            del author["labels"]
+        if "associated" in author:
+            del author["associated"]
 
-    # download if embeds present (1 video XOR >=1 images)
-    for media in _download_bsky_embeds(post, archiver):
-        result.add_media(media)
-    logger.debug(f"Downloaded {len(result.media)} media files")
-
-    return result
-
-def _download_bsky_embeds(post: dict, archiver: Archiver) -> list[Media]:
-    """
-    Iterates over image(s) or video in a Bluesky post and downloads them        
-    """
-    media = []
-    embed = post.get("record", {}).get("embed", {})
-    image_medias = embed.get("images", []) + embed.get("media", {}).get("images", [])
-    video_medias = [e for e in [embed.get("video"), embed.get("media", {}).get("video")] if e]
-
-    media_url = "https://bsky.social/xrpc/com.atproto.sync.getBlob?cid={}&did={}"
-    for image_media in image_medias:
-        url = media_url.format(image_media['image']['ref']['$link'], post['author']['did'])
-        image_media = archiver.download_from_url(url)
-        media.append(image_media)
-    for video_media in video_medias:
-        url = media_url.format(video_media['ref']['$link'], post['author']['did'])
-        video_media = archiver.download_from_url(url)
-        media.append(video_media)
-    return media
-
-
-def _get_post_data(post: dict) -> dict:
-    """
-    Extracts relevant information returned by the .getPostThread api call (excluding text/created_at): author, mentions, tags, links.
-    """
-    author = post["author"]
-    if "labels" in author and not author["labels"]:
-        del author["labels"]
-    if "associated" in author:
-        del author["associated"]
-
-    mentions, tags, links = [], [], []
-    facets = post.get("record", {}).get("facets", [])
-    for f in facets:
-        for feature in f["features"]:
-            if feature["$type"] == "app.bsky.richtext.facet#mention":
-                mentions.append(feature["did"])
-            elif feature["$type"] == "app.bsky.richtext.facet#tag":
-                tags.append(feature["tag"])
-            elif feature["$type"] == "app.bsky.richtext.facet#link":
-                links.append(feature["uri"])
-    res = {"author": author}
-    if mentions:
-        res["mentions"] = mentions
-    if tags:
-        res["tags"] = tags
-    if links:
-        res["links"] = links
-    return res
\ No newline at end of file
+        mentions, tags, links = [], [], []
+        facets = post.get("record", {}).get("facets", [])
+        for f in facets:
+            for feature in f["features"]:
+                if feature["$type"] == "app.bsky.richtext.facet#mention":
+                    mentions.append(feature["did"])
+                elif feature["$type"] == "app.bsky.richtext.facet#tag":
+                    tags.append(feature["tag"])
+                elif feature["$type"] == "app.bsky.richtext.facet#link":
+                    links.append(feature["uri"])
+        res = {"author": author}
+        if mentions:
+            res["mentions"] = mentions
+        if tags:
+            res["tags"] = tags
+        if links:
+            res["links"] = links
+        return res
\ No newline at end of file
diff --git a/src/auto_archiver/archivers/generic_archiver/dropin.py b/src/auto_archiver/archivers/generic_archiver/dropin.py
new file mode 100644
index 0000000..37f3faf
--- /dev/null
+++ b/src/auto_archiver/archivers/generic_archiver/dropin.py
@@ -0,0 +1,58 @@
+from yt_dlp.extractor.common import InfoExtractor
+from auto_archiver.core.metadata import Metadata
+from auto_archiver.archivers.archiver import Archiver
+
+class GenericDropin:
+    """Base class for dropins for the generic extractor.
+    
+    In many instances, an extractor will exist in ytdlp, but it will only process videos.
+    Dropins can be created and used to make use of the already-written private code of a 
+    specific extractor from ytdlp.
+
+    The dropin should be able to handle the following methods:
+
+    - `extract_post`: This method should be able to extract the post data from the url and return it as a dict.
+    - `create_metadata`: This method should be able to create a Metadata object from a post dict.
+
+    Optional methods include:
+
+    - `skip_ytdlp_download`: If you want to skip the ytdlp 'download' method altogether, and do your own, then return True for this method.
+                             This is useful in cases where ytdlp might not work properly for all of your posts
+    - `keys_to_clean`: for the generic 'video_data' created by ytdlp (for video URLs), any additional fields you would like to clean out of the data before storing in metadata
+
+
+    """
+
+    def extract_post(self, url: str, ie_instance: InfoExtractor):
+        """
+        This method should return the post data from the url.
+        """
+        raise NotImplementedError("This method should be implemented in the subclass")
+    
+
+    def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
+        """
+        This method should create a Metadata object from the post data.
+        """
+        raise NotImplementedError("This method should be implemented in the subclass")
+    
+
+    def skip_ytdlp_download(self, url: str, ie_instance: InfoExtractor):
+        """
+        This method should return True if you want to skip the ytdlp download method.
+        """
+        return False
+    
+    def keys_to_clean(self, video_data: dict, info_extractor: InfoExtractor):
+        """
+        This method should return a list of strings (keys) to clean from the video_data dict.
+
+        E.g. ["uploader", "uploader_id", "tiktok_specific_field"]
+        """
+        return []
+    
+    def download_additional_media(self, video_data: dict, info_extractor: InfoExtractor, metadata: Metadata):
+        """
+        This method should download any additional media from the post.
+        """
+        return metadata
\ No newline at end of file
diff --git a/src/auto_archiver/archivers/generic_archiver/generic_archiver.py b/src/auto_archiver/archivers/generic_archiver/generic_archiver.py
index 41f1314..511c7e4 100644
--- a/src/auto_archiver/archivers/generic_archiver/generic_archiver.py
+++ b/src/auto_archiver/archivers/generic_archiver/generic_archiver.py
@@ -1,16 +1,16 @@
 import datetime, os, yt_dlp, pysubs2
+import importlib
 from typing import Type
 from yt_dlp.extractor.common import InfoExtractor
 
 from loguru import logger
 
-from . import bluesky, twitter, truth
 from auto_archiver.archivers.archiver import Archiver
 from ...core import Metadata, Media, ArchivingContext
 
-
 class GenericArchiver(Archiver):
     name = "youtubedl_archiver" #left as is for backwards compat
+    _dropins = {}
 
     def __init__(self, config: dict) -> None:
         super().__init__(config)
@@ -22,23 +22,22 @@ class GenericArchiver(Archiver):
         self.allow_playlist = bool(self.allow_playlist)
         self.max_downloads = self.max_downloads
 
-    @staticmethod
-    def configs() -> dict:
-        return {
-            "facebook_cookie": {"default": None, "help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'"},
-            "subtitles": {"default": True, "help": "download subtitles if available"},
-            "comments": {"default": False, "help": "download all comments if available, may lead to large metadata"},
-            "livestreams": {"default": False, "help": "if set, will download live streams, otherwise will skip them; see --max-filesize for more control"},
-            "live_from_start": {"default": False, "help": "if set, will download live streams from their earliest available moment, otherwise starts now."},
-            "proxy": {"default": "", "help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port"},
-            "end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."},
-            'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."},
-            "max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
-            "cookies_from_browser": {"default": None, "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"},
-            "cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"},
-        }
+
+    def suitable_extractors(self, url: str) -> list[str]:
+        """
+        Returns a list of valid extractors for the given URL"""
+        for info_extractor in yt_dlp.YoutubeDL()._ies.values():
+            if info_extractor.suitable(url) and info_extractor.working():
+                yield info_extractor
+        
+    def suitable(self, url: str) -> bool:
+        """
+        Checks for valid URLs out of all ytdlp extractors.
+        Returns False for the GenericIE, which as labelled by yt-dlp: 'Generic downloader that works on some sites'
+        """
+        return any(self.suitable_extractors(url))
     
-    def download_additional_media(self, extractor_key: str, video_data: dict, metadata: Metadata) -> Metadata:
+    def download_additional_media(self, video_data: dict, info_extractor: InfoExtractor, metadata: Metadata) -> Metadata:
         """
         Downloads additional media like images, comments, subtitles, etc.
 
@@ -56,11 +55,18 @@ class GenericArchiver(Archiver):
             except Exception as e:
                 logger.error(f"Error downloading cover image {thumbnail_url}: {e}")
 
+        dropin = self.dropin_for_extractor(info_extractor)
+        if dropin:
+            try:
+                metadata = dropin.download_additional_media(video_data, info_extractor, metadata)
+            except AttributeError:
+                pass
+
         return metadata
 
-    def keys_to_clean(self, extractor_key: str, video_data: dict) -> dict:
+    def keys_to_clean(self, info_extractor: InfoExtractor, video_data: dict) -> dict:
         """
-        Clean up the video data to make it more readable and remove unnecessary keys that ytdlp adds
+        Clean up the ytdlp generic video data to make it more readable and remove unnecessary keys that ytdlp adds
         """
 
         base_keys = ['formats', 'thumbnail', 'display_id', 'epoch', 'requested_downloads',
@@ -71,23 +77,23 @@ class GenericArchiver(Archiver):
                      'channel_id', 'subtitles', 'tbr', 'url', 'original_url', 'automatic_captions', 'playable_in_embed', 'live_status',
                      '_format_sort_fields', 'chapters', 'requested_formats', 'format_note',
                      'audio_channels', 'asr', 'fps', 'was_live', 'is_live', 'heatmap', 'age_limit', 'stretched_ratio']
-        if extractor_key == 'TikTok':
-            # Tiktok: only has videos so a valid ytdlp `video_data` object is returned. Base keys are enough
-            return base_keys + [] 
-        elif extractor_key == "Bluesky":
-            # bluesky API response for non video URLs is already clean, nothing to add
-            return base_keys + []
-        
         
+        dropin = self.dropin_for_extractor(info_extractor)
+        if dropin:
+            try:
+                base_keys += dropin.keys_to_clean(video_data, info_extractor)
+            except AttributeError:
+                pass
+
         return base_keys
     
-    def add_metadata(self, extractor_key: str, video_data: dict, url:str, result: Metadata) -> Metadata:
+    def add_metadata(self, video_data: dict, info_extractor: InfoExtractor, url:str, result: Metadata) -> Metadata:
         """
-        Creates a Metadata object from the give video_data
+        Creates a Metadata object from the given video_data
         """
 
         # first add the media
-        result = self.download_additional_media(extractor_key, video_data, result)
+        result = self.download_additional_media(video_data, info_extractor, result)
 
         # keep both 'title' and 'fulltitle', but prefer 'title', falling back to 'fulltitle' if it doesn't exist
         result.set_title(video_data.pop('title', video_data.pop('fulltitle', "")))
@@ -110,7 +116,7 @@ class GenericArchiver(Archiver):
             result.set("upload_date", upload_date)
         
         # then clean away any keys we don't want
-        for clean_key in self.keys_to_clean(extractor_key, video_data):
+        for clean_key in self.keys_to_clean(info_extractor, video_data):
             video_data.pop(clean_key, None)
         
         # then add the rest of the video data
@@ -119,35 +125,6 @@ class GenericArchiver(Archiver):
                 result.set(k, v)
 
         return result
-    
-    def suitable_extractors(self, url: str) -> list[str]:
-        """
-        Returns a list of valid extractors for the given URL"""
-        for info_extractor in yt_dlp.YoutubeDL()._ies.values():
-            if info_extractor.suitable(url) and info_extractor.working():
-                yield info_extractor
-        
-    def suitable(self, url: str) -> bool:
-        """
-        Checks for valid URLs out of all ytdlp extractors.
-        Returns False for the GenericIE, which as labelled by yt-dlp: 'Generic downloader that works on some sites'
-        """
-        return any(self.suitable_extractors(url))
-
-    def create_metadata_for_post(self, info_extractor: InfoExtractor, post_data: dict, url: str) -> Metadata:
-        """
-        Standardizes the output of the 'post' data from a ytdlp InfoExtractor to Metadata object.
-
-        This is only required for platforms that don't have videos, and therefore cannot be converted into ytdlp valid 'video_data'.
-        In these instances, we need to use the extractor's _extract_post (or similar) method to get the post metadata, and then convert
-        it into a Metadata object via a platform-specific function.
-        """
-        if info_extractor.ie_key() == 'Bluesky':
-            return bluesky.create_metadata(post_data, self, url)
-        if info_extractor.ie_key() == 'Twitter':
-            return twitter.create_metadata(post_data, self, url)
-        if info_extractor.ie_key() == 'Truth':
-            return truth.create_metadata(post_data, self, url)
 
     def get_metatdata_for_post(self, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
         """
@@ -156,45 +133,29 @@ class GenericArchiver(Archiver):
 
         ie_instance = info_extractor(downloader=ydl)
         post_data = None
-
-        if info_extractor.ie_key() == 'Bluesky':
-            # bluesky kwargs are handle, video_id
-            handle, video_id = ie_instance._match_valid_url(url).group('handle', 'id')
-            post_data = ie_instance._extract_post(handle=handle, post_id=video_id)
-        elif info_extractor.ie_key() == 'Twitter':
-            # twitter kwargs are tweet_id
-            twid = ie_instance._match_valid_url(url).group('id')
-            # TODO: if ytdlp PR https://github.com/yt-dlp/yt-dlp/pull/12098 is merged, change to _extract_post
-            post_data = ie_instance._extract_status(twid=twid)
-        elif info_extractor.ie_key() == 'Truth':
-            video_id = ie_instance._match_id(url)
-            truthsocial_url = f'https://truthsocial.com/api/v1/statuses/{video_id}'
-            post_data = ie_instance._download_json(truthsocial_url, video_id)
-        else:
-            # lame attempt at trying to get data for an unknown extractor
-            # TODO: test some more video platforms and see if there's any improvement to be made
-            try:
-                post_data = ie_instance._extract_post(url)
-            except (NotImplementedError, AttributeError) as e:
-                logger.debug(f"Extractor {info_extractor.ie_key()} does not support extracting post info from non-video URLs: {e}")
-                return False
-
-        return self.create_metadata_for_post(ie_instance, post_data, url)
+        dropin = self.dropin_for_extractor(info_extractor)
+        if not dropin:
+            # TODO: add a proper link to 'how to create your own dropin'
+            logger.debug(f"""Could not find valid dropin for {info_extractor.IE_NAME}.
+                     Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""")
+            return False
         
-    def get_metadata_for_video(self, info: dict, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
+        post_data = dropin.extract_post(url, ie_instance)
+        return dropin.create_metadata(post_data, ie_instance, self, url)
+
+    def get_metadata_for_video(self, data: dict, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
 
         # this time download
         ydl.params['getcomments'] = self.comments
         #TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
-        info = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=True)
-        if "entries" in info:
-            entries = info.get("entries", [])
+        data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=True)
+        if "entries" in data:
+            entries = data.get("entries", [])
             if not len(entries):
                 logger.warning('YoutubeDLArchiver could not find any video')
                 return False
-        else: entries = [info]
+        else: entries = [data]
 
-        extractor_key = info['extractor_key']
         result = Metadata()
 
         for entry in entries:
@@ -209,7 +170,7 @@ class GenericArchiver(Archiver):
 
                 # read text from subtitles if enabled
                 if self.subtitles:
-                    for lang, val in (info.get('requested_subtitles') or {}).items():
+                    for lang, val in (data.get('requested_subtitles') or {}).items():
                         try:    
                             subs = pysubs2.load(val.get('filepath'), encoding="utf-8")
                             text = " ".join([line.text for line in subs])
@@ -220,9 +181,49 @@ class GenericArchiver(Archiver):
             except Exception as e:
                 logger.error(f"Error processing entry {entry}: {e}")
 
-        return self.add_metadata(extractor_key, info, url, result)
+        return self.add_metadata(data, info_extractor, url, result)
+    
+    def dropin_for_extractor(self, info_extractor: Type[InfoExtractor], additional_paths = []):
+        dropin_name = info_extractor.ie_key().lower()
 
-    def download_for_extractor(self, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
+        if dropin_name == "generic":
+            # no need for a dropin for the generic extractor (?)
+            return None
+
+        dropin_class_name = dropin_name.title()
+        def _load_dropin(dropin):
+            dropin_class = getattr(dropin, dropin_class_name)()
+            return self._dropins.setdefault(dropin_name, dropin_class)
+
+        try:
+            return self._dropins[dropin_name]
+        except KeyError:
+            pass
+
+        # TODO: user should be able to pass --dropins="/some/folder,/other/folder" as a cmd line option
+        # which would allow the user to override the default dropins/add their own
+        paths = [] + additional_paths
+        for path in paths:
+            dropin_path = os.path.join(path, f"{dropin_name}.py")
+            dropin_spec = importlib.util.spec_from_file_location(dropin_name, dropin_path)
+            if not dropin_spec:
+                continue
+            try:
+                dropin = importlib.util.module_from_spec(dropin_spec)
+                dropin_spec.loader.exec_module(dropin)
+                return _load_dropin(dropin)
+            except (FileNotFoundError, ModuleNotFoundError):
+                pass
+        
+        # fallback to loading the dropins within auto-archiver
+        try:
+            return _load_dropin(importlib.import_module(f".{dropin_name}", package=__package__))
+        except ModuleNotFoundError:
+            pass
+
+        return None
+
+    def download_for_extractor(self, info_extractor: InfoExtractor, url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
         """
         Tries to download the given url using the specified extractor
         
@@ -233,19 +234,19 @@ class GenericArchiver(Archiver):
         ydl.params['getcomments'] = False
         result = False
 
+        dropin_submodule = self.dropin_for_extractor(info_extractor)
+
         try:
-            if info_extractor.ie_key() == "Truth":
-                # the ytdlp truth extractor currently only gets the first image/video in the 'media' section, as opposed to all of them
-                # we don't want this
-                raise yt_dlp.utils.ExtractorError("Use the 'post data' method for Truth posts")
+            if dropin_submodule and dropin_submodule.skip_ytdlp_download(info_extractor, url):
+                raise Exception(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}")
 
             # don't download since it can be a live stream
-            info = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
-            if info.get('is_live', False) and not self.livestreams:
+            data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
+            if data.get('is_live', False) and not self.livestreams:
                 logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
                 return False
             # it's a valid video, that the youtubdedl can download out of the box
-            result = self.get_metadata_for_video(info, info_extractor, url, ydl)
+            result = self.get_metadata_for_video(data, info_extractor, url, ydl)
 
         except Exception as e:
             logger.debug(f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use extractor to get post data instead')
diff --git a/src/auto_archiver/archivers/generic_archiver/truth.py b/src/auto_archiver/archivers/generic_archiver/truth.py
index 00551f3..bf19dce 100644
--- a/src/auto_archiver/archivers/generic_archiver/truth.py
+++ b/src/auto_archiver/archivers/generic_archiver/truth.py
@@ -1,39 +1,52 @@
-import datetime
+from typing import Type
 
 from auto_archiver.utils import traverse_obj
 from auto_archiver.core.metadata import Metadata, Media
 from auto_archiver.archivers.archiver import Archiver
+from yt_dlp.extractor.common import InfoExtractor
 
 from dateutil.parser import parse as parse_dt
 
-def create_metadata(post: dict, archiver: Archiver, url: str) -> Metadata:
-    """
-    Creates metadata from a truth social post
-    
-    Only used for posts that contain no media. ytdlp.TruthIE extractor can handle posts with media
-    
-    Format is:
-    
-    {'id': '109598702184774628', 'created_at': '2022-12-29T19:51:18.161Z', 'in_reply_to_id': None, 'quote_id': None, 'in_reply_to_account_id': None, 'sensitive': False, 'spoiler_text': '', 'visibility': 'public', 'language': 'en', 'uri': 'https://truthsocial.com/@bbcnewa/109598702184774628', 'url': 'https://truthsocial.com/@bbcnewa/109598702184774628', 'content': '<p>Pele, regarded by many as football\'s greatest ever player, has died in Brazil at the age of 82. <a href="https://www.bbc.com/sport/football/42751517" rel="nofollow noopener noreferrer" target="_blank"><span class="invisible">https://www.</span><span class="ellipsis">bbc.com/sport/football/4275151</span><span class="invisible">7</span></a></p>', 'account': {'id': '107905163010312793', 'username': 'bbcnewa', 'acct': 'bbcnewa', 'display_name': 'BBC News', 'locked': False, 'bot': False, 'discoverable': True, 'group': False, 'created_at': '2022-03-05T17:42:01.159Z', 'note': '<p>News, features and analysis by the BBC</p>', 'url': 'https://truthsocial.com/@bbcnewa', 'avatar': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/avatars/107/905/163/010/312/793/original/e7c07550dc22c23a.jpeg', 'avatar_static': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/avatars/107/905/163/010/312/793/original/e7c07550dc22c23a.jpeg', 'header': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/headers/107/905/163/010/312/793/original/a00eeec2b57206c7.jpeg', 'header_static': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/headers/107/905/163/010/312/793/original/a00eeec2b57206c7.jpeg', 'followers_count': 1131, 'following_count': 3, 'statuses_count': 9, 'last_status_at': '2024-11-12', 'verified': False, 'location': '', 'website': 'https://www.bbc.com/news', 'unauth_visibility': True, 'chats_onboarded': True, 'feeds_onboarded': True, 'accepting_messages': False, 'show_nonmember_group_statuses': None, 'emojis': [], 'fields': [], 
'tv_onboarded': True, 'tv_account': False}, 'media_attachments': [], 'mentions': [], 'tags': [], 'card': None, 'group': None, 'quote': None, 'in_reply_to': None, 'reblog': None, 'sponsored': False, 'replies_count': 1, 'reblogs_count': 0, 'favourites_count': 2, 'favourited': False, 'reblogged': False, 'muted': False, 'pinned': False, 'bookmarked': False, 'poll': None, 'emojis': []}
-    """
-    breakpoint()
-    result = Metadata()
-    result.set_url(url)
-    timestamp = post['created_at'] # format is 2022-12-29T19:51:18.161Z
-    result.set_timestamp(parse_dt(timestamp))
-    result.set('description', post['content'])
-    result.set('author', post['account']['username'])
+from .dropin import GenericDropin
 
-    for key in ['replies_count', 'reblogs_count', 'favourites_count', ('account', 'followers_count'), ('account', 'following_count'), ('account', 'statuses_count'), ('account', 'display_name'), 'language', 'in_reply_to_account', 'replies_count']:
-        if isinstance(key, tuple):
-            store_key = " ".join(key)
-        else:
-            store_key = key
-        result.set(store_key, traverse_obj(post, key))
-    
-    # add the media
-    for media in post.get('media_attachments', []):
-        filename = archiver.download_from_url(media['url'])
-        result.add_media(Media(filename), id=media.get('id'))
+class Truth(GenericDropin):
 
-    return result
\ No newline at end of file
+    def extract_post(self, url, ie_instance: InfoExtractor) -> dict:
+        video_id = ie_instance._match_id(url)
+        truthsocial_url = f'https://truthsocial.com/api/v1/statuses/{video_id}'
+        return ie_instance._download_json(truthsocial_url, video_id)
+
+    def skip_ytdlp_download(self, url, ie_instance: Type[InfoExtractor]) -> bool:
+        return True
+
+    def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
+        """
+        Creates metadata from a truth social post
+        
+        Used for all Truth Social posts: the ytdlp download is skipped (see `skip_ytdlp_download`), and any media attachments are downloaded here.
+        
+        Format is:
+        
+        {'id': '109598702184774628', 'created_at': '2022-12-29T19:51:18.161Z', 'in_reply_to_id': None, 'quote_id': None, 'in_reply_to_account_id': None, 'sensitive': False, 'spoiler_text': '', 'visibility': 'public', 'language': 'en', 'uri': 'https://truthsocial.com/@bbcnewa/109598702184774628', 'url': 'https://truthsocial.com/@bbcnewa/109598702184774628', 'content': '<p>Pele, regarded by many as football\'s greatest ever player, has died in Brazil at the age of 82. <a href="https://www.bbc.com/sport/football/42751517" rel="nofollow noopener noreferrer" target="_blank"><span class="invisible">https://www.</span><span class="ellipsis">bbc.com/sport/football/4275151</span><span class="invisible">7</span></a></p>', 'account': {'id': '107905163010312793', 'username': 'bbcnewa', 'acct': 'bbcnewa', 'display_name': 'BBC News', 'locked': False, 'bot': False, 'discoverable': True, 'group': False, 'created_at': '2022-03-05T17:42:01.159Z', 'note': '<p>News, features and analysis by the BBC</p>', 'url': 'https://truthsocial.com/@bbcnewa', 'avatar': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/avatars/107/905/163/010/312/793/original/e7c07550dc22c23a.jpeg', 'avatar_static': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/avatars/107/905/163/010/312/793/original/e7c07550dc22c23a.jpeg', 'header': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/headers/107/905/163/010/312/793/original/a00eeec2b57206c7.jpeg', 'header_static': 'https://static-assets-1.truthsocial.com/tmtg:prime-ts-assets/accounts/headers/107/905/163/010/312/793/original/a00eeec2b57206c7.jpeg', 'followers_count': 1131, 'following_count': 3, 'statuses_count': 9, 'last_status_at': '2024-11-12', 'verified': False, 'location': '', 'website': 'https://www.bbc.com/news', 'unauth_visibility': True, 'chats_onboarded': True, 'feeds_onboarded': True, 'accepting_messages': False, 'show_nonmember_group_statuses': None, 'emojis': [], 'fields': [], 
'tv_onboarded': True, 'tv_account': False}, 'media_attachments': [], 'mentions': [], 'tags': [], 'card': None, 'group': None, 'quote': None, 'in_reply_to': None, 'reblog': None, 'sponsored': False, 'replies_count': 1, 'reblogs_count': 0, 'favourites_count': 2, 'favourited': False, 'reblogged': False, 'muted': False, 'pinned': False, 'bookmarked': False, 'poll': None, 'emojis': []}
+        """
+
+        result = Metadata()
+        result.set_url(url)
+        timestamp = post['created_at'] # format is 2022-12-29T19:51:18.161Z
+        result.set_timestamp(parse_dt(timestamp))
+        result.set('description', post['content'])
+        result.set('author', post['account']['username'])
+
+        for key in ['replies_count', 'reblogs_count', 'favourites_count', ('account', 'followers_count'), ('account', 'following_count'), ('account', 'statuses_count'), ('account', 'display_name'), 'language', 'in_reply_to_account', 'replies_count']:
+            if isinstance(key, tuple):
+                store_key = " ".join(key)
+            else:
+                store_key = key
+            result.set(store_key, traverse_obj(post, key))
+        
+        # add the media
+        for media in post.get('media_attachments', []):
+            filename = archiver.download_from_url(media['url'])
+            result.add_media(Media(filename), id=media.get('id'))
+
+        return result
\ No newline at end of file
diff --git a/src/auto_archiver/archivers/generic_archiver/twitter.py b/src/auto_archiver/archivers/generic_archiver/twitter.py
index 8cc323c..ce6c28d 100644
--- a/src/auto_archiver/archivers/generic_archiver/twitter.py
+++ b/src/auto_archiver/archivers/generic_archiver/twitter.py
@@ -8,55 +8,63 @@ from auto_archiver.core.metadata import Metadata, Media
 from auto_archiver.utils import UrlUtil
 from auto_archiver.archivers.archiver import Archiver
 
+from .dropin import GenericDropin, InfoExtractor
 
-def choose_variant(variants):
-    # choosing the highest quality possible
-    variant, width, height = None, 0, 0
-    for var in variants:
-        if var.get("content_type", "") == "video/mp4":
-            width_height = re.search(r"\/(\d+)x(\d+)\/", var["url"])
-            if width_height:
-                w, h = int(width_height[1]), int(width_height[2])
-                if w > width or h > height:
-                    width, height = w, h
-                    variant = var
-        else:
-            variant = var if not variant else variant
-    return variant
+class Twitter(GenericDropin):
 
-def create_metadata(tweet: dict, archiver: Archiver, url: str) -> Metadata:
-    result = Metadata()
-    try:
-        if not tweet.get("user") or not tweet.get("created_at"):
-            raise ValueError(f"Error retreiving post. Are you sure it exists?")
-        timestamp = datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
-    except (ValueError, KeyError) as ex:
-        logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
-        return False
-            
-    result\
-        .set_title(tweet.get('full_text', ''))\
-        .set_content(json.dumps(tweet, ensure_ascii=False))\
-        .set_timestamp(timestamp)
-    if not tweet.get("entities", {}).get("media"):
-        logger.debug('No media found, archiving tweet text only')
-        result.status = "twitter-ytdl"
-        return result
-    for i, tw_media in enumerate(tweet["entities"]["media"]):
-        media = Media(filename="")
-        mimetype = ""
-        if tw_media["type"] == "photo":
-            media.set("src", UrlUtil.twitter_best_quality_url(tw_media['media_url_https']))
-            mimetype = "image/jpeg"
-        elif tw_media["type"] == "video":
-            variant = choose_variant(tw_media['video_info']['variants'])
-            media.set("src", variant['url'])
-            mimetype = variant['content_type']
-        elif tw_media["type"] == "animated_gif":
-            variant = tw_media['video_info']['variants'][0]
-            media.set("src", variant['url'])
-            mimetype = variant['content_type']
-        ext = mimetypes.guess_extension(mimetype)
-        media.filename = archiver.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}')
-        result.add_media(media)
-    return result
\ No newline at end of file
+
+    def choose_variant(self, variants):
+        """Return the mp4 variant whose "/WxH/" URL size beats the current pick in width or height; a non-mp4 variant is only a fallback."""
+        best, best_w, best_h = None, 0, 0
+        for candidate in variants:
+            if candidate.get("content_type", "") != "video/mp4":
+                best = best or candidate
+                continue
+            size = re.search(r"\/(\d+)x(\d+)\/", candidate["url"])
+            if not size:
+                continue
+            w, h = int(size[1]), int(size[2])
+            if w > best_w or h > best_h:
+                best, best_w, best_h = candidate, w, h
+        return best
+    
+    def extract_post(self, url: str, ie_instance: InfoExtractor):
+        """Return what the extractor's `_extract_status` gives for the 'id' group matched in ``url``."""
+        return ie_instance._extract_status(twid=ie_instance._match_valid_url(url).group('id'))
+
+    def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
+        """Build a Metadata from a tweet dict and download each media entry; returns False when user/created_at are missing or unparsable."""
+        result = Metadata()
+        try:
+            if not tweet.get("user") or not tweet.get("created_at"):
+                raise ValueError("Error retreiving post. Are you sure it exists?")
+            timestamp = datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
+        except (ValueError, KeyError) as ex:
+            logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
+            return False
+        result\
+            .set_title(tweet.get('full_text', ''))\
+            .set_content(json.dumps(tweet, ensure_ascii=False))\
+            .set_timestamp(timestamp)
+        if not tweet.get("entities", {}).get("media"):
+            logger.debug('No media found, archiving tweet text only')
+            result.status = "twitter-ytdl"
+            return result
+        for i, tw_media in enumerate(tweet["entities"]["media"]):
+            if tw_media["type"] == "photo":
+                src, mimetype = UrlUtil.twitter_best_quality_url(tw_media['media_url_https']), "image/jpeg"
+            elif tw_media["type"] == "video":
+                variant = self.choose_variant(tw_media['video_info']['variants'])
+                src, mimetype = variant['url'], variant['content_type']
+            elif tw_media["type"] == "animated_gif":
+                variant = tw_media['video_info']['variants'][0]
+                src, mimetype = variant['url'], variant['content_type']
+            else:
+                logger.warning(f"Skipping unsupported twitter media type: {tw_media['type']}")
+                continue  # no source URL is known for this type, so there is nothing to download
+            media = Media(filename="")
+            media.set("src", src)
+            ext = mimetypes.guess_extension(mimetype) or ""  # None for unknown mimetypes; avoid a literal "None" suffix
+            media.filename = archiver.download_from_url(src, f'{slugify(url)}_{i}{ext}')
+            result.add_media(media)
+        return result
\ No newline at end of file
diff --git a/src/auto_archiver/feeders/csv_feeder.py b/src/auto_archiver/feeders/csv_feeder.py
new file mode 100644
index 0000000..00bf7d7
--- /dev/null
+++ b/src/auto_archiver/feeders/csv_feeder.py
@@ -0,0 +1,41 @@
+from loguru import logger
+import csv
+
+from . import Feeder
+from ..core import Metadata, ArchivingContext
+from ..utils import url_or_none
+
+class CSVFeeder(Feeder):
+    """Feeder that yields one Metadata per URL read from a single column of one or more CSV files."""
+    @staticmethod
+    def configs() -> dict:
+        return {
+            "files": {
+                "default": None,
+                "help": "Path to the input file(s) to read the URLs from, comma separated. \
+                        Input files should be formatted with one URL per line",
+                "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
+            },
+            "column": {
+                "default": None,
+                "help": "Column number or name to read the URLs from, 0-indexed",
+            }
+        }
+
+    def __iter__(self) -> Metadata:
+        """Yield a Metadata for each URL in the configured column of every file; a first row without a URL is skipped as a header."""
+        url_column = self.column or 0
+        count = 0  # tallied here because this feeder's configs define no `urls` attribute to take len() of
+        for file in self.files:
+            with open(file, "r") as f:
+                for row_number, row in enumerate(csv.reader(f)):  # plain iteration also copes with an empty file
+                    url = row[url_column]
+                    if row_number == 0 and not url_or_none(url):
+                        logger.debug(f"Skipping header row: {row}")
+                        continue
+                    logger.debug(f"Processing {url}")
+                    count += 1
+                    yield Metadata().set_url(url)
+            ArchivingContext.set("folder", "cli")
+
+        logger.success(f"Processed {count} URL(s)")
\ No newline at end of file
diff --git a/src/auto_archiver/utils/__init__.py b/src/auto_archiver/utils/__init__.py
index 50bddca..36ce765 100644
--- a/src/auto_archiver/utils/__init__.py
+++ b/src/auto_archiver/utils/__init__.py
@@ -7,4 +7,4 @@ from .url import UrlUtil
 from .atlos import get_atlos_config_options
 
 # handy utils from ytdlp
-from yt_dlp.utils import (clean_html, traverse_obj, strip_or_none)
\ No newline at end of file
+from yt_dlp.utils import (clean_html, traverse_obj, strip_or_none, url_or_none)
\ No newline at end of file

From 7c0dcbfd8125cee9b3306e74a9a7e922a006c20d Mon Sep 17 00:00:00 2001
From: Patrick Robertson <robertson.patrick@gmail.com>
Date: Tue, 21 Jan 2025 16:49:30 +0100
Subject: [PATCH 17/20] Re-add doc string to generic_archiver

(renamed from youtube_archiver)
---
 .../generic_archiver/generic_archiver.py      | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/src/auto_archiver/archivers/generic_archiver/generic_archiver.py b/src/auto_archiver/archivers/generic_archiver/generic_archiver.py
index 511c7e4..e339434 100644
--- a/src/auto_archiver/archivers/generic_archiver/generic_archiver.py
+++ b/src/auto_archiver/archivers/generic_archiver/generic_archiver.py
@@ -1,3 +1,27 @@
+"""
+This is the generic archiver used by auto-archiver, which uses `yt-dlp` under the hood.
+
+This module is responsible for downloading and processing media content from platforms
+supported by `yt-dlp`, such as YouTube, Facebook, and others. It provides functionality
+for retrieving videos, subtitles, comments, and other metadata, and it integrates with
+the broader archiving framework.
+
+### Features
+- Supports downloading videos and playlists.
+- Retrieves metadata like titles, descriptions, upload dates, and durations.
+- Downloads subtitles and comments when enabled.
+- Configurable options for handling live streams, proxies, and more.
+
+### Dropins
+- For websites supported by `yt-dlp` that also contain posts in addition to videos
+  (e.g. Facebook, Twitter, Bluesky), dropins can be created to extract post data and create
+  metadata objects. Some dropins are included in this generic_archiver by default, but
+  custom dropins can be created to handle additional websites and passed to the archiver
+  via the command line using the `--dropins` option (TODO!).
+
+"""
+
+
 import datetime, os, yt_dlp, pysubs2
 import importlib
 from typing import Type

From 9dde9b26d0b84f59c6146c504ab1ed348c24c5b6 Mon Sep 17 00:00:00 2001
From: Patrick Robertson <robertson.patrick@gmail.com>
Date: Tue, 21 Jan 2025 16:49:49 +0100
Subject: [PATCH 18/20] Patch in upstream changes to ytdlp for now

Seems like ytdlp may not merge https://github.com/yt-dlp/yt-dlp/pull/12098 anytime soon
---
 .../archivers/generic_archiver/bluesky.py          | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/auto_archiver/archivers/generic_archiver/bluesky.py b/src/auto_archiver/archivers/generic_archiver/bluesky.py
index 821d777..7aa9c39 100644
--- a/src/auto_archiver/archivers/generic_archiver/bluesky.py
+++ b/src/auto_archiver/archivers/generic_archiver/bluesky.py
@@ -27,8 +27,20 @@ class Bluesky(GenericDropin):
         return result
 
     def extract_post(self, url: str, ie_instance: InfoExtractor) -> dict:
+        # TODO: If/when this PR (https://github.com/yt-dlp/yt-dlp/pull/12098) is merged into yt-dlp, uncomment the two lines below and delete the inlined API request that follows them
+        # handle, video_id = ie_instance._match_valid_url(url).group('handle', 'id')
+        # return ie_instance._extract_post(handle=handle, post_id=video_id)
+
         handle, video_id = ie_instance._match_valid_url(url).group('handle', 'id')
-        return ie_instance._extract_post(handle=handle, post_id=video_id)
+        return ie_instance._download_json(
+            'https://public.api.bsky.app/xrpc/app.bsky.feed.getPostThread',
+            video_id, query={
+                'uri': f'at://{handle}/app.bsky.feed.post/{video_id}',
+                'depth': 0,
+                'parentHeight': 0,
+            })['thread']['post']
+
+
 
     def _download_bsky_embeds(self, post: dict, archiver: Archiver) -> list[Media]:
         """

From d3e3eb76395f787911d3e9dbac47facbf020bb56 Mon Sep 17 00:00:00 2001
From: Patrick Robertson <robertson.patrick@gmail.com>
Date: Tue, 21 Jan 2025 16:58:18 +0100
Subject: [PATCH 19/20] unit tests for loading dropins

---
 .../generic_archiver/generic_archiver.py      | 13 ++++++-------
 tests/archivers/test_generic_archiver.py      | 19 ++++++++++++++++---
 tests/data/dropin.py                          |  5 +++++
 3 files changed, 27 insertions(+), 10 deletions(-)
 create mode 100644 tests/data/dropin.py

diff --git a/src/auto_archiver/archivers/generic_archiver/generic_archiver.py b/src/auto_archiver/archivers/generic_archiver/generic_archiver.py
index e339434..59cd3f8 100644
--- a/src/auto_archiver/archivers/generic_archiver/generic_archiver.py
+++ b/src/auto_archiver/archivers/generic_archiver/generic_archiver.py
@@ -79,7 +79,7 @@ class GenericArchiver(Archiver):
             except Exception as e:
                 logger.error(f"Error downloading cover image {thumbnail_url}: {e}")
 
-        dropin = self.dropin_for_extractor(info_extractor)
+        dropin = self.dropin_for_name(info_extractor.ie_key())
         if dropin:
             try:
                 metadata = dropin.download_additional_media(video_data, info_extractor, metadata)
@@ -102,7 +102,7 @@ class GenericArchiver(Archiver):
                      '_format_sort_fields', 'chapters', 'requested_formats', 'format_note',
                      'audio_channels', 'asr', 'fps', 'was_live', 'is_live', 'heatmap', 'age_limit', 'stretched_ratio']
         
-        dropin = self.dropin_for_extractor(info_extractor)
+        dropin = self.dropin_for_name(info_extractor.ie_key())
         if dropin:
             try:
                 base_keys += dropin.keys_to_clean(video_data, info_extractor)
@@ -157,7 +157,7 @@ class GenericArchiver(Archiver):
 
         ie_instance = info_extractor(downloader=ydl)
         post_data = None
-        dropin = self.dropin_for_extractor(info_extractor)
+        dropin = self.dropin_for_name(info_extractor.ie_key())
         if not dropin:
             # TODO: add a proper link to 'how to create your own dropin'
             logger.debug(f"""Could not find valid dropin for {info_extractor.IE_NAME}.
@@ -207,8 +207,7 @@ class GenericArchiver(Archiver):
 
         return self.add_metadata(data, info_extractor, url, result)
     
-    def dropin_for_extractor(self, info_extractor: Type[InfoExtractor], additional_paths = []):
-        dropin_name = info_extractor.ie_key().lower()
+    def dropin_for_name(self, dropin_name: str, additional_paths = [], package=__package__) -> Type[InfoExtractor]:
 
         if dropin_name == "generic":
             # no need for a dropin for the generic extractor (?)
@@ -241,7 +240,7 @@ class GenericArchiver(Archiver):
         
         # fallback to loading the dropins within auto-archiver
         try:
-            return _load_dropin(importlib.import_module(f".{dropin_name}", package=__package__))
+            return _load_dropin(importlib.import_module(f".{dropin_name}", package=package))
         except ModuleNotFoundError:
             pass
 
@@ -258,7 +257,7 @@ class GenericArchiver(Archiver):
         ydl.params['getcomments'] = False
         result = False
 
-        dropin_submodule = self.dropin_for_extractor(info_extractor)
+        dropin_submodule = self.dropin_for_name(info_extractor.ie_key())
 
         try:
             if dropin_submodule and dropin_submodule.skip_ytdlp_download(info_extractor, url):
diff --git a/tests/archivers/test_generic_archiver.py b/tests/archivers/test_generic_archiver.py
index d493437..6e249e8 100644
--- a/tests/archivers/test_generic_archiver.py
+++ b/tests/archivers/test_generic_archiver.py
@@ -1,10 +1,12 @@
-import pytest
 from pathlib import Path
-import datetime 
+import datetime
 import os
 
-from auto_archiver.archivers.generic_archiver import GenericArchiver
+from os.path import dirname
 
+import pytest
+
+from auto_archiver.archivers.generic_archiver import GenericArchiver
 from .test_archiver_base import TestArchiverBase
 
 class TestGenericArchiver(TestArchiverBase):
@@ -23,6 +25,17 @@ class TestGenericArchiver(TestArchiverBase):
         'cookies_from_browser': False,
         'cookie_file': None,
         }
+    
+    def test_load_dropin(self):
+        """A dropin can be resolved by package name and via an extra filesystem path."""
+        builtin_package = "auto_archiver.archivers.generic_archiver"
+        assert self.archiver.dropin_for_name("bluesky", package=builtin_package)
+
+        # test loading dropins via filepath (the dummy module lives in tests/data/)
+        data_dir = os.path.join(dirname(dirname(__file__)), "data/")
+        assert self.archiver.dropin_for_name("dropin", additional_paths=[data_dir])
+
+
 
     @pytest.mark.parametrize("url, is_suitable", [
         ("https://www.youtube.com/watch?v=5qap5aO4i9A", True),
diff --git a/tests/data/dropin.py b/tests/data/dropin.py
new file mode 100644
index 0000000..0049c48
--- /dev/null
+++ b/tests/data/dropin.py
@@ -0,0 +1,5 @@
+# this is a dummy class used to test importing a dropin in the
+#  generic extractor by filename/path
+
+class Dropin:
+    """Empty placeholder class; it exists only so this module can be imported by path in tests."""
\ No newline at end of file

From cd2ae3763fa338ee86621be2f318d061edcd3cbd Mon Sep 17 00:00:00 2001
From: Patrick Robertson <robertson.patrick@gmail.com>
Date: Tue, 21 Jan 2025 16:24:37 +0000
Subject: [PATCH 20/20] Minor adjustments

Co-authored-by: Miguel Sozinho Ramalho <19508417+msramalho@users.noreply.github.com>
---
 .../archivers/generic_archiver/generic_archiver.py           | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/auto_archiver/archivers/generic_archiver/generic_archiver.py b/src/auto_archiver/archivers/generic_archiver/generic_archiver.py
index 59cd3f8..729d6ef 100644
--- a/src/auto_archiver/archivers/generic_archiver/generic_archiver.py
+++ b/src/auto_archiver/archivers/generic_archiver/generic_archiver.py
@@ -150,13 +150,12 @@ class GenericArchiver(Archiver):
 
         return result
 
-    def get_metatdata_for_post(self, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
+    def get_metadata_for_post(self, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
         """
         Calls into the ytdlp InfoExtract subclass to use the prive _extract_post method to get the post metadata.
         """
 
         ie_instance = info_extractor(downloader=ydl)
-        post_data = None
         dropin = self.dropin_for_name(info_extractor.ie_key())
         if not dropin:
             # TODO: add a proper link to 'how to create your own dropin'
@@ -274,7 +273,7 @@ class GenericArchiver(Archiver):
         except Exception as e:
             logger.debug(f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use extractor to get post data instead')
             try:
-                result = self.get_metatdata_for_post(info_extractor, url, ydl)
+                result = self.get_metadata_for_post(info_extractor, url, ydl)
             except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
                 logger.error(f'Error downloading metadata for post: {post_e}')
                 return False