Initial changes to move to '__manifest__' format

2026-06-13 05:38:29 +03:00 · 2025-01-21 19:02:38 +01:00
parent 03f3770223
commit 241b35002c
15 changed files with 211 additions and 121 deletions
--- a/src/auto_archiver/modules/generic_extractor/__init__.py
+++ b/src/auto_archiver/modules/generic_extractor/__init__.py
--- a/src/auto_archiver/modules/generic_extractor/__manifest__.py
+++ b/src/auto_archiver/modules/generic_extractor/__manifest__.py
@@ -0,0 +1,33 @@
+{
+    # Manifest describing the generic extractor module: identity, entry point,
+    # dependencies and a Markdown description.
+    'name': 'Generic Extractor',
+    'version': '0.1.0',
+    'author': 'Bellingcat',
+    # Kind(s) of module this manifest declares.
+    'type': ['extractor'],
+    # Presumably '<module>:<class>' relative to this package (GenericExtractor is
+    # defined in generic_extractor.py) -- TODO confirm the format the loader expects.
+    'entry_point': 'generic_extractor:GenericExtractor',
+    'requires_setup': False,
+    'depends': ['core'],
+    # NOTE(review): generic_extractor.py also imports `pysubs2` and truth.py imports
+    # `dateutil`; confirm whether they should be listed here as well.
+    'external_dependencies': {
+        'python': ['yt_dlp', 'requests', 'loguru', 'slugify'],
+    },
+    # Markdown text describing the module.
+    'description': """
+This is the generic extractor used by auto-archiver, which uses `yt-dlp` under the hood.
+
+This module is responsible for downloading and processing media content from platforms
+supported by `yt-dlp`, such as YouTube, Facebook, and others. It provides functionality
+for retrieving videos, subtitles, comments, and other metadata, and it integrates with
+the broader archiving framework.
+
+### Features
+- Supports downloading videos and playlists.
+- Retrieves metadata like titles, descriptions, upload dates, and durations.
+- Downloads subtitles and comments when enabled.
+- Configurable options for handling live streams, proxies, and more.
+
+### Dropins
+- For websites supported by `yt-dlp` that also contain posts in addition to videos
+ (e.g. Facebook, Twitter, Bluesky), dropins can be created to extract post data and create 
+ metadata objects. Some dropins are included in this generic_archiver by default, but
+custom dropins can be created to handle additional websites and passed to the archiver
+via the command line using the `--dropins` option (TODO!).
+"""
+}
--- a/src/auto_archiver/modules/generic_extractor/bluesky.py
+++ b/src/auto_archiver/modules/generic_extractor/bluesky.py
@@ -0,0 +1,93 @@
+import os
+import mimetypes
+
+import requests
+from loguru import logger
+
+from auto_archiver.core.context import ArchivingContext
+from auto_archiver.archivers.archiver import Archiver
+from auto_archiver.core.metadata import Metadata, Media
+from .dropin import GenericDropin, InfoExtractor
+
+class Bluesky(GenericDropin):
+    """Dropin that turns a Bluesky post (text, facets and embedded media) into a Metadata object."""
+
+    def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
+        """
+        Builds the Metadata for a post dict as returned by `extract_post`.
+
+        The title and timestamp come from `post["record"]`; author/mentions/tags/links come
+        from `_get_post_data`; embedded images/video are downloaded through `archiver`.
+        """
+        result = Metadata()
+        result.set_url(url)
+        result.set_title(post["record"]["text"])
+        result.set_timestamp(post["record"]["createdAt"])
+        for k, v in self._get_post_data(post).items():
+            if v:
+                result.set(k, v)
+
+        # download if embeds present (1 video XOR >=1 images)
+        for media in self._download_bsky_embeds(post, archiver):
+            result.add_media(media)
+        logger.debug(f"Downloaded {len(result.media)} media files")
+
+        return result
+
+    def extract_post(self, url: str, ie_instance: InfoExtractor) -> dict:
+        """
+        Fetches the post via the public `app.bsky.feed.getPostThread` endpoint and returns
+        the `thread.post` object of the response.
+        """
+        # TODO: If/when this PR (https://github.com/yt-dlp/yt-dlp/pull/12098) is merged on ytdlp, remove the comments and delete the code below
+        # handle, video_id = ie_instance._match_valid_url(url).group('handle', 'id')
+        # return ie_instance._extract_post(handle=handle, post_id=video_id)
+
+        handle, video_id = ie_instance._match_valid_url(url).group('handle', 'id')
+        return ie_instance._download_json(
+            'https://public.api.bsky.app/xrpc/app.bsky.feed.getPostThread',
+            video_id, query={
+                'uri': f'at://{handle}/app.bsky.feed.post/{video_id}',
+                'depth': 0,
+                'parentHeight': 0,
+            })['thread']['post']
+
+    def _download_bsky_embeds(self, post: dict, archiver: Archiver) -> list[Media]:
+        """
+        Iterates over image(s) or video in a Bluesky post and downloads them.
+
+        Each downloaded file is wrapped in a `Media` object, matching the declared return
+        type and the way the other extractors in this package treat the value returned by
+        `download_from_url` (a local filename).
+        """
+        media = []
+        embed = post.get("record", {}).get("embed", {})
+        # media can sit directly on the embed or under its nested "media" object
+        nested = embed.get("media", {})
+        image_medias = embed.get("images", []) + nested.get("images", [])
+        video_medias = [e for e in [embed.get("video"), nested.get("video")] if e]
+
+        media_url = "https://bsky.social/xrpc/com.atproto.sync.getBlob?cid={}&did={}"
+        for image_media in image_medias:
+            url = media_url.format(image_media['image']['ref']['$link'], post['author']['did'])
+            media.append(Media(archiver.download_from_url(url)))
+        for video_media in video_medias:
+            url = media_url.format(video_media['ref']['$link'], post['author']['did'])
+            media.append(Media(archiver.download_from_url(url)))
+        return media
+
+    def _get_post_data(self, post: dict) -> dict:
+        """
+        Extracts relevant information returned by the .getPostThread api call (excluding text/created_at): author, mentions, tags, links.
+
+        Returns a dict that always has "author" and has "mentions"/"tags"/"links" only when non-empty.
+        """
+        # work on a shallow copy so the caller's `post` is not modified
+        author = dict(post["author"])
+        if "labels" in author and not author["labels"]:
+            del author["labels"]
+        author.pop("associated", None)
+
+        mentions, tags, links = [], [], []
+        facets = post.get("record", {}).get("facets", [])
+        for f in facets:
+            for feature in f["features"]:
+                if feature["$type"] == "app.bsky.richtext.facet#mention":
+                    mentions.append(feature["did"])
+                elif feature["$type"] == "app.bsky.richtext.facet#tag":
+                    tags.append(feature["tag"])
+                elif feature["$type"] == "app.bsky.richtext.facet#link":
+                    links.append(feature["uri"])
+        res = {"author": author}
+        if mentions:
+            res["mentions"] = mentions
+        if tags:
+            res["tags"] = tags
+        if links:
+            res["links"] = links
+        return res
--- a/src/auto_archiver/modules/generic_extractor/dropin.py
+++ b/src/auto_archiver/modules/generic_extractor/dropin.py
@@ -0,0 +1,58 @@
+from yt_dlp.extractor.common import InfoExtractor
+from auto_archiver.core.metadata import Metadata
+from auto_archiver.archivers.archiver import Archiver
+
+class GenericDropin:
+    """Base class for dropins for the generic extractor.
+
+    In many instances, an extractor will exist in ytdlp, but it will only process videos.
+    Dropins can be created and used to make use of the already-written private code of a
+    specific extractor from ytdlp.
+
+    The dropin should be able to handle the following methods (both raise
+    NotImplementedError in this base class):
+
+    - `extract_post`: This method should be able to extract the post data from the url and return it as a dict.
+    - `create_metadata`: This method should be able to create a Metadata object from a post dict.
+
+    Optional methods include:
+
+    - `skip_ytdlp_download`: If you want to skip the ytdlp 'download' method all together, and do your own, then return True for this method.
+                             This is useful in cases where ytdlp might not work properly for all of your posts
+    - `keys_to_clean`: for the generic 'video_data' created by ytdlp (for video URLs), any additional fields you would like to clean out of the data before storing in metadata
+    - `download_additional_media`: attach any extra media to the metadata of a video; the default returns the metadata unchanged
+
+
+    """
+
+    def extract_post(self, url: str, ie_instance: InfoExtractor) -> dict:
+        """
+        This method should return the post data from the url.
+
+        Must be overridden; the base implementation always raises NotImplementedError.
+        """
+        raise NotImplementedError("This method should be implemented in the subclass")
+    
+
+    def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
+        """
+        This method should create a Metadata object from the post data.
+
+        Must be overridden; the base implementation always raises NotImplementedError.
+        """
+        raise NotImplementedError("This method should be implemented in the subclass")
+    
+
+    def skip_ytdlp_download(self, url: str, ie_instance: InfoExtractor) -> bool:
+        """
+        This method should return True if you want to skip the ytdlp download method.
+
+        The default is False (let ytdlp try first).
+        """
+        return False
+    
+    def keys_to_clean(self, video_data: dict, info_extractor: InfoExtractor) -> list[str]:
+        """
+        This method should return a list of strings (keys) to clean from the video_data dict.
+
+        E.g. ["uploader", "uploader_id", "tiktok_specific_field"]
+
+        The default is an empty list (nothing extra to clean).
+        """
+        return []
+    
+    def download_additional_media(self, video_data: dict, info_extractor: InfoExtractor, metadata: Metadata) -> Metadata:
+        """
+        This method should download any additional media from the post.
+
+        The default returns `metadata` unchanged.
+        """
+        return metadata
--- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py
+++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py
@@ -0,0 +1,297 @@
+import datetime
+import importlib
+import importlib.util
+import os
+from typing import Generator, Type
+
+import pysubs2
+import yt_dlp
+from loguru import logger
+from yt_dlp.extractor.common import InfoExtractor
+
+from auto_archiver.archivers.archiver import Archiver
+from ...core import Metadata, Media, ArchivingContext
+
+class GenericExtractor(Archiver):
+    name = "youtubedl_archiver" #left as is for backwards compat
+    _dropins = {}
+
+    def __init__(self, config: dict) -> None:
+        """
+        Initialises the extractor and coerces its on/off options to real booleans.
+
+        The option attributes are read right after `super().__init__(config)`, so they are
+        presumably populated from `config` by the parent class -- confirm in Archiver.
+        """
+        super().__init__(config)
+        boolean_options = (
+            "subtitles",
+            "comments",
+            "livestreams",
+            "live_from_start",
+            "end_means_success",
+            "allow_playlist",
+        )
+        for option in boolean_options:
+            setattr(self, option, bool(getattr(self, option)))
+        # self-assignment kept as in the original; it changes nothing for a plain attribute
+        self.max_downloads = self.max_downloads
+
+
+    def suitable_extractors(self, url: str) -> Generator[Type[InfoExtractor], None, None]:
+        """
+        Lazily yields every yt-dlp extractor that both matches `url` and reports itself as working.
+
+        This is a generator, not a list: extractors are produced one at a time, so callers
+        can stop at the first one that succeeds. The yielded objects are later called with
+        `downloader=...` to build instances, i.e. they are used as extractor classes.
+        """
+        for info_extractor in yt_dlp.YoutubeDL()._ies.values():
+            if info_extractor.suitable(url) and info_extractor.working():
+                yield info_extractor
+        
+    def suitable(self, url: str) -> bool:
+        """
+        Tells whether at least one working ytdlp extractor accepts the URL.
+
+        NOTE(review): an earlier docstring said this returns False for the GenericIE
+        ('Generic downloader that works on some sites'), but nothing here filters it out.
+        """
+        for candidate in self.suitable_extractors(url):
+            if candidate:
+                return True
+        return False
+    
+    def download_additional_media(self, video_data: dict, info_extractor: InfoExtractor, metadata: Metadata) -> Metadata:
+        """
+        Attaches the cover image to `metadata` and then lets the matching dropin (if any)
+        add further media of its own.
+
+        A failed thumbnail download is logged and otherwise ignored.
+        """
+
+        # Only the main thumbnail is fetched; the full set lives in
+        # video_data['thumbnails'] should it ever be needed
+        thumbnail_url = video_data.get('thumbnail')
+        if thumbnail_url:
+            try:
+                cover = Media(self.download_from_url(thumbnail_url))
+                metadata.add_media(cover, id="cover")
+            except Exception as e:
+                logger.error(f"Error downloading cover image {thumbnail_url}: {e}")
+
+        extractor_dropin = self.dropin_for_name(info_extractor.ie_key())
+        if not extractor_dropin:
+            return metadata
+
+        try:
+            return extractor_dropin.download_additional_media(video_data, info_extractor, metadata)
+        except AttributeError:
+            # an AttributeError here (e.g. a dropin without this optional hook) is ignored
+            return metadata
+
+    def keys_to_clean(self, info_extractor: InfoExtractor, video_data: dict) -> list[str]:
+        """
+        Returns the list of keys to strip from the ytdlp generic video data, to make it more
+        readable and remove unnecessary keys that ytdlp adds.
+
+        The list is the built-in base set plus whatever the matching dropin's
+        `keys_to_clean` contributes.
+        """
+
+        base_keys = ['formats', 'thumbnail', 'display_id', 'epoch', 'requested_downloads',
+                     'duration_string', 'thumbnails', 'http_headers', 'webpage_url_basename', 'webpage_url_domain',
+                     'extractor', 'extractor_key', 'playlist', 'playlist_index', 'protocol', 'requested_subtitles',
+                     'format_id', 'acodec', 'vcodec', 'ext', '_has_drm', 'filesize', 'audio_ext', 'video_ext', 'vbr', 'abr',
+                     'resolution', 'dynamic_range', 'aspect_ratio', 'cookies', 'format', 'quality', 'preference', 'artists',
+                     'channel_id', 'subtitles', 'tbr', 'url', 'original_url', 'automatic_captions', 'playable_in_embed', 'live_status',
+                     '_format_sort_fields', 'chapters', 'requested_formats', 'format_note',
+                     'audio_channels', 'asr', 'fps', 'was_live', 'is_live', 'heatmap', 'age_limit', 'stretched_ratio']
+
+        dropin = self.dropin_for_name(info_extractor.ie_key())
+        if dropin:
+            try:
+                base_keys += dropin.keys_to_clean(video_data, info_extractor)
+            except AttributeError:
+                # an AttributeError here (e.g. a dropin without this optional hook) is ignored
+                pass
+
+        return base_keys
+    
+    def add_metadata(self, video_data: dict, info_extractor: InfoExtractor, url:str, result: Metadata) -> Metadata:
+        """
+        Fills `result` with the information found in the given video_data.
+
+        Order matters: media is attached first, then title/url, comments (if enabled),
+        timestamp and upload date, and finally every remaining truthy key that is not in
+        `keys_to_clean`. `video_data` is consumed (keys are popped) along the way.
+        """
+
+        # first add the media
+        result = self.download_additional_media(video_data, info_extractor, result)
+
+        # prefer 'title', falling back to 'fulltitle'; both are removed from video_data
+        # so neither is stored a second time by the catch-all loop below
+        title = video_data.pop('title', None)
+        fulltitle = video_data.pop('fulltitle', "")
+        result.set_title(title if title is not None else fulltitle)
+        result.set_url(url)
+
+        # extract comments if enabled
+        if self.comments:
+            # pop (not get) so the raw list does not overwrite the formatted one
+            # in the catch-all loop below
+            raw_comments = video_data.pop("comments", None) or []
+            formatted_comments = []
+            for c in raw_comments:
+                comment_ts = c.get("timestamp")
+                formatted_comments.append({
+                    "text": c.get("text"),
+                    "author": c.get("author"),
+                    # a comment without a timestamp keeps None instead of raising
+                    "timestamp": (
+                        datetime.datetime.fromtimestamp(comment_ts, tz=datetime.timezone.utc)
+                        if comment_ts is not None else None
+                    ),
+                })
+            result.set("comments", formatted_comments)
+
+        # then add the common metadata
+        if timestamp := video_data.pop("timestamp", None):
+            timestamp = datetime.datetime.fromtimestamp(timestamp, tz = datetime.timezone.utc).isoformat()
+            result.set_timestamp(timestamp)
+        if upload_date := video_data.pop("upload_date", None):
+            upload_date = datetime.datetime.strptime(upload_date, '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
+            result.set("upload_date", upload_date)
+
+        # then clean away any keys we don't want
+        for clean_key in self.keys_to_clean(info_extractor, video_data):
+            video_data.pop(clean_key, None)
+
+        # then add the rest of the video data
+        for k, v in video_data.items():
+            if v:
+                result.set(k, v)
+
+        return result
+
+    def get_metadata_for_post(self, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
+        """
+        Builds post metadata through the dropin registered for this extractor.
+
+        The extractor class is instantiated with `ydl` as its downloader; the dropin then
+        fetches the post (`extract_post`) and converts it (`create_metadata`).
+        Returns False when no dropin exists for the extractor.
+        """
+
+        extractor = info_extractor(downloader=ydl)
+        post_dropin = self.dropin_for_name(info_extractor.ie_key())
+        if post_dropin:
+            post = post_dropin.extract_post(url, extractor)
+            return post_dropin.create_metadata(post, extractor, self, url)
+
+        # TODO: add a proper link to 'how to create your own dropin'
+        logger.debug(f"""Could not find valid dropin for {info_extractor.IE_NAME}.
+                     Why not try creating your own, and make sure it has a valid function called 'create_metadata'. Learn more: https://auto-archiver.readthedocs.io/en/latest/user_guidelines.html#""")
+        return False
+
+    def get_metadata_for_video(self, data: dict, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
+        """
+        Downloads the video(s) behind `url` with ytdlp and returns a Metadata object holding
+        one Media per downloaded entry plus the common metadata added by `add_metadata`.
+
+        The incoming `data` is replaced by the result of a fresh `extract_info(download=True)`
+        call. Returns False when a playlist-like result has no entries. An entry that fails
+        to process is logged and skipped.
+        """
+
+        # this time download
+        ydl.params['getcomments'] = self.comments
+        #TODO: for playlist or long lists of videos, how to download one at a time so they can be stored before the next one is downloaded?
+        data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=True)
+        if "entries" in data:
+            entries = data.get("entries", [])
+            if not len(entries):
+                logger.warning('YoutubeDLArchiver could not find any video')
+                return False
+        else: entries = [data]
+
+        result = Metadata()
+
+        for entry in entries:
+            try:
+                filename = ydl.prepare_filename(entry)
+                if not os.path.exists(filename):
+                    # fall back to an .mkv with the same stem; splitext only strips the final
+                    # extension, so dots in directory names or in the id are preserved
+                    filename = os.path.splitext(filename)[0] + '.mkv'
+
+                new_media = Media(filename)
+                for x in ["duration", "original_url", "fulltitle", "description", "upload_date"]:
+                    if x in entry: new_media.set(x, entry[x])
+
+                # read text from subtitles if enabled
+                if self.subtitles:
+                    # prefer the entry's own subtitles (playlist case), falling back to the
+                    # top-level dict, which is the entry itself for a single video
+                    requested_subs = entry.get('requested_subtitles') or data.get('requested_subtitles') or {}
+                    for lang, val in requested_subs.items():
+                        try:
+                            subs = pysubs2.load(val.get('filepath'), encoding="utf-8")
+                            text = " ".join([line.text for line in subs])
+                            new_media.set(f"subtitles_{lang}", text)
+                        except Exception as e:
+                            logger.error(f"Error loading subtitle file {val.get('filepath')}: {e}")
+                result.add_media(new_media)
+            except Exception as e:
+                logger.error(f"Error processing entry {entry}: {e}")
+
+        return self.add_metadata(data, info_extractor, url, result)
+    
+    def dropin_for_name(self, dropin_name: str, additional_paths=None, package=__package__):
+        """
+        Returns the dropin instance for the given extractor name, or None if there is none.
+
+        The name is lower-cased first, because callers pass `ie_key()` values while the
+        bundled dropin modules use lower-case file names (twitter.py, bluesky.py, truth.py);
+        the class looked up inside the module is the title-cased name ("twitter" -> `Twitter`).
+        Dropins are searched in `additional_paths` first (as `<path>/<name>.py`), then inside
+        `package`. Loaded instances are cached in the class-level `_dropins` dict.
+        """
+        dropin_name = dropin_name.lower()
+
+        if dropin_name == "generic":
+            # no need for a dropin for the generic extractor (?)
+            return None
+
+        if dropin_name in self._dropins:
+            return self._dropins[dropin_name]
+
+        dropin_class_name = dropin_name.title()
+
+        def _load_dropin(dropin_module):
+            # instantiate and cache the dropin class, or return None if the module lacks it
+            dropin_class = getattr(dropin_module, dropin_class_name, None)
+            if dropin_class is None:
+                logger.warning(f"Dropin module '{dropin_name}' does not define a class named '{dropin_class_name}'")
+                return None
+            return self._dropins.setdefault(dropin_name, dropin_class())
+
+        # TODO: user should be able to pass --dropins="/some/folder,/other/folder" as a cmd line option
+        # which would allow the user to override the default dropins/add their own
+        for path in (additional_paths or []):
+            dropin_path = os.path.join(path, f"{dropin_name}.py")
+            dropin_spec = importlib.util.spec_from_file_location(dropin_name, dropin_path)
+            if not dropin_spec:
+                continue
+            try:
+                dropin_module = importlib.util.module_from_spec(dropin_spec)
+                dropin_spec.loader.exec_module(dropin_module)
+            except (FileNotFoundError, ModuleNotFoundError):
+                continue
+            loaded = _load_dropin(dropin_module)
+            if loaded:
+                return loaded
+
+        # fallback to loading the dropins within auto-archiver
+        try:
+            return _load_dropin(importlib.import_module(f".{dropin_name}", package=package))
+        except ModuleNotFoundError:
+            return None
+
+    def download_for_extractor(self, info_extractor: InfoExtractor, url: str, ydl: yt_dlp.YoutubeDL) -> Metadata:
+        """
+        Tries to download the given url using the specified extractor
+
+        It first tries to use ytdlp directly to download the video. If that raises (or the
+        dropin asks to skip ytdlp), it falls back to the dropin-based post extraction in
+        `get_metadata_for_post`.
+
+        Returns the resulting Metadata, or False when nothing could be archived or when a
+        livestream is found while the 'livestreams' option is off.
+        """
+        # when getting info without download, we also don't need the comments
+        ydl.params['getcomments'] = False
+        result = False
+
+        dropin_submodule = self.dropin_for_name(info_extractor.ie_key())
+
+        try:
+            # argument order follows GenericDropin.skip_ytdlp_download(url, ie_instance)
+            if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor):
+                # raising routes control to the post-extraction fallback below
+                raise Exception(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}")
+
+            # don't download since it can be a live stream
+            data = ydl.extract_info(url, ie_key=info_extractor.ie_key(), download=False)
+            if data.get('is_live', False) and not self.livestreams:
+                logger.warning("Livestream detected, skipping due to 'livestreams' configuration setting")
+                return False
+            # it's a valid video, that the youtubdedl can download out of the box
+            result = self.get_metadata_for_video(data, info_extractor, url, ydl)
+
+        except Exception as e:
+            logger.debug(f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use extractor to get post data instead')
+            try:
+                result = self.get_metadata_for_post(info_extractor, url, ydl)
+            except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as post_e:
+                logger.error(f'Error downloading metadata for post: {post_e}')
+                return False
+            except Exception as generic_e:
+                # loguru attaches the traceback via opt(exception=True); passing exc_info=True
+                # would instead str.format() the message, which can fail on braces in the repr
+                logger.opt(exception=True).debug(f'Attempt to extract using ytdlp extractor "{info_extractor.IE_NAME}" failed:  \n  {repr(generic_e)}')
+                return False
+
+        if result:
+            extractor_name = "yt-dlp"
+            if info_extractor:
+                extractor_name += f"_{info_extractor.ie_key()}"
+
+            if self.end_means_success:
+                result.success(extractor_name)
+            else:
+                result.status = extractor_name
+
+        return result
+
+    def download(self, item: Metadata) -> Metadata:
+        """
+        Entry point: tries every suitable ytdlp extractor for the item's URL and returns the
+        first successful result, or False if none succeeds.
+        """
+        url = item.get_url()
+
+        if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:
+            logger.debug('Using Facebook cookie')
+            # NOTE: this writes to yt-dlp's module-level default headers
+            yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
+
+        ydl_options = {
+            'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), '%(id)s.%(ext)s'),
+            'quiet': False,
+            'noplaylist': not self.allow_playlist,
+            'writesubtitles': self.subtitles,
+            'writeautomaticsub': self.subtitles,
+            "live_from_start": self.live_from_start,
+            "proxy": self.proxy,
+            "max_downloads": self.max_downloads,
+            "playlistend": self.max_downloads,
+        }
+
+        if item.netloc in ['youtube.com', 'www.youtube.com']:
+            # browser cookies take precedence over a cookie file
+            if self.cookies_from_browser:
+                logger.debug(f'Extracting cookies from browser {self.cookies_from_browser} for Youtube')
+                ydl_options['cookiesfrombrowser'] = (self.cookies_from_browser,)
+            elif self.cookie_file:
+                logger.debug(f'Using cookies from file {self.cookie_file}')
+                ydl_options['cookiefile'] = self.cookie_file
+
+        # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
+        ydl = yt_dlp.YoutubeDL(ydl_options)
+
+        for candidate in self.suitable_extractors(url):
+            if outcome := self.download_for_extractor(candidate, url, ydl):
+                return outcome
+
+        return False
--- a/src/auto_archiver/modules/generic_extractor/truth.py
+++ b/src/auto_archiver/modules/generic_extractor/truth.py
@@ -0,0 +1,52 @@
+from typing import Type
+
+from auto_archiver.utils import traverse_obj
+from auto_archiver.core.metadata import Metadata, Media
+from auto_archiver.archivers.archiver import Archiver
+from yt_dlp.extractor.common import InfoExtractor
+
+from dateutil.parser import parse as parse_dt
+
+from .dropin import GenericDropin
+
+class Truth(GenericDropin):
+    """Dropin for Truth Social posts, fetched through the site's `/api/v1/statuses/<id>` endpoint."""
+
+    def extract_post(self, url, ie_instance: InfoExtractor) -> dict:
+        """Downloads the status JSON for the post id that the extractor parses out of `url`."""
+        video_id = ie_instance._match_id(url)
+        truthsocial_url = f'https://truthsocial.com/api/v1/statuses/{video_id}'
+        return ie_instance._download_json(truthsocial_url, video_id)
+
+    def skip_ytdlp_download(self, url, ie_instance: Type[InfoExtractor]) -> bool:
+        """Always True: this dropin asks the generic extractor to skip the ytdlp download."""
+        return True
+
+    def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
+        """
+        Creates metadata from a truth social post, including any `media_attachments`.
+
+        Abridged example of the expected `post` format:
+
+        {'id': '109598702184774628', 'created_at': '2022-12-29T19:51:18.161Z',
+         'in_reply_to_id': None, 'in_reply_to_account_id': None, 'language': 'en',
+         'url': 'https://truthsocial.com/@bbcnewa/109598702184774628',
+         'content': '<p>Pele, regarded by many as football\'s greatest ever player, ...</p>',
+         'account': {'id': '107905163010312793', 'username': 'bbcnewa', 'display_name': 'BBC News',
+                     'followers_count': 1131, 'following_count': 3, 'statuses_count': 9, ...},
+         'media_attachments': [], 'mentions': [], 'tags': [],
+         'replies_count': 1, 'reblogs_count': 0, 'favourites_count': 2, ...}
+        """
+
+        result = Metadata()
+        result.set_url(url)
+        timestamp = post['created_at'] # format is 2022-12-29T19:51:18.161Z
+        result.set_timestamp(parse_dt(timestamp))
+        result.set('description', post['content'])
+        result.set('author', post['account']['username'])
+
+        # plain strings are top-level keys; tuples are paths into nested dicts and are
+        # stored under the space-joined path (e.g. "account followers_count")
+        keys = [
+            'replies_count', 'reblogs_count', 'favourites_count',
+            ('account', 'followers_count'), ('account', 'following_count'),
+            ('account', 'statuses_count'), ('account', 'display_name'),
+            'language', 'in_reply_to_account_id',
+        ]
+        for key in keys:
+            store_key = " ".join(key) if isinstance(key, tuple) else key
+            result.set(store_key, traverse_obj(post, key))
+
+        # add the media
+        for media in post.get('media_attachments', []):
+            filename = archiver.download_from_url(media['url'])
+            result.add_media(Media(filename), id=media.get('id'))
+
+        return result
--- a/src/auto_archiver/modules/generic_extractor/twitter.py
+++ b/src/auto_archiver/modules/generic_extractor/twitter.py
@@ -0,0 +1,70 @@
+import re, mimetypes, json
+from datetime import datetime
+
+from loguru import logger
+from slugify import slugify
+
+from auto_archiver.core.metadata import Metadata, Media
+from auto_archiver.utils import UrlUtil
+from auto_archiver.archivers.archiver import Archiver
+
+from .dropin import GenericDropin, InfoExtractor
+
+class Twitter(GenericDropin):
+    """Dropin that archives a tweet's text and its photo/video/gif media."""
+
+    def choose_variant(self, variants):
+        """
+        Picks the variant to download from a tweet's `video_info.variants` list.
+
+        Among "video/mp4" variants, one replaces the current pick when the WxH found in its
+        url is larger in width or height than the best seen so far. A non-mp4 variant is
+        only used if nothing has been picked before it. Returns None for an empty list.
+        """
+        # choosing the highest quality possible
+        variant, width, height = None, 0, 0
+        for var in variants:
+            if var.get("content_type", "") == "video/mp4":
+                width_height = re.search(r"\/(\d+)x(\d+)\/", var["url"])
+                if width_height:
+                    w, h = int(width_height[1]), int(width_height[2])
+                    if w > width or h > height:
+                        width, height = w, h
+                        variant = var
+            else:
+                variant = var if not variant else variant
+        return variant
+
+    def extract_post(self, url: str, ie_instance: InfoExtractor):
+        """Returns the tweet data from the extractor's `_extract_status` for the id in `url`."""
+        twid = ie_instance._match_valid_url(url).group('id')
+        return ie_instance._extract_status(twid=twid)
+
+    def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
+        """
+        Builds the Metadata for a tweet: title, raw JSON content, timestamp and media.
+
+        Returns False when the tweet has no user/created_at or the date cannot be parsed.
+        Media entries of an unsupported type, or videos without a usable variant, are
+        skipped with a debug log instead of attempting a download with no URL.
+        """
+        result = Metadata()
+        try:
+            if not tweet.get("user") or not tweet.get("created_at"):
+                raise ValueError(f"Error retreiving post. Are you sure it exists?")
+            timestamp = datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
+        except (ValueError, KeyError) as ex:
+            logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
+            return False
+
+        result\
+            .set_title(tweet.get('full_text', ''))\
+            .set_content(json.dumps(tweet, ensure_ascii=False))\
+            .set_timestamp(timestamp)
+        if not tweet.get("entities", {}).get("media"):
+            logger.debug('No media found, archiving tweet text only')
+            result.status = "twitter-ytdl"
+            return result
+        for i, tw_media in enumerate(tweet["entities"]["media"]):
+            media_type = tw_media["type"]
+            if media_type == "photo":
+                src = UrlUtil.twitter_best_quality_url(tw_media['media_url_https'])
+                mimetype = "image/jpeg"
+            elif media_type == "video":
+                variant = self.choose_variant(tw_media['video_info']['variants'])
+                if not variant:
+                    logger.debug(f"No downloadable variant for video {i} of {url}, skipping")
+                    continue
+                src, mimetype = variant['url'], variant['content_type']
+            elif media_type == "animated_gif":
+                variant = tw_media['video_info']['variants'][0]
+                src, mimetype = variant['url'], variant['content_type']
+            else:
+                logger.debug(f"Unsupported media type '{media_type}' in {url}, skipping")
+                continue
+
+            media = Media(filename="")
+            media.set("src", src)
+            # guess_extension returns None for unknown types; fall back to no extension
+            # rather than a literal "None" suffix
+            ext = mimetypes.guess_extension(mimetype) or ""
+            media.filename = archiver.download_from_url(src, f'{slugify(url)}_{i}{ext}')
+            result.add_media(media)
+        return result