removed broken scrapers and added basic README

2026-06-07 19:08:35 +03:00 · 2023-08-04 09:15:53 -05:00
parent ef9292bc90
commit 30bb4e43e4
23 changed files with 12 additions and 1003 deletions
--- a/README.md
+++ b/README.md
@@ -1,4 +1,10 @@
 Cisticola
 ==========

+The *cisticola* application enables users to easily collect, process, and analyze large-scale data from several social media platforms.
+
+It scrapes raw data by coordinating with a set of platform-specific scrapers, archives media attachments, and stores the data in a SQL database.
+
+For more information about the structure of Cisticola, as well as installation and deployment instructions, see the documentation.
+
 ![Cisticola, the bird](docs/images/cisticola.jpeg)
--- a/cisticola/scraper/init.py
+++ b/cisticola/scraper/init.py
@@ -1,12 +1,6 @@
 from cisticola.utils import make_request
 from .base import Scraper, ScraperController, ChannelDoesNotExistError
 from .bitchute import BitchuteScraper
-from .gab import GabScraper 
 from .gettr import GettrScraper
-from .instagram import InstagramScraper
-from .odysee import OdyseeScraper
 from .rumble import RumbleScraper
-from .telegram_telethon import TelegramTelethonScraper
-from .twitter import TwitterScraper
-from .vkontakte import VkontakteScraper
-from .youtube import YoutubeScraper
+from .telegram_telethon import TelegramTelethonScraper
--- a/cisticola/scraper/gab.py
+++ b/cisticola/scraper/gab.py
@@ -1,108 +0,0 @@
-from datetime import datetime, timezone, date
-import json
-from typing import Generator
-import os 
-from loguru import logger
-
-from gabber.client import Client, GAB_API_BASE_URL
-
-from cisticola.base import Channel, ScraperResult, RawChannelInfo
-from cisticola.scraper.base import Scraper
-
-class GabScraper(Scraper):
-    """An implementation of a Scraper for Gab, using gabber library"""
-    __version__ = "GabScraper 0.0.0"
-
-    def get_username_from_url(self, url):
-        username = url.split('https://gab.com/')[-1]
-
-        return username
-
-    def get_group_id_from_url(self, url):
-        group_id = int(url.split('/')[-1])
-
-        return group_id
-
-    @logger.catch
-    def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
-        client = Client(
-            username = os.environ['GAB_USER'],
-            password = os.environ['GAB_PASS'],
-            threads = 25)
-
-        if channel.url.split('/')[-2] == 'groups':
-
-            group_id = self.get_group_id_from_url(url = channel.url)
-            scraper = client.pull_group_posts(
-                id = group_id,
-                depth = float('inf')) 
-        else:
-
-            username = self.get_username_from_url(channel.url)
-
-            result = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json()
-            user_id = int(result['id'])
-
-            scraper = client.pull_statuses(
-                id = user_id,
-                created_after = date.min,
-                replies = False)
-
-        for post in scraper:
-            if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
-                break
-
-            archived_urls = {}
-
-            for attachment in post.get('media_attachments'):
-                if attachment.get('type') == 'video':
-                    archived_urls[attachment['source_mp4']] = None
-                else:
-                    archived_urls[attachment['url']] = None
-                    
-            if post.get('reblog') is not None:
-                for attachment in post['reblog'].get('media_attachments'):
-                    if attachment.get('type') == 'video':
-                        archived_urls[attachment['source_mp4']] = None
-                    else:
-                        archived_urls[attachment['url']] = None
-
-            yield ScraperResult(
-                scraper=self.__version__,
-                platform="Gab",
-                channel=channel.id,
-                platform_id=post['id'],
-                date=datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc),
-                date_archived=datetime.now(timezone.utc),
-                raw_data=json.dumps(post),
-                archived_urls=archived_urls,
-                media_archived=None)
-
-    def can_handle(self, channel: Channel) -> bool:
-        if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None:
-            return True
-
-    @logger.catch
-    def get_profile(self, channel: Channel) -> RawChannelInfo:
-
-        client = Client(
-            username = os.environ['GAB_USER'],
-            password = os.environ['GAB_PASS'],
-            threads = 25)
-
-        if channel.url.split('/')[-2] == 'groups':
-
-            group_id = self.get_group_id_from_url(url = channel.url)
-            profile = client.pull_group(id = group_id)
-        
-        else:
-
-            username = self.get_username_from_url(channel.url)
-
-            profile = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json()
-
-        return RawChannelInfo(scraper=self.__version__,
-            platform=channel.platform,
-            channel=channel.id,
-            raw_data=json.dumps(profile),
-            date_archived=datetime.now(timezone.utc))
--- a/cisticola/scraper/instagram.py
+++ b/cisticola/scraper/instagram.py
@@ -1,126 +0,0 @@
-from typing import Generator, List
-from datetime import datetime, timezone
-import os
-import json
-import tempfile
-from pathlib import Path
-
-from loguru import logger
-import instaloader 
-
-from cisticola.base import Channel, ScraperResult, RawChannelInfo
-from cisticola.scraper.base import Scraper
-
-BASE_URL = 'https://www.instagram.com/'
-
-CONTENT_TYPES = {
-    'jpg' : 'image/jpeg',
-    'mp4' : 'video/mp4'}
-
-class InstagramScraper(Scraper):
-    """An implementation of a Scraper for Instagram, using instaloader library"""
-    __version__ = "InstagramScraper 0.0.0"
-
-    def get_username_from_url(self, url):
-        username = url.split(BASE_URL)[1].strip('/')
-        return username
-
-    @logger.catch
-    def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
-
-        username = self.get_username_from_url(channel.url)
-
-        loader = instaloader.Instaloader(
-            quiet = True,
-            download_comments = False,
-            save_metadata = False)
-            
-        loader.login(
-            user = os.environ['INSTAGRAM_USERNAME'], 
-            passwd = os.environ['INSTAGRAM_PASSWORD'])
-
-        profile = instaloader.Profile.from_username(
-            context = loader.context, 
-            username = username)
-
-        for post in profile.get_posts():
-
-            if since is not None and post.date_utc <= since.date:
-                break
-
-            post_url = f'{BASE_URL}p/{post.shortcode}/'
-
-            archived_urls = get_archived_urls_from_post(post = post)
-
-            yield ScraperResult(
-                scraper=self.__version__,
-                platform="Instagram",
-                channel=channel.id,
-                platform_id=post.mediaid,
-                date=post.date_utc,
-                date_archived=datetime.now(timezone.utc),
-                raw_data=json.dumps(post._asdict(), default=str),
-                archived_urls=archived_urls,
-                media_archived=None)
-
-            for comment in post.get_comments():
-
-                comment_dict = comment._asdict()
-                comment_dict['post_url'] = post_url 
-                comment_dict['is_comment'] = True
-
-                yield ScraperResult(
-                    scraper=self.__version__,
-                    platform="Instagram",
-                    channel=channel.id,
-                    platform_id=post.mediaid,
-                    date=comment.created_at_utc,
-                    date_archived=datetime.now(timezone.utc),
-                    raw_data=json.dumps(comment_dict, default=str),
-                    archived_urls={},
-                    media_archived=datetime.now(timezone.utc))
-
-    def can_handle(self, channel):
-        if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None:
-            return True
-
-    @logger.catch
-    def get_profile(self, channel: Channel) -> RawChannelInfo:
-
-        username = self.get_username_from_url(channel.url)
-
-        loader = instaloader.Instaloader(
-            quiet = True,
-            download_comments = False,
-            save_metadata = False)
-
-        loader.login(
-            user = os.environ['INSTAGRAM_USERNAME'], 
-            passwd = os.environ['INSTAGRAM_PASSWORD'])
-
-        user_profile = instaloader.Profile.from_username(
-            context = loader.context, 
-            username = username)
-        
-        profile = user_profile._asdict()
-        profile['followers'] = user_profile.followers
-        profile['followees'] = user_profile.followees
-
-        return RawChannelInfo(scraper=self.__version__,
-            platform=channel.platform,
-            channel=channel.id,
-            raw_data=json.dumps(profile),
-            date_archived=datetime.now(timezone.utc))
-
-def get_archived_urls_from_post(post: instaloader.Post) -> List[str]:
-    typename = post._node['__typename']
-    if typename == 'GraphImage':
-        urls = [post._node['display_url']]
-    elif typename == 'GraphVideo':
-        urls = [post._node['video_url']]
-    elif typename == 'GraphSidecar':
-        urls = [edge['node']['display_url'] for edge in post._node['edge_sidecar_to_children']['edges']]
-    else:
-        raise NotImplementedError(f'post of type {typename} is currently not supported.')
-        
-    return {url : None for url in urls}
--- a/cisticola/scraper/odysee.py
+++ b/cisticola/scraper/odysee.py
@@ -1,110 +0,0 @@
-from datetime import datetime, timezone
-import json
-from typing import Generator
-from urllib.parse import urlparse
-
-import requests
-from loguru import logger
-
-from polyphemus.base import OdyseeChannelScraper, process_raw_comment_info
-from polyphemus.api import get_auth_token, get_all_comments
-from cisticola.base import Channel, ScraperResult, RawChannelInfo
-from cisticola.scraper.base import Scraper
-
-class OdyseeScraper(Scraper):
-    """An implementation of a Scraper for Odysee, using polyphemus library"""
-    __version__ = "OdyseeScraper 0.0.0"
-
-    def __init__(self):
-        super().__init__()
-        self.auth_token = get_auth_token()
-
-    def get_username_from_url(self, url):
-
-        username = url.split('odysee.com/')[-1].strip('@').split(':')[0]
-
-        return username
-
-    @logger.catch
-    def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
-
-        username = self.get_username_from_url(channel.url)
-        scraper = OdyseeChannelScraper(channel_name = username, auth_token = self.auth_token)
-        
-        all_videos = scraper.get_all_videos()
-
-        for video in all_videos:
-            if since is not None and video.created.replace(tzinfo=timezone.utc) <= since.date:
-                break
-
-            url = video.streaming_url
-            if url is None:
-                archived_urls = {}
-            else:
-                archived_urls = {url: None}
-
-            raw_comment_info_list = get_all_comments(video_id=video.claim_id)
-            all_comments = (process_raw_comment_info(raw_comment_info) for raw_comment_info in raw_comment_info_list)
-
-            yield ScraperResult(
-                scraper=self.__version__,
-                platform="Odysee",
-                channel=channel.id,
-                platform_id=video.claim_id,
-                date=video.created.replace(tzinfo=timezone.utc),
-                date_archived=datetime.now(timezone.utc),
-                raw_data=json.dumps(video.__dict__, default = str),
-                archived_urls=archived_urls,
-                media_archived=None)
-
-            for comment in all_comments:
-
-                yield ScraperResult(
-                    scraper=self.__version__,
-                    platform="Odysee",
-                    channel=channel.id,
-                    platform_id=comment.claim_id,
-                    date=comment.created.replace(tzinfo=timezone.utc),
-                    date_archived=datetime.now(),
-                    raw_data=json.dumps(comment.__dict__, default = str),
-                    archived_urls={},
-                    media_archived=datetime.now(timezone.utc))
-
-    @logger.catch
-    def archive_files(self, result: ScraperResult) -> ScraperResult:
-        for url in result.archived_urls:
-            if result.archived_urls[url] is None:
-                r = requests.head(url)
-                if r.headers['Content-Type'] == 'text/html; charset=utf-8':
-                    media_blob, content_type, key = self.m3u8_url_to_blob(url)
-                else:
-                    media_blob, content_type, key = self.url_to_blob(url)
-
-                archived_url = self.archive_blob(media_blob, content_type, key)
-                result.archived_urls[url] = archived_url
-
-        result.media_archived = datetime.now(timezone.utc)
-        return result
-
-    def can_handle(self, channel):
-        if channel.platform == "Odysee" and self.get_username_from_url(channel.url) is not None:
-            return True
-
-    def url_to_key(self, url: str, content_type: str) -> str:
-        key = urlparse(url).path.split('/')[-2]
-        ext = content_type.split('/')[-1]
-
-        return f'{key}.{ext}'
-
-    @logger.catch
-    def get_profile(self, channel: Channel) -> RawChannelInfo:
-
-        username = self.get_username_from_url(channel.url)
-        scraper = OdyseeChannelScraper(channel_name = username, auth_token = self.auth_token)
-        profile = scraper.get_entity().__dict__
-
-        return RawChannelInfo(scraper=self.__version__,
-            platform=channel.platform,
-            channel=channel.id,
-            raw_data=json.dumps(profile, default = str),
-            date_archived=datetime.now(timezone.utc))
--- a/cisticola/scraper/twitter.py
+++ b/cisticola/scraper/twitter.py
@@ -1,108 +0,0 @@
-from datetime import datetime, timezone
-from typing import Generator
-from urllib.parse import urlparse, parse_qs
-from snscrape.modules.twitter import TwitterProfileScraper, TwitterUserScraper, Video, Gif, Photo
-from loguru import logger
-import json
-
-from cisticola.base import Channel, ScraperResult, RawChannelInfo
-from cisticola.scraper.base import Scraper, ChannelDoesNotExistError
-
-class TwitterScraper(Scraper):
-    """An implementation of a Scraper for Twitter, using snscrape library"""
-    __version__ = "TwitterScraper 0.0.0"
-
-    @logger.catch
-    def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
-        if channel.platform_id:
-            identifier = int(channel.platform_id)
-        else:
-            identifier = channel.screenname
-
-        scraper = TwitterProfileScraper(identifier)
-
-        first = True
-
-        for tweet in scraper.get_items():
-            if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
-                # with TwitterProfileScraper, the first tweet could be an old pinned tweet
-                if first:
-                    first = False
-                    continue
-                else:
-                    break
-
-            archived_urls = {}
-
-            media_list = []
-            if tweet.media:
-                media_list += tweet.media
-
-            if tweet.retweetedTweet and hasattr(tweet.retweetedTweet, 'media') and tweet.retweetedTweet.media:
-                media_list += tweet.retweetedTweet.media
-
-            if tweet.quotedTweet and hasattr(tweet.quotedTweet, 'media') and tweet.quotedTweet.media:
-                media_list += tweet.quotedTweet.media
-
-            for media in media_list:
-                if type(media) == Video:
-                    variant = max(
-                        [v for v in media.variants if v.bitrate], key=lambda v: v.bitrate)
-                    url = variant.url
-                elif type(media) == Gif:
-                    url = media.variants[0].url
-                elif type(media) == Photo:
-                    url = media.fullUrl
-                else:
-                    logger.warning(f"Could not get media URL of {media}")
-                    url = None
-
-                if url is not None and url not in archived_urls:
-                    archived_urls[url] = None
-
-            yield ScraperResult(
-                scraper=self.__version__,
-                platform="Twitter",
-                channel=channel.id,
-                platform_id=tweet.id,
-                date=tweet.date,
-                date_archived=datetime.now(timezone.utc),
-                raw_data=tweet.json(),
-                archived_urls=archived_urls,
-                media_archived=None)
-
-    def can_handle(self, channel):
-        if channel.platform == "Twitter" and (channel.platform_id or channel.screenname):
-            return True
-
-    def url_to_key(self, url: str, content_type: str) -> str:
-        parsed_url = urlparse(url)
-        queries = parse_qs(parsed_url.query)
-
-        ext = ''
-
-        # TODO might require additional statements for other media formats
-        if 'jpg' in queries.get('format', []):
-            ext = '.jpg'
-        elif 'png' in queries.get('format', []):
-            ext = '.png'
-        elif parsed_url.path.endswith('.mp4'):
-            ext = ''
-
-        key = parsed_url.path.split('/')[-1] + ext
-        return key 
-
-    @logger.catch
-    def get_profile(self, channel: Channel) -> RawChannelInfo:
-
-        scraper = TwitterUserScraper(channel.screenname)
-        entity = scraper._get_entity()
-
-        if entity is None:
-            raise ChannelDoesNotExistError(channel.url)
-        else:   
-            return RawChannelInfo(scraper=self.__version__,
-                platform=channel.platform,
-                channel=channel.id,
-                raw_data=json.dumps(entity.__dict__, default=str),
-                date_archived=datetime.now(timezone.utc))
--- a/cisticola/scraper/vkontakte.py
+++ b/cisticola/scraper/vkontakte.py
@@ -1,107 +0,0 @@
-from datetime import datetime, timezone
-from typing import Generator
-from urllib.parse import urlparse
-import json 
-import re 
-
-from snscrape.modules.vkontakte import VKontakteUserScraper
-from loguru import logger
-from yt_dlp.extractor.vk import VKIE
-
-from cisticola.base import Channel, ScraperResult, RawChannelInfo
-from cisticola.scraper.base import Scraper
-
-class VkontakteScraper(Scraper):
-    """An implementation of a Scraper for Vkontakte, using snscrape library"""
-    __version__ = "VkontakteScraper 0.0.1"
-
-    def get_username_from_url(self, url):
-        username = url.split('https://vk.com/')[1]
-
-        return username
-
-    @logger.catch
-    def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
-
-        username = self.get_username_from_url(channel.url)
-        scraper = VKontakteUserScraper(username)
-
-        first = True
-
-        for post in scraper.get_items():
-            if since is not None and datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
-                # with VKontakteUserScraper, the first tweet could be an old pinned tweet
-                if first:
-                    first = False
-                    continue
-                else:
-                    break
-
-            archived_urls = {}
-
-            if post.photos:
-
-                for photo in post.photos:
-                    variant = max(
-                        [v for v in photo.variants], key=lambda v: v.width * v.height)
-                    url = variant.url
-                    if url is not None:
-                        archived_urls[url] = None
-
-            if post.video:
-                archived_urls[post.video.url] = None
-
-            yield ScraperResult(
-                scraper=self.__version__,
-                platform="VK",
-                channel=channel.id,
-                platform_id=post.url.split('/')[-1],
-                date=datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc),
-                date_archived=datetime.now(timezone.utc),
-                raw_data=post.json(),
-                archived_urls=archived_urls,
-                media_archived=None)
-
-    @logger.catch
-    def archive_files(self, result: ScraperResult) -> ScraperResult:
-        for url in result.archived_urls:
-            if result.archived_urls[url] is None:
-                if re.match(VKIE._VALID_URL, url):
-                    # Uses regex from yt_dlp to verify VK video URL
-                    media_blob, content_type, key = self.ytdlp_url_to_blob(url)
-                else:
-                    media_blob, content_type, key = self.url_to_blob(url)
-                archived_url = self.archive_blob(media_blob, content_type, key)
-                result.archived_urls[url] = archived_url
-
-        result.media_archived = datetime.now(timezone.utc)
-        return result
-
-
-    def can_handle(self, channel):
-        if channel.platform == "VK":
-            return True
-
-    def url_to_key(self, url: str, content_type: str) -> str:
-        path = urlparse(url).path
-        if path.endswith('.jpg'):
-            key = '_'.join(path.split('/')[-2:])
-        else:
-            ext = '.mp4'
-            key = path.split('/')[-1] + ext
-            
-        return key
-
-    @logger.catch
-    def get_profile(self, channel: Channel) -> RawChannelInfo:
-
-        username = self.get_username_from_url(channel.url)
-        scraper = VKontakteUserScraper(username)
-        
-        profile = scraper._get_entity().__dict__
-
-        return RawChannelInfo(scraper=self.__version__,
-            platform=channel.platform,
-            channel=channel.id,
-            raw_data=json.dumps(profile),
-            date_archived=datetime.now(timezone.utc))
--- a/cisticola/scraper/youtube.py
+++ b/cisticola/scraper/youtube.py
@@ -1,154 +0,0 @@
-from datetime import datetime, timezone
-import json
-from typing import Generator
-import tempfile
-from pathlib import Path
-import os
-
-import yt_dlp
-from loguru import logger
-
-from cisticola.base import Channel, ScraperResult, RawChannelInfo
-from cisticola.scraper import Scraper
-
-class YoutubeScraper(Scraper):
-    """An implementation of a Scraper for Youtube, using youtube-dl"""
-    __version__ = "YoutubeScraper 0.0.1"
-
-    cookiestring = os.environ["YOUTUBE_COOKIESTRING"].replace(r'\n', '\n').replace(r'\t', '\t')
-    cookiefilename = 'cookiefile.txt'
-
-    @logger.catch
-    def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
-
-        content_type = 'video/mp4'
-
-        if since is None:
-            since_date = datetime.min
-            start_date = None
-        else:
-            since_date = since.date
-            start_date = since.date.strftime('%Y%m%d')
-
-        with tempfile.TemporaryDirectory() as temp_dir:
-
-            cookiefile = Path(temp_dir)/self.cookiefilename
-            with open(cookiefile, 'w') as f:
-                f.write(self.cookiestring)
-
-            daterange = yt_dlp.utils.DateRange(start = start_date)
-
-            ydl_opts = {
-                "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
-                "merge_output_format": "mp4",
-                "outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
-                "daterange" : daterange,
-                "quiet": True,
-                "verbose": False,
-                "retries": 5,
-                "cookiefile": cookiefile}
-
-            ydl = yt_dlp.YoutubeDL(ydl_opts)
-
-            try:
-                meta = ydl.extract_info(
-                    channel.url,
-                    download=False)
-            except yt_dlp.utils.DownloadError as e:
-                raise e
-            else:
-                videos = meta['entries']
-                valid_videos = [video for video in videos if since_date < datetime.strptime(video['upload_date'], '%Y%m%d')]
-                        
-                for video in valid_videos:
-
-                    url = video['webpage_url']
-
-                    archived_urls = {url: None}
-                    
-                    video_id = video["id"]
-
-                    yield ScraperResult(
-                        scraper=self.__version__,
-                        platform="Youtube",
-                        channel=channel.id,
-                        platform_id=video_id,
-                        date=datetime.strptime(video['upload_date'], '%Y%m%d').replace(tzinfo=timezone.utc),
-                        date_archived=datetime.now(timezone.utc),
-                        raw_data=json.dumps(video, default = str),
-                        archived_urls=archived_urls,
-                        media_archived=None)
-                        
-    def can_handle(self, channel):
-        if channel.platform == "Youtube" and channel.url:
-            return True
-
-    @logger.catch
-    def archive_files(self, result: ScraperResult) -> ScraperResult:
-        for url in result.archived_urls:
-            if result.archived_urls[url] is None:
-
-                media_blob = None
-
-                with tempfile.TemporaryDirectory() as temp_dir:
-
-                    cookiefile = Path(temp_dir)/self.cookiefilename
-                    with open(cookiefile, 'w') as f:
-                        f.write(self.cookiestring)
-
-                    ydl_opts = {
-                        "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
-                        "merge_output_format": "mp4",
-                        "outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
-                        "quiet": True,
-                        "verbose": False,
-                        "retries": 5,
-                        "cookiefile": cookiefile}
-
-                    ydl = yt_dlp.YoutubeDL(ydl_opts)
-
-                    try:
-                        ydl.download(url)
-                    except yt_dlp.utils.DownloadError as e:
-                        raise e
-                        
-                    files = [file for file in os.listdir(temp_dir) if file != self.cookiefilename]
-                    if len(files) != 1:
-                        logger.warning(f'{len(files)} files downloaded for video: {url}')
-                    key = files[0]
-                    with open(Path(temp_dir, key), 'rb') as f:
-                        media_blob = f.read()
-
-                if media_blob is not None:
-                    content_type = 'video/mp4'            
-                    archived_url = self.archive_blob(media_blob, content_type, key)
-                    result.archived_urls[url] = archived_url
-
-        result.media_archived = datetime.now(timezone.utc)
-        return result
-
-    @logger.catch
-    def get_profile(self, channel: Channel) -> RawChannelInfo:
-        
-        ydl_opts = {
-            "quiet": True,
-            "verbose": False,
-            "retries": 5}
-
-        ydl = yt_dlp.YoutubeDL(ydl_opts)
-
-        meta = None
-        try:
-            meta = ydl.extract_info(
-                channel.url,
-                process=False)
-            meta.pop('entries')
-
-            return RawChannelInfo(scraper=self.__version__,
-                platform=channel.platform,
-                channel=channel.id,
-                raw_data=json.dumps(meta),
-                date_archived=datetime.now(timezone.utc))
-
-        except yt_dlp.utils.DownloadError as e:
-            raise e
--- a/cisticola/transformer/init.py
+++ b/cisticola/transformer/init.py
@@ -1,7 +1,5 @@
 from .base import ETLController
-from .twitter import TwitterTransformer
 from .bitchute import BitchuteTransformer
 from .telegram_telethon import TelegramTelethonTransformer
 from .rumble import RumbleTransformer
-from .gettr import GettrTransformer
-from .vkontakte import VkontakteTransformer
+from .gettr import GettrTransformer
--- a/cisticola/transformer/twitter.py
+++ b/cisticola/transformer/twitter.py
@@ -1,137 +0,0 @@
-import json
-from loguru import logger
-from typing import Generator, Union, Callable
-import dateutil.parser
-from datetime import datetime, timezone
-
-from cisticola.transformer.base import Transformer 
-from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
-
-class TwitterTransformer(Transformer):
-    """A Twitter specific ScraperResult, with a method ETL/transforming"""
-
-    __version__ = "TwitterTransformer 0.0.1"
-
-    def can_handle(self, data: ScraperResult) -> bool:
-        scraper = data.scraper.split(' ')
-        if scraper[0] == "TwitterScraper":
-            return True
-
-        return False        
-
-    def process_media(self, tweet, post_id, data):
-        if tweet['media']:
-            for media in tweet['media']:
-                orig = None
-
-                if media["_type"] == "snscrape.modules.twitter.Photo":
-                    orig = media["fullUrl"]
-                elif media["_type"] == "snscrape.modules.twitter.Gif":
-                    orig = media["variants"][0]["url"]
-                elif media["_type"] == "snscrape.modules.twitter.Video":
-                    variant = max([v for v in media["variants"] if v["bitrate"]], key=lambda v: v["bitrate"])
-                    orig = variant["url"]
-                
-                if orig is None:
-                    logger.warning(f"No media URL found for {media}")
-                elif orig not in data.archived_urls:
-                    logger.info("Media discovered but not archived")
-                else:
-                    new = data.archived_urls[orig]
-
-                    if media["_type"] == "snscrape.modules.twitter.Photo":
-                        m = Image(url=new, post=post_id, raw_id=data.id, original_url=orig)
-                    else:
-                        m = Video(url=new, post=post_id, raw_id=data.id, original_url=orig)
-
-                    yield m
-
-    def transform_info(self, data: RawChannelInfo, insert: Callable, session, channel=None) -> Generator[Union[Post, Channel, Media], None, None]:
-        raw = json.loads(data.raw_data)
-
-        transformed = ChannelInfo(
-            raw_channel_info_id=data.id,
-            channel=data.channel,
-            platform_id=raw['id'],
-            platform=data.platform,
-            scraper=data.scraper,
-            transformer=self.__version__,
-            screenname=raw['username'],
-            name=raw['displayname'],
-            description=raw['rawDescription'],
-            description_url=raw['linkUrl'],
-            description_location=raw['location'],
-            followers=raw['followersCount'],
-            following=raw['friendsCount'],
-            verified=raw['verified'],
-            date_created=dateutil.parser.parse(raw['created']),
-            date_archived=data.date_archived,
-            date_transformed=datetime.now(timezone.utc)
-        )
-
-        transformed = insert(transformed)
-
-
-    def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]:
-        raw = json.loads(data.raw_data)
-
-        transformed = Post(
-            raw_id=data.id,
-            platform_id=raw['id'],
-            scraper=data.scraper,
-            transformer=self.__version__,
-            platform=data.platform,
-            channel=data.channel,
-            date=dateutil.parser.parse(raw['date']),
-            date_archived=data.date_archived,
-            date_transformed=datetime.now(timezone.utc),
-            url=raw['url'],
-            content=raw['content'],
-            author_id=raw['user']['id'],
-            author_username=raw['user']['username'])
-
-        def subtweet(tweet):
-            channel = Channel(
-                name=tweet['user']['displayname'],
-                platform_id=tweet['user']['id'],
-                platform=data.platform,
-                url=tweet['user']['url'],
-                screenname=tweet['user']['username'],
-                category='forwarded',
-                source=self.__version__
-                )
-
-            channel = insert(channel)
-
-            original = Post(
-                raw_id=data.id,
-                platform_id=tweet['id'],
-                scraper=data.scraper,
-                transformer=self.__version__,
-                platform=data.platform,
-                channel=channel.id,
-                date=dateutil.parser.parse(tweet['date']),
-                date_archived=data.date_archived,
-                date_transformed=datetime.now(timezone.utc),
-                url=tweet['url'],
-                content=tweet['content'],
-                author_id=tweet['user']['id'],
-                author_username=tweet['user']['username']
-            )
-
-            original = insert(original)
-            transformed.forwarded_from = channel.id
-            transformed.reply_to = original.id
-
-            media = self.process_media(tweet, original.id, data)
-            for m in media:
-                insert(m)
-
-        if raw['retweetedTweet'] is not None:
-            subtweet(raw['retweetedTweet'])
-
-        if raw['quotedTweet'] is not None:
-            subtweet(raw['quotedTweet'])
-
-        #insert_post
-        insert_post(transformed)
--- a/cisticola/transformer/vkontakte.py
+++ b/cisticola/transformer/vkontakte.py
@@ -1,74 +0,0 @@
-import json
-from loguru import logger
-from typing import Generator, Union, Callable
-import dateutil.parser
-from datetime import datetime, timezone
-from sqlalchemy import func
-
-from cisticola.transformer.base import Transformer 
-from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
-
-class VkontakteTransformer(Transformer):
-    """A Vkontakte specific ScraperResult, with a method ETL/transforming"""
-
-    __version__ = "VkontakteTransformer 0.0.1"
-
-    def can_handle(self, data: ScraperResult) -> bool:
-        scraper = data.scraper.split(' ')
-        if scraper[0] == "VkontakteScraper":
-            return True
-
-        return False        
-
-    def transform_info(self, data: RawChannelInfo, insert: Callable, session, channel=None) -> Generator[Union[Post, Channel, Media], None, None]:
-        raw = json.loads(data.raw_data)
-
-        transformed = ChannelInfo(
-            raw_channel_info_id=data.id,
-            channel=data.channel,
-            platform_id=raw['username'],
-            platform=data.platform,
-            scraper=data.scraper,
-            transformer=self.__version__,
-            screenname=raw['username'],
-            name=raw['name'],
-            description=raw.get('description'),
-            description_url=raw.get('websites'),
-            description_location=None,
-            followers=int(raw['followers']) if raw['followers'] else None,
-            following=-1,
-            verified=raw['verified'],
-            date_archived=data.date_archived,
-            date_created=None,
-            date_transformed=datetime.now(timezone.utc)
-        )
-
-        transformed = insert(transformed)
-
-
-    def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]:
-        raw = json.loads(data.raw_data)           
-
-        transformed = Post(
-            raw_id=data.id,
-            platform_id=data.platform_id,
-            scraper=data.scraper,
-            transformer=self.__version__,
-            platform=data.platform,
-            channel=data.channel,
-            date=data.date,
-            date_archived=data.date_archived,
-            date_transformed=datetime.now(timezone.utc),
-            url=raw['url'],
-            content=raw['content'] if raw['content'] else '',
-            author_id = None,
-            author_username=None,
-            outlinks =list(filter(None, raw["outlinks"])) if raw['outlinks'] else [],
-            )
-
-        # insert_post
-        insert_post(transformed)
-
-        # media = self.process_media(raw, transformed.id, data)
-        # for m in media:
-        #     insert(m)
--- a/docs/edit_apidoc.sh
+++ b/docs/edit_apidoc.sh
@@ -5,7 +5,7 @@ HIDE_COOKIESTRING="   :exclude-members: cookiestring"
 REPLACE_MAXDEPTH="s/   :maxdepth: 4/   :maxdepth: 1/g"

 # Remove display of ``cookiestring`` class variable, otherwise Sphinx generates docs containing the value of your cookiestring, based on your ``YOUTUBE_COOKIESTRING`` environment variable
-for file in cisticola.scraper.base.rst cisticola.scraper.rumble.rst cisticola.scraper.youtube.rst
+for file in cisticola.scraper.base.rst cisticola.scraper.rumble.rst
 do
    echo "$HIDE_COOKIESTRING" >> $RST_SOURCE_DIR/$file;
 done
--- a/docs/source/cisticola.scraper.gab.rst
+++ b/docs/source/cisticola.scraper.gab.rst
@@ -1,7 +0,0 @@
-cisticola.scraper.gab module
-============================
-
-.. automodule:: cisticola.scraper.gab
-   :members:
-   :undoc-members:
-   :show-inheritance:
--- a/docs/source/cisticola.scraper.instagram.rst
+++ b/docs/source/cisticola.scraper.instagram.rst
@@ -1,7 +0,0 @@
-cisticola.scraper.instagram module
-==================================
-
-.. automodule:: cisticola.scraper.instagram
-   :members:
-   :undoc-members:
-   :show-inheritance:
--- a/docs/source/cisticola.scraper.odysee.rst
+++ b/docs/source/cisticola.scraper.odysee.rst
@@ -1,7 +0,0 @@
-cisticola.scraper.odysee module
-===============================
-
-.. automodule:: cisticola.scraper.odysee
-   :members:
-   :undoc-members:
-   :show-inheritance:
--- a/docs/source/cisticola.scraper.rst
+++ b/docs/source/cisticola.scraper.rst
@@ -14,12 +14,6 @@ Submodules

   cisticola.scraper.base
   cisticola.scraper.bitchute
-   cisticola.scraper.gab
   cisticola.scraper.gettr
-   cisticola.scraper.instagram
-   cisticola.scraper.odysee
   cisticola.scraper.rumble
   cisticola.scraper.telegram_telethon
-   cisticola.scraper.twitter
-   cisticola.scraper.vkontakte
-   cisticola.scraper.youtube
--- a/docs/source/cisticola.scraper.twitter.rst
+++ b/docs/source/cisticola.scraper.twitter.rst
@@ -1,7 +0,0 @@
-cisticola.scraper.twitter module
-================================
-
-.. automodule:: cisticola.scraper.twitter
-   :members:
-   :undoc-members:
-   :show-inheritance:
--- a/docs/source/cisticola.scraper.vkontakte.rst
+++ b/docs/source/cisticola.scraper.vkontakte.rst
@@ -1,7 +0,0 @@
-cisticola.scraper.vkontakte module
-==================================
-
-.. automodule:: cisticola.scraper.vkontakte
-   :members:
-   :undoc-members:
-   :show-inheritance:
--- a/docs/source/cisticola.scraper.youtube.rst
+++ b/docs/source/cisticola.scraper.youtube.rst
@@ -1,8 +0,0 @@
-cisticola.scraper.youtube module
-================================
-
-.. automodule:: cisticola.scraper.youtube
-   :members:
-   :undoc-members:
-   :show-inheritance:
-   :exclude-members: cookiestring
--- a/docs/source/cisticola.transformer.rst
+++ b/docs/source/cisticola.transformer.rst
@@ -17,5 +17,3 @@ Submodules
   cisticola.transformer.gettr
   cisticola.transformer.rumble
   cisticola.transformer.telegram_telethon
-   cisticola.transformer.twitter
-   cisticola.transformer.vkontakte
--- a/docs/source/cisticola.transformer.twitter.rst
+++ b/docs/source/cisticola.transformer.twitter.rst
@@ -1,7 +0,0 @@
-cisticola.transformer.twitter module
-====================================
-
-.. automodule:: cisticola.transformer.twitter
-   :members:
-   :undoc-members:
-   :show-inheritance:
--- a/docs/source/cisticola.transformer.vkontakte.rst
+++ b/docs/source/cisticola.transformer.vkontakte.rst
@@ -1,7 +0,0 @@
-cisticola.transformer.vkontakte module
-======================================
-
-.. automodule:: cisticola.transformer.vkontakte
-   :members:
-   :undoc-members:
-   :show-inheritance:
--- a/docs/source/quickstart.rst
+++ b/docs/source/quickstart.rst
@@ -34,7 +34,7 @@ If you do not already have a Telegram application, you can create one by followi

 To initialize a Telegram session, run the following script from the package's root directory using the command-line:

-.. bash::
+.. code-block:: console

    bash telethon_session_init.py

@@ -43,13 +43,13 @@ Documentation

 The *cisticola* application uses Sphinx_ to generate and display its documentation. To build the documentation in the HTML format, run the following command from the ``docs/`` directory:

-.. code-block::
+.. code-block:: console

    pipenv run make html

 For developers, if changes are made to the package structure or additional modules are created, you can update the Sphinx source ``*.rst`` files by running the following command from the ``docs/`` directory:

-.. code-block::
+.. code-block:: console

    pipenv run make apidoc