diff --git a/README.md b/README.md index 1b86dab..66cfb8a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,10 @@ Cisticola ========== +The *cisticola* application enables users to easily collect, process, and analyze large-scale data from several social media platforms. + +It scrapes raw data by coordinating with a set of platform-specific scrapers, archives media attachments, and stores the data in a SQL database. + +For more information about the structure of Cisticola, as well as installation and deployment instructions, see the documentation. + ![Cisticola, the bird](docs/images/cisticola.jpeg) diff --git a/cisticola/scraper/__init__.py b/cisticola/scraper/__init__.py index 0889b38..7b7d12e 100644 --- a/cisticola/scraper/__init__.py +++ b/cisticola/scraper/__init__.py @@ -1,12 +1,6 @@ from cisticola.utils import make_request from .base import Scraper, ScraperController, ChannelDoesNotExistError from .bitchute import BitchuteScraper -from .gab import GabScraper from .gettr import GettrScraper -from .instagram import InstagramScraper -from .odysee import OdyseeScraper from .rumble import RumbleScraper -from .telegram_telethon import TelegramTelethonScraper -from .twitter import TwitterScraper -from .vkontakte import VkontakteScraper -from .youtube import YoutubeScraper \ No newline at end of file +from .telegram_telethon import TelegramTelethonScraper \ No newline at end of file diff --git a/cisticola/scraper/gab.py b/cisticola/scraper/gab.py deleted file mode 100644 index b4bdd3d..0000000 --- a/cisticola/scraper/gab.py +++ /dev/null @@ -1,108 +0,0 @@ -from datetime import datetime, timezone, date -import json -from typing import Generator -import os -from loguru import logger - -from gabber.client import Client, GAB_API_BASE_URL - -from cisticola.base import Channel, ScraperResult, RawChannelInfo -from cisticola.scraper.base import Scraper - -class GabScraper(Scraper): - """An implementation of a Scraper for Gab, using gabber library""" - __version__ = "GabScraper 0.0.0" - - def get_username_from_url(self, url): - username = url.split('https://gab.com/')[-1] - - return username - - def get_group_id_from_url(self, url): - group_id = int(url.split('/')[-1]) - - return group_id - - @logger.catch - def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: - client = Client( - username = os.environ['GAB_USER'], - password = os.environ['GAB_PASS'], - threads = 25) - - if channel.url.split('/')[-2] == 'groups': - - group_id = self.get_group_id_from_url(url = channel.url) - scraper = client.pull_group_posts( - id = group_id, - depth = float('inf')) - else: - - username = self.get_username_from_url(channel.url) - - result = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json() - user_id = int(result['id']) - - scraper = client.pull_statuses( - id = user_id, - created_after = date.min, - replies = False) - - for post in scraper: - if since is not None and datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): - break - - archived_urls = {} - - for attachment in post.get('media_attachments'): - if attachment.get('type') == 'video': - archived_urls[attachment['source_mp4']] = None - else: - archived_urls[attachment['url']] = None - - if post.get('reblog') is not None: - for attachment in post['reblog'].get('media_attachments'): - if attachment.get('type') == 'video': - archived_urls[attachment['source_mp4']] = None - else: - archived_urls[attachment['url']] = None - - yield ScraperResult( - scraper=self.__version__, - platform="Gab", - channel=channel.id, - platform_id=post['id'], - date=datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc), - date_archived=datetime.now(timezone.utc), - raw_data=json.dumps(post), - archived_urls=archived_urls, - media_archived=None) - - def can_handle(self, channel: Channel) -> bool: - if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None: - return True - - @logger.catch - def get_profile(self, channel: Channel) -> RawChannelInfo: - - client = Client( - username = os.environ['GAB_USER'], - password = os.environ['GAB_PASS'], - threads = 25) - - if channel.url.split('/')[-2] == 'groups': - - group_id = self.get_group_id_from_url(url = channel.url) - profile = client.pull_group(id = group_id) - - else: - - username = self.get_username_from_url(channel.url) - - profile = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json() - - return RawChannelInfo(scraper=self.__version__, - platform=channel.platform, - channel=channel.id, - raw_data=json.dumps(profile), - date_archived=datetime.now(timezone.utc)) diff --git a/cisticola/scraper/instagram.py b/cisticola/scraper/instagram.py deleted file mode 100644 index 04c0ca4..0000000 --- a/cisticola/scraper/instagram.py +++ /dev/null @@ -1,126 +0,0 @@ -from typing import Generator, List -from datetime import datetime, timezone -import os -import json -import tempfile -from pathlib import Path - -from loguru import logger -import instaloader - -from cisticola.base import Channel, ScraperResult, RawChannelInfo -from cisticola.scraper.base import Scraper - -BASE_URL = 'https://www.instagram.com/' - -CONTENT_TYPES = { - 'jpg' : 'image/jpeg', - 'mp4' : 'video/mp4'} - -class InstagramScraper(Scraper): - """An implementation of a Scraper for Instagram, using instaloader library""" - __version__ = "InstagramScraper 0.0.0" - - def get_username_from_url(self, url): - username = url.split(BASE_URL)[1].strip('/') - return username - - @logger.catch - def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: - - username = self.get_username_from_url(channel.url) - - loader = instaloader.Instaloader( - quiet = True, - download_comments = False, - save_metadata = False) - - loader.login( - user = os.environ['INSTAGRAM_USERNAME'], - passwd = os.environ['INSTAGRAM_PASSWORD']) - - profile = instaloader.Profile.from_username( - context = loader.context, - username = username) - - for post in profile.get_posts(): - - if since is not None and post.date_utc <= since.date: - break - - post_url = f'{BASE_URL}p/{post.shortcode}/' - - archived_urls = get_archived_urls_from_post(post = post) - - yield ScraperResult( - scraper=self.__version__, - platform="Instagram", - channel=channel.id, - platform_id=post.mediaid, - date=post.date_utc, - date_archived=datetime.now(timezone.utc), - raw_data=json.dumps(post._asdict(), default=str), - archived_urls=archived_urls, - media_archived=None) - - for comment in post.get_comments(): - - comment_dict = comment._asdict() - comment_dict['post_url'] = post_url - comment_dict['is_comment'] = True - - yield ScraperResult( - scraper=self.__version__, - platform="Instagram", - channel=channel.id, - platform_id=post.mediaid, - date=comment.created_at_utc, - date_archived=datetime.now(timezone.utc), - raw_data=json.dumps(comment_dict, default=str), - archived_urls={}, - media_archived=datetime.now(timezone.utc)) - - def can_handle(self, channel): - if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None: - return True - - @logger.catch - def get_profile(self, channel: Channel) -> RawChannelInfo: - - username = self.get_username_from_url(channel.url) - - loader = instaloader.Instaloader( - quiet = True, - download_comments = False, - save_metadata = False) - - loader.login( - user = os.environ['INSTAGRAM_USERNAME'], - passwd = os.environ['INSTAGRAM_PASSWORD']) - - user_profile = instaloader.Profile.from_username( - context = loader.context, - username = username) - - profile = user_profile._asdict() - profile['followers'] = user_profile.followers - profile['followees'] = user_profile.followees - - return RawChannelInfo(scraper=self.__version__, - platform=channel.platform, - channel=channel.id, - raw_data=json.dumps(profile), - date_archived=datetime.now(timezone.utc)) - -def get_archived_urls_from_post(post: instaloader.Post) -> List[str]: - typename = post._node['__typename'] - if typename == 'GraphImage': - urls = [post._node['display_url']] - elif typename == 'GraphVideo': - urls = [post._node['video_url']] - elif typename == 'GraphSidecar': - urls = [edge['node']['display_url'] for edge in post._node['edge_sidecar_to_children']['edges']] - else: - raise NotImplementedError(f'post of type {typename} is currently not supported.') - - return {url : None for url in urls} \ No newline at end of file diff --git a/cisticola/scraper/odysee.py b/cisticola/scraper/odysee.py deleted file mode 100644 index a086470..0000000 --- a/cisticola/scraper/odysee.py +++ /dev/null @@ -1,110 +0,0 @@ -from datetime import datetime, timezone -import json -from typing import Generator -from urllib.parse import urlparse - -import requests -from loguru import logger - -from polyphemus.base import OdyseeChannelScraper, process_raw_comment_info -from polyphemus.api import get_auth_token, get_all_comments -from cisticola.base import Channel, ScraperResult, RawChannelInfo -from cisticola.scraper.base import Scraper - -class OdyseeScraper(Scraper): - """An implementation of a Scraper for Odysee, using polyphemus library""" - __version__ = "OdyseeScraper 0.0.0" - - def __init__(self): - super().__init__() - self.auth_token = get_auth_token() - - def get_username_from_url(self, url): - - username = url.split('odysee.com/')[-1].strip('@').split(':')[0] - - return username - - @logger.catch - def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: - - username = self.get_username_from_url(channel.url) - scraper = OdyseeChannelScraper(channel_name = username, auth_token = self.auth_token) - - all_videos = scraper.get_all_videos() - - for video in all_videos: - if since is not None and video.created.replace(tzinfo=timezone.utc) <= since.date: - break - - url = video.streaming_url - if url is None: - archived_urls = {} - else: - archived_urls = {url: None} - - raw_comment_info_list = get_all_comments(video_id=video.claim_id) - all_comments = (process_raw_comment_info(raw_comment_info) for raw_comment_info in raw_comment_info_list) - - yield ScraperResult( - scraper=self.__version__, - platform="Odysee", - channel=channel.id, - platform_id=video.claim_id, - date=video.created.replace(tzinfo=timezone.utc), - date_archived=datetime.now(timezone.utc), - raw_data=json.dumps(video.__dict__, default = str), - archived_urls=archived_urls, - media_archived=None) - - for comment in all_comments: - - yield ScraperResult( - scraper=self.__version__, - platform="Odysee", - channel=channel.id, - platform_id=comment.claim_id, - date=comment.created.replace(tzinfo=timezone.utc), - date_archived=datetime.now(), - raw_data=json.dumps(comment.__dict__, default = str), - archived_urls={}, - media_archived=datetime.now(timezone.utc)) - - @logger.catch - def archive_files(self, result: ScraperResult) -> ScraperResult: - for url in result.archived_urls: - if result.archived_urls[url] is None: - r = requests.head(url) - if r.headers['Content-Type'] == 'text/html; charset=utf-8': - media_blob, content_type, key = self.m3u8_url_to_blob(url) - else: - media_blob, content_type, key = self.url_to_blob(url) - - archived_url = self.archive_blob(media_blob, content_type, key) - result.archived_urls[url] = archived_url - - result.media_archived = datetime.now(timezone.utc) - return result - - def can_handle(self, channel): - if channel.platform == "Odysee" and self.get_username_from_url(channel.url) is not None: - return True - - def url_to_key(self, url: str, content_type: str) -> str: - key = urlparse(url).path.split('/')[-2] - ext = content_type.split('/')[-1] - - return f'{key}.{ext}' - - @logger.catch - def get_profile(self, channel: Channel) -> RawChannelInfo: - - username = self.get_username_from_url(channel.url) - scraper = OdyseeChannelScraper(channel_name = username, auth_token = self.auth_token) - profile = scraper.get_entity().__dict__ - - return RawChannelInfo(scraper=self.__version__, - platform=channel.platform, - channel=channel.id, - raw_data=json.dumps(profile, default = str), - date_archived=datetime.now(timezone.utc)) \ No newline at end of file diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py deleted file mode 100644 index 318cb8d..0000000 --- a/cisticola/scraper/twitter.py +++ /dev/null @@ -1,108 +0,0 @@ -from datetime import datetime, timezone -from typing import Generator -from urllib.parse import urlparse, parse_qs -from snscrape.modules.twitter import TwitterProfileScraper, TwitterUserScraper, Video, Gif, Photo -from loguru import logger -import json - -from cisticola.base import Channel, ScraperResult, RawChannelInfo -from cisticola.scraper.base import Scraper, ChannelDoesNotExistError - -class TwitterScraper(Scraper): - """An implementation of a Scraper for Twitter, using snscrape library""" - __version__ = "TwitterScraper 0.0.0" - - @logger.catch - def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: - if channel.platform_id: - identifier = int(channel.platform_id) - else: - identifier = channel.screenname - - scraper = TwitterProfileScraper(identifier) - - first = True - - for tweet in scraper.get_items(): - if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): - # with TwitterProfileScraper, the first tweet could be an old pinned tweet - if first: - first = False - continue - else: - break - - archived_urls = {} - - media_list = [] - if tweet.media: - media_list += tweet.media - - if tweet.retweetedTweet and hasattr(tweet.retweetedTweet, 'media') and tweet.retweetedTweet.media: - media_list += tweet.retweetedTweet.media - - if tweet.quotedTweet and hasattr(tweet.quotedTweet, 'media') and tweet.quotedTweet.media: - media_list += tweet.quotedTweet.media - - for media in media_list: - if type(media) == Video: - variant = max( - [v for v in media.variants if v.bitrate], key=lambda v: v.bitrate) - url = variant.url - elif type(media) == Gif: - url = media.variants[0].url - elif type(media) == Photo: - url = media.fullUrl - else: - logger.warning(f"Could not get media URL of {media}") - url = None - - if url is not None and url not in archived_urls: - archived_urls[url] = None - - yield ScraperResult( - scraper=self.__version__, - platform="Twitter", - channel=channel.id, - platform_id=tweet.id, - date=tweet.date, - date_archived=datetime.now(timezone.utc), - raw_data=tweet.json(), - archived_urls=archived_urls, - media_archived=None) - - def can_handle(self, channel): - if channel.platform == "Twitter" and (channel.platform_id or channel.screenname): - return True - - def url_to_key(self, url: str, content_type: str) -> str: - parsed_url = urlparse(url) - queries = parse_qs(parsed_url.query) - - ext = '' - - # TODO might require additional statements for other media formats - if 'jpg' in queries.get('format', []): - ext = '.jpg' - elif 'png' in queries.get('format', []): - ext = '.png' - elif parsed_url.path.endswith('.mp4'): - ext = '' - - key = parsed_url.path.split('/')[-1] + ext - return key - - @logger.catch - def get_profile(self, channel: Channel) -> RawChannelInfo: - - scraper = TwitterUserScraper(channel.screenname) - entity = scraper._get_entity() - - if entity is None: - raise ChannelDoesNotExistError(channel.url) - else: - return RawChannelInfo(scraper=self.__version__, - platform=channel.platform, - channel=channel.id, - raw_data=json.dumps(entity.__dict__, default=str), - date_archived=datetime.now(timezone.utc)) diff --git a/cisticola/scraper/vkontakte.py b/cisticola/scraper/vkontakte.py deleted file mode 100644 index f7fd7d8..0000000 --- a/cisticola/scraper/vkontakte.py +++ /dev/null @@ -1,107 +0,0 @@ -from datetime import datetime, timezone -from typing import Generator -from urllib.parse import urlparse -import json -import re - -from snscrape.modules.vkontakte import VKontakteUserScraper -from loguru import logger -from yt_dlp.extractor.vk import VKIE - -from cisticola.base import Channel, ScraperResult, RawChannelInfo -from cisticola.scraper.base import Scraper - -class VkontakteScraper(Scraper): - """An implementation of a Scraper for Vkontakte, using snscrape library""" - __version__ = "VkontakteScraper 0.0.1" - - def get_username_from_url(self, url): - username = url.split('https://vk.com/')[1] - - return username - - @logger.catch - def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: - - username = self.get_username_from_url(channel.url) - scraper = VKontakteUserScraper(username) - - first = True - - for post in scraper.get_items(): - if since is not None and datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): - # with VKontakteUserScraper, the first tweet could be an old pinned tweet - if first: - first = False - continue - else: - break - - archived_urls = {} - - if post.photos: - - for photo in post.photos: - variant = max( - [v for v in photo.variants], key=lambda v: v.width * v.height) - url = variant.url - if url is not None: - archived_urls[url] = None - - if post.video: - archived_urls[post.video.url] = None - - yield ScraperResult( - scraper=self.__version__, - platform="VK", - channel=channel.id, - platform_id=post.url.split('/')[-1], - date=datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc), - date_archived=datetime.now(timezone.utc), - raw_data=post.json(), - archived_urls=archived_urls, - media_archived=None) - - @logger.catch - def archive_files(self, result: ScraperResult) -> ScraperResult: - for url in result.archived_urls: - if result.archived_urls[url] is None: - if re.match(VKIE._VALID_URL, url): - # Uses regex from yt_dlp to verify VK video URL - media_blob, content_type, key = self.ytdlp_url_to_blob(url) - else: - media_blob, content_type, key = self.url_to_blob(url) - archived_url = self.archive_blob(media_blob, content_type, key) - result.archived_urls[url] = archived_url - - result.media_archived = datetime.now(timezone.utc) - return result - - - def can_handle(self, channel): - if channel.platform == "VK": - return True - - def url_to_key(self, url: str, content_type: str) -> str: - path = urlparse(url).path - if path.endswith('.jpg'): - key = '_'.join(path.split('/')[-2:]) - else: - ext = '.mp4' - key = path.split('/')[-1] + ext - - return key - - @logger.catch - def get_profile(self, channel: Channel) -> RawChannelInfo: - - username = self.get_username_from_url(channel.url) - scraper = VKontakteUserScraper(username) - - profile = scraper._get_entity().__dict__ - - return RawChannelInfo(scraper=self.__version__, - platform=channel.platform, - channel=channel.id, - raw_data=json.dumps(profile), - date_archived=datetime.now(timezone.utc)) diff --git a/cisticola/scraper/youtube.py b/cisticola/scraper/youtube.py deleted file mode 100644 index e4210f9..0000000 --- a/cisticola/scraper/youtube.py +++ /dev/null @@ -1,154 +0,0 @@ -from datetime import datetime, timezone -import json -from typing import Generator -import tempfile -from pathlib import Path -import os - -import yt_dlp -from loguru import logger - -from cisticola.base import Channel, ScraperResult, RawChannelInfo -from cisticola.scraper import Scraper - -class YoutubeScraper(Scraper): - """An implementation of a Scraper for Youtube, using youtube-dl""" - __version__ = "YoutubeScraper 0.0.1" - - cookiestring = os.environ["YOUTUBE_COOKIESTRING"].replace(r'\n', '\n').replace(r'\t', '\t') - cookiefilename = 'cookiefile.txt' - - @logger.catch - def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: - - content_type = 'video/mp4' - - if since is None: - since_date = datetime.min - start_date = None - else: - since_date = since.date - start_date = since.date.strftime('%Y%m%d') - - with tempfile.TemporaryDirectory() as temp_dir: - - cookiefile = Path(temp_dir)/self.cookiefilename - with open(cookiefile, 'w') as f: - f.write(self.cookiestring) - - daterange = yt_dlp.utils.DateRange(start = start_date) - - ydl_opts = { - "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best", - "merge_output_format": "mp4", - "outtmpl": f"{temp_dir}/%(id)s.%(ext)s", - "daterange" : daterange, - "quiet": True, - "verbose": False, - "retries": 5, - "cookiefile": cookiefile} - - ydl = yt_dlp.YoutubeDL(ydl_opts) - - try: - meta = ydl.extract_info( - channel.url, - download=False) - except yt_dlp.utils.DownloadError as e: - raise e - else: - videos = meta['entries'] - valid_videos = [video for video in videos if since_date < datetime.strptime(video['upload_date'], '%Y%m%d')] - - for video in valid_videos: - - url = video['webpage_url'] - - archived_urls = {url: None} - - video_id = video["id"] - - yield ScraperResult( - scraper=self.__version__, - platform="Youtube", - channel=channel.id, - platform_id=video_id, - date=datetime.strptime(video['upload_date'], '%Y%m%d').replace(tzinfo=timezone.utc), - date_archived=datetime.now(timezone.utc), - raw_data=json.dumps(video, default = str), - archived_urls=archived_urls, - media_archived=None) - - def can_handle(self, channel): - if channel.platform == "Youtube" and channel.url: - return True - - @logger.catch - def archive_files(self, result: ScraperResult) -> ScraperResult: - for url in result.archived_urls: - if result.archived_urls[url] is None: - - media_blob = None - - with tempfile.TemporaryDirectory() as temp_dir: - - cookiefile = Path(temp_dir)/self.cookiefilename - with open(cookiefile, 'w') as f: - f.write(self.cookiestring) - - ydl_opts = { - "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best", - "merge_output_format": "mp4", - "outtmpl": f"{temp_dir}/%(id)s.%(ext)s", - "quiet": True, - "verbose": False, - "retries": 5, - "cookiefile": cookiefile} - - ydl = yt_dlp.YoutubeDL(ydl_opts) - - try: - ydl.download(url) - except yt_dlp.utils.DownloadError as e: - raise e - - files = [file for file in os.listdir(temp_dir) if file != self.cookiefilename] - if len(files) != 1: - logger.warning(f'{len(files)} files downloaded for video: {url}') - key = files[0] - with open(Path(temp_dir, key), 'rb') as f: - media_blob = f.read() - - if media_blob is not None: - content_type = 'video/mp4' - archived_url = self.archive_blob(media_blob, content_type, key) - result.archived_urls[url] = archived_url - - result.media_archived = datetime.now(timezone.utc) - return result - - @logger.catch - def get_profile(self, channel: Channel) -> RawChannelInfo: - - ydl_opts = { - "quiet": True, - "verbose": False, - "retries": 5} - - ydl = yt_dlp.YoutubeDL(ydl_opts) - - meta = None - try: - meta = ydl.extract_info( - channel.url, - process=False) - meta.pop('entries') - - return RawChannelInfo(scraper=self.__version__, - platform=channel.platform, - channel=channel.id, - raw_data=json.dumps(meta), - date_archived=datetime.now(timezone.utc)) - - except yt_dlp.utils.DownloadError as e: - raise e diff --git a/cisticola/transformer/__init__.py b/cisticola/transformer/__init__.py index 48f96bd..67fd430 100644 --- a/cisticola/transformer/__init__.py +++ b/cisticola/transformer/__init__.py @@ -1,7 +1,5 @@ from .base import ETLController -from .twitter import TwitterTransformer from .bitchute import BitchuteTransformer from .telegram_telethon import TelegramTelethonTransformer from .rumble import RumbleTransformer -from .gettr import GettrTransformer -from .vkontakte import VkontakteTransformer +from .gettr import GettrTransformer \ No newline at end of file diff --git a/cisticola/transformer/twitter.py b/cisticola/transformer/twitter.py deleted file mode 100644 index 52ece05..0000000 --- a/cisticola/transformer/twitter.py +++ /dev/null @@ -1,137 +0,0 @@ -import json -from loguru import logger -from typing import Generator, Union, Callable -import dateutil.parser -from datetime import datetime, timezone - -from cisticola.transformer.base import Transformer -from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel - -class TwitterTransformer(Transformer): - """A Twitter specific ScraperResult, with a method ETL/transforming""" - - __version__ = "TwitterTransformer 0.0.1" - - def can_handle(self, data: ScraperResult) -> bool: - scraper = data.scraper.split(' ') - if scraper[0] == "TwitterScraper": - return True - - return False - - def process_media(self, tweet, post_id, data): - if tweet['media']: - for media in tweet['media']: - orig = None - - if media["_type"] == "snscrape.modules.twitter.Photo": - orig = media["fullUrl"] - elif media["_type"] == "snscrape.modules.twitter.Gif": - orig = media["variants"][0]["url"] - elif media["_type"] == "snscrape.modules.twitter.Video": - variant = max([v for v in media["variants"] if v["bitrate"]], key=lambda v: v["bitrate"]) - orig = variant["url"] - - if orig is None: - logger.warning(f"No media URL found for {media}") - elif orig not in data.archived_urls: - logger.info("Media discovered but not archived") - else: - new = data.archived_urls[orig] - - if media["_type"] == "snscrape.modules.twitter.Photo": - m = Image(url=new, post=post_id, raw_id=data.id, original_url=orig) - else: - m = Video(url=new, post=post_id, raw_id=data.id, original_url=orig) - - yield m - - def transform_info(self, data: RawChannelInfo, insert: Callable, session, channel=None) -> Generator[Union[Post, Channel, Media], None, None]: - raw = json.loads(data.raw_data) - - transformed = ChannelInfo( - raw_channel_info_id=data.id, - channel=data.channel, - platform_id=raw['id'], - platform=data.platform, - scraper=data.scraper, - transformer=self.__version__, - screenname=raw['username'], - name=raw['displayname'], - description=raw['rawDescription'], - description_url=raw['linkUrl'], - description_location=raw['location'], - followers=raw['followersCount'], - following=raw['friendsCount'], - verified=raw['verified'], - date_created=dateutil.parser.parse(raw['created']), - date_archived=data.date_archived, - date_transformed=datetime.now(timezone.utc) - ) - - transformed = insert(transformed) - - - def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]: - raw = json.loads(data.raw_data) - - transformed = Post( - raw_id=data.id, - platform_id=raw['id'], - scraper=data.scraper, - transformer=self.__version__, - platform=data.platform, - channel=data.channel, - date=dateutil.parser.parse(raw['date']), - date_archived=data.date_archived, - date_transformed=datetime.now(timezone.utc), - url=raw['url'], - content=raw['content'], - author_id=raw['user']['id'], - author_username=raw['user']['username']) - - def subtweet(tweet): - channel = Channel( - name=tweet['user']['displayname'], - platform_id=tweet['user']['id'], - platform=data.platform, - url=tweet['user']['url'], - screenname=tweet['user']['username'], - category='forwarded', - source=self.__version__ - ) - - channel = insert(channel) - - original = Post( - raw_id=data.id, - platform_id=tweet['id'], - scraper=data.scraper, - transformer=self.__version__, - platform=data.platform, - channel=channel.id, - date=dateutil.parser.parse(tweet['date']), - date_archived=data.date_archived, - date_transformed=datetime.now(timezone.utc), - url=tweet['url'], - content=tweet['content'], - author_id=tweet['user']['id'], - author_username=tweet['user']['username'] - ) - - original = insert(original) - transformed.forwarded_from = channel.id - transformed.reply_to = original.id - - media = self.process_media(tweet, original.id, data) - for m in media: - insert(m) - - if raw['retweetedTweet'] is not None: - subtweet(raw['retweetedTweet']) - - if raw['quotedTweet'] is not None: - subtweet(raw['quotedTweet']) - - #insert_post - insert_post(transformed) \ No newline at end of file diff --git a/cisticola/transformer/vkontakte.py b/cisticola/transformer/vkontakte.py deleted file mode 100644 index ad50f94..0000000 --- a/cisticola/transformer/vkontakte.py +++ /dev/null @@ -1,74 +0,0 @@ -import json -from loguru import logger -from typing import Generator, Union, Callable -import dateutil.parser -from datetime import datetime, timezone -from sqlalchemy import func - -from cisticola.transformer.base import Transformer -from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel - -class VkontakteTransformer(Transformer): - """A Vkontakte specific ScraperResult, with a method ETL/transforming""" - - __version__ = "VkontakteTransformer 0.0.1" - - def can_handle(self, data: ScraperResult) -> bool: - scraper = data.scraper.split(' ') - if scraper[0] == "VkontakteScraper": - return True - - return False - - def transform_info(self, data: RawChannelInfo, insert: Callable, session, channel=None) -> Generator[Union[Post, Channel, Media], None, None]: - raw = json.loads(data.raw_data) - - transformed = ChannelInfo( - raw_channel_info_id=data.id, - channel=data.channel, - platform_id=raw['username'], - platform=data.platform, - scraper=data.scraper, - transformer=self.__version__, - screenname=raw['username'], - name=raw['name'], - description=raw.get('description'), - description_url=raw.get('websites'), - description_location=None, - followers=int(raw['followers']) if raw['followers'] else None, - following=-1, - verified=raw['verified'], - date_archived=data.date_archived, - date_created=None, - date_transformed=datetime.now(timezone.utc) - ) - - transformed = insert(transformed) - - - def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]: - raw = json.loads(data.raw_data) - - transformed = Post( - raw_id=data.id, - platform_id=data.platform_id, - scraper=data.scraper, - transformer=self.__version__, - platform=data.platform, - channel=data.channel, - date=data.date, - date_archived=data.date_archived, - date_transformed=datetime.now(timezone.utc), - url=raw['url'], - content=raw['content'] if raw['content'] else '', - author_id = None, - author_username=None, - outlinks =list(filter(None, raw["outlinks"])) if raw['outlinks'] else [], - ) - - # insert_post - insert_post(transformed) - - # media = self.process_media(raw, transformed.id, data) - # for m in media: - # insert(m) \ No newline at end of file diff --git a/docs/edit_apidoc.sh b/docs/edit_apidoc.sh index 733010f..92f0ac0 100644 --- a/docs/edit_apidoc.sh +++ b/docs/edit_apidoc.sh @@ -5,7 +5,7 @@ HIDE_COOKIESTRING=" :exclude-members: cookiestring" REPLACE_MAXDEPTH="s/ :maxdepth: 4/ :maxdepth: 1/g" # Remove display of ``cookiestring`` class variable, otherwise Sphinx generates docs containing the value of your cookiestring, based on your ``YOUTUBE_COOKIESTRING`` environment variable -for file in cisticola.scraper.base.rst cisticola.scraper.rumble.rst cisticola.scraper.youtube.rst +for file in cisticola.scraper.base.rst cisticola.scraper.rumble.rst do echo "$HIDE_COOKIESTRING" >> $RST_SOURCE_DIR/$file; done diff --git a/docs/source/cisticola.scraper.gab.rst b/docs/source/cisticola.scraper.gab.rst deleted file mode 100644 index 7bf4059..0000000 --- a/docs/source/cisticola.scraper.gab.rst +++ /dev/null @@ -1,7 +0,0 @@ -cisticola.scraper.gab module -============================ - -.. automodule:: cisticola.scraper.gab - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/cisticola.scraper.instagram.rst b/docs/source/cisticola.scraper.instagram.rst deleted file mode 100644 index 9288614..0000000 --- a/docs/source/cisticola.scraper.instagram.rst +++ /dev/null @@ -1,7 +0,0 @@ -cisticola.scraper.instagram module -================================== - -.. automodule:: cisticola.scraper.instagram - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/cisticola.scraper.odysee.rst b/docs/source/cisticola.scraper.odysee.rst deleted file mode 100644 index 8ea022c..0000000 --- a/docs/source/cisticola.scraper.odysee.rst +++ /dev/null @@ -1,7 +0,0 @@ -cisticola.scraper.odysee module -=============================== - -.. automodule:: cisticola.scraper.odysee - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/cisticola.scraper.rst b/docs/source/cisticola.scraper.rst index a21acc1..6aced75 100644 --- a/docs/source/cisticola.scraper.rst +++ b/docs/source/cisticola.scraper.rst @@ -14,12 +14,6 @@ Submodules cisticola.scraper.base cisticola.scraper.bitchute - cisticola.scraper.gab cisticola.scraper.gettr - cisticola.scraper.instagram - cisticola.scraper.odysee cisticola.scraper.rumble cisticola.scraper.telegram_telethon - cisticola.scraper.twitter - cisticola.scraper.vkontakte - cisticola.scraper.youtube diff --git a/docs/source/cisticola.scraper.twitter.rst b/docs/source/cisticola.scraper.twitter.rst deleted file mode 100644 index be927c8..0000000 --- a/docs/source/cisticola.scraper.twitter.rst +++ /dev/null @@ -1,7 +0,0 @@ -cisticola.scraper.twitter module -================================ - -.. automodule:: cisticola.scraper.twitter - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/cisticola.scraper.vkontakte.rst b/docs/source/cisticola.scraper.vkontakte.rst deleted file mode 100644 index 5df5c5a..0000000 --- a/docs/source/cisticola.scraper.vkontakte.rst +++ /dev/null @@ -1,7 +0,0 @@ -cisticola.scraper.vkontakte module -================================== - -.. automodule:: cisticola.scraper.vkontakte - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/cisticola.scraper.youtube.rst b/docs/source/cisticola.scraper.youtube.rst deleted file mode 100644 index 7f8302e..0000000 --- a/docs/source/cisticola.scraper.youtube.rst +++ /dev/null @@ -1,8 +0,0 @@ -cisticola.scraper.youtube module -================================ - -.. automodule:: cisticola.scraper.youtube - :members: - :undoc-members: - :show-inheritance: - :exclude-members: cookiestring diff --git a/docs/source/cisticola.transformer.rst b/docs/source/cisticola.transformer.rst index a6e0f39..4346a98 100644 --- a/docs/source/cisticola.transformer.rst +++ b/docs/source/cisticola.transformer.rst @@ -17,5 +17,3 @@ Submodules cisticola.transformer.gettr cisticola.transformer.rumble cisticola.transformer.telegram_telethon - cisticola.transformer.twitter - cisticola.transformer.vkontakte diff --git a/docs/source/cisticola.transformer.twitter.rst b/docs/source/cisticola.transformer.twitter.rst deleted file mode 100644 index 41a2d99..0000000 --- a/docs/source/cisticola.transformer.twitter.rst +++ /dev/null @@ -1,7 +0,0 @@ -cisticola.transformer.twitter module -==================================== - -.. automodule:: cisticola.transformer.twitter - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/cisticola.transformer.vkontakte.rst b/docs/source/cisticola.transformer.vkontakte.rst deleted file mode 100644 index 3d6c98d..0000000 --- a/docs/source/cisticola.transformer.vkontakte.rst +++ /dev/null @@ -1,7 +0,0 @@ -cisticola.transformer.vkontakte module -====================================== - -.. automodule:: cisticola.transformer.vkontakte - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index 5d8621e..312cd22 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -34,7 +34,7 @@ If you do not already have a Telegram application, you can create one by followi To initialize a Telegram session, run the following script from the package's root directory using the command-line: -.. bash:: +.. code-block:: console bash telethon_session_init.py @@ -43,13 +43,13 @@ Documentation The *cisticola* application uses Sphinx_ to generate and display its documentation. To build the documentation in the HTML format, run the following command from the ``docs/`` directory: -.. code-block:: +.. code-block:: console pipenv run make html For developers, if changes are made to the package structure or additional modules are created, you can update the Sphinx source ``*.rst`` files by running the following command from the ``docs/`` directory: -.. code-block:: +.. code-block:: console pipenv run make apidoc