removed broken scrapers and added basic README

This commit is contained in:
Tristan Lee
2023-08-04 09:15:53 -05:00
parent ef9292bc90
commit 30bb4e43e4
23 changed files with 12 additions and 1003 deletions

View File

@@ -1,4 +1,10 @@
Cisticola
==========
The *cisticola* application enables users to easily collect, process, and analyze large-scale data from several social media platforms.
It scrapes raw data by coordinating with a set of platform-specific scrapers, archives media attachments, and stores the data in a SQL database.
For more information about the structure of Cisticola, as well as installation and deployment instructions, see the documentation.
![Cisticola, the bird](docs/images/cisticola.jpeg)

View File

@@ -1,12 +1,6 @@
from cisticola.utils import make_request
from .base import Scraper, ScraperController, ChannelDoesNotExistError
from .bitchute import BitchuteScraper
from .gab import GabScraper
from .gettr import GettrScraper
from .instagram import InstagramScraper
from .odysee import OdyseeScraper
from .rumble import RumbleScraper
from .telegram_telethon import TelegramTelethonScraper
from .twitter import TwitterScraper
from .vkontakte import VkontakteScraper
from .youtube import YoutubeScraper
from .telegram_telethon import TelegramTelethonScraper

View File

@@ -1,108 +0,0 @@
from datetime import datetime, timezone, date
import json
from typing import Generator
import os
from loguru import logger
from gabber.client import Client, GAB_API_BASE_URL
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
class GabScraper(Scraper):
    """An implementation of a Scraper for Gab, using the gabber library.

    Gab credentials are read from the ``GAB_USER`` and ``GAB_PASS``
    environment variables every time a client is created.
    """

    __version__ = "GabScraper 0.0.0"

    def get_username_from_url(self, url):
        """Return whatever follows ``https://gab.com/`` in ``url``.

        If the prefix is absent the whole ``url`` is returned unchanged, so
        this never returns ``None``.
        """
        return url.split('https://gab.com/')[-1]

    def get_group_id_from_url(self, url):
        """Return the last ``/``-separated segment of ``url`` as an ``int``.

        Raises:
            ValueError: if that segment is not an integer literal.
        """
        return int(url.split('/')[-1])

    @staticmethod
    def _is_group_url(url):
        """Return True if the second-to-last ``/`` segment of ``url`` is ``groups``."""
        segments = url.split('/')
        # Guard the length so a URL without any '/' is treated as a user URL
        # instead of raising IndexError.
        return len(segments) >= 2 and segments[-2] == 'groups'

    @staticmethod
    def _make_client():
        """Build a gabber ``Client`` from the ``GAB_USER``/``GAB_PASS`` env vars."""
        return Client(
            username=os.environ['GAB_USER'],
            password=os.environ['GAB_PASS'],
            threads=25)

    @staticmethod
    def _created_at(post):
        """Parse ``post['created_at']`` into a UTC-aware ``datetime``."""
        stamp = post['created_at'].replace("Z", "+00:00")
        return datetime.fromisoformat(stamp).replace(tzinfo=timezone.utc)

    @staticmethod
    def _collect_media_urls(post):
        """Return ``{url: None}`` for every attachment of ``post`` and its reblog."""
        sources = [post]
        if post.get('reblog') is not None:
            sources.append(post['reblog'])

        urls = {}
        for source in sources:
            # ``media_attachments`` may be missing or null; treat both as empty.
            for attachment in source.get('media_attachments') or []:
                if attachment.get('type') == 'video':
                    urls[attachment['source_mp4']] = None
                else:
                    urls[attachment['url']] = None
        return urls

    @logger.catch
    def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
        """Yield a ``ScraperResult`` for each post of ``channel``.

        Group URLs are scraped through ``pull_group_posts``; any other URL is
        resolved to an account ID and scraped through ``pull_statuses``.
        Iteration stops at the first post whose creation time is not later
        than ``since.date`` (when ``since`` is given).
        """
        client = self._make_client()

        if self._is_group_url(channel.url):
            group_id = self.get_group_id_from_url(url=channel.url)
            scraper = client.pull_group_posts(
                id=group_id,
                depth=float('inf'))
        else:
            username = self.get_username_from_url(channel.url)
            result = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json()
            user_id = int(result['id'])
            scraper = client.pull_statuses(
                id=user_id,
                created_after=date.min,
                replies=False)

        cutoff = None if since is None else since.date.replace(tzinfo=timezone.utc)

        for post in scraper:
            created = self._created_at(post)
            if cutoff is not None and created <= cutoff:
                break

            yield ScraperResult(
                scraper=self.__version__,
                platform="Gab",
                channel=channel.id,
                platform_id=post['id'],
                date=created,
                date_archived=datetime.now(timezone.utc),
                raw_data=json.dumps(post),
                archived_urls=self._collect_media_urls(post),
                media_archived=None)

    def can_handle(self, channel: Channel) -> bool:
        """Return True if ``channel`` is on the "Gab" platform."""
        # The username check is kept from the original; get_username_from_url
        # never returns None, so the platform test is what decides.
        return channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None

    @logger.catch
    def get_profile(self, channel: Channel) -> RawChannelInfo:
        """Fetch the group or account profile of ``channel`` as ``RawChannelInfo``."""
        client = self._make_client()

        if self._is_group_url(channel.url):
            group_id = self.get_group_id_from_url(url=channel.url)
            profile = client.pull_group(id=group_id)
        else:
            username = self.get_username_from_url(channel.url)
            profile = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json()

        return RawChannelInfo(
            scraper=self.__version__,
            platform=channel.platform,
            channel=channel.id,
            raw_data=json.dumps(profile),
            date_archived=datetime.now(timezone.utc))

View File

@@ -1,126 +0,0 @@
from typing import Generator, List
from datetime import datetime, timezone
import os
import json
import tempfile
from pathlib import Path
from loguru import logger
import instaloader
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
BASE_URL = 'https://www.instagram.com/'
CONTENT_TYPES = {
'jpg' : 'image/jpeg',
'mp4' : 'video/mp4'}
class InstagramScraper(Scraper):
    """An implementation of a Scraper for Instagram, using the instaloader library.

    Credentials are read from the ``INSTAGRAM_USERNAME`` and
    ``INSTAGRAM_PASSWORD`` environment variables on every login.
    """

    __version__ = "InstagramScraper 0.0.0"

    def get_username_from_url(self, url):
        """Return the part of ``url`` after ``BASE_URL`` with ``/`` stripped.

        Raises:
            IndexError: if ``url`` does not contain ``BASE_URL``.
        """
        return url.split(BASE_URL)[1].strip('/')

    @staticmethod
    def _load_profile(username):
        """Log in with the env-var credentials and load ``username``'s profile."""
        loader = instaloader.Instaloader(
            quiet=True,
            download_comments=False,
            save_metadata=False)
        loader.login(
            user=os.environ['INSTAGRAM_USERNAME'],
            passwd=os.environ['INSTAGRAM_PASSWORD'])
        return instaloader.Profile.from_username(
            context=loader.context,
            username=username)

    @logger.catch
    def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
        """Yield a ``ScraperResult`` for each post of ``channel`` and each of its comments.

        Iteration stops at the first post dated at or before ``since.date``
        (when ``since`` is given).
        """
        username = self.get_username_from_url(channel.url)
        profile = self._load_profile(username)

        # Normalise both sides to UTC-aware values, as the other scrapers do,
        # so a naive/aware mix cannot raise TypeError.
        cutoff = None if since is None else since.date.replace(tzinfo=timezone.utc)

        for post in profile.get_posts():
            if cutoff is not None and post.date_utc.replace(tzinfo=timezone.utc) <= cutoff:
                break

            post_url = f'{BASE_URL}p/{post.shortcode}/'
            archived_urls = get_archived_urls_from_post(post=post)

            yield ScraperResult(
                scraper=self.__version__,
                platform="Instagram",
                channel=channel.id,
                platform_id=post.mediaid,
                date=post.date_utc,
                date_archived=datetime.now(timezone.utc),
                raw_data=json.dumps(post._asdict(), default=str),
                archived_urls=archived_urls,
                media_archived=None)

            for comment in post.get_comments():
                comment_dict = comment._asdict()
                comment_dict['post_url'] = post_url
                comment_dict['is_comment'] = True

                # NOTE(review): comments reuse the parent post's media ID as
                # platform_id; confirm this is intended rather than comment.id.
                yield ScraperResult(
                    scraper=self.__version__,
                    platform="Instagram",
                    channel=channel.id,
                    platform_id=post.mediaid,
                    date=comment.created_at_utc,
                    date_archived=datetime.now(timezone.utc),
                    raw_data=json.dumps(comment_dict, default=str),
                    archived_urls={},
                    # Comments carry no media, so they are marked archived immediately.
                    media_archived=datetime.now(timezone.utc))

    def can_handle(self, channel):
        """Return True if ``channel`` is on the "Instagram" platform."""
        return channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None

    @logger.catch
    def get_profile(self, channel: Channel) -> RawChannelInfo:
        """Fetch the profile of ``channel``, including follower/followee counts."""
        username = self.get_username_from_url(channel.url)
        user_profile = self._load_profile(username)

        profile = user_profile._asdict()
        profile['followers'] = user_profile.followers
        profile['followees'] = user_profile.followees

        return RawChannelInfo(
            scraper=self.__version__,
            platform=channel.platform,
            channel=channel.id,
            raw_data=json.dumps(profile),
            date_archived=datetime.now(timezone.utc))
def get_archived_urls_from_post(post: "instaloader.Post") -> dict:
    """Return ``{media_url: None}`` for every media item of ``post``.

    The URLs are read from the post's raw ``_node`` dict according to its
    ``__typename``: the display URL for ``GraphImage``, the video URL for
    ``GraphVideo`` and each child's display URL for ``GraphSidecar``.

    Raises:
        NotImplementedError: for any other ``__typename``.
    """
    node = post._node
    typename = node['__typename']

    if typename == 'GraphImage':
        urls = [node['display_url']]
    elif typename == 'GraphVideo':
        urls = [node['video_url']]
    elif typename == 'GraphSidecar':
        # NOTE(review): video children are also collected via display_url
        # here; confirm whether their video URL should be archived instead.
        urls = [edge['node']['display_url'] for edge in node['edge_sidecar_to_children']['edges']]
    else:
        raise NotImplementedError(f'post of type {typename} is currently not supported.')

    return {url: None for url in urls}

View File

@@ -1,110 +0,0 @@
from datetime import datetime, timezone
import json
from typing import Generator
from urllib.parse import urlparse
import requests
from loguru import logger
from polyphemus.base import OdyseeChannelScraper, process_raw_comment_info
from polyphemus.api import get_auth_token, get_all_comments
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
class OdyseeScraper(Scraper):
    """An implementation of a Scraper for Odysee, using the polyphemus library."""

    __version__ = "OdyseeScraper 0.0.0"

    def __init__(self):
        """Initialise the base scraper and obtain an auth token via ``get_auth_token``."""
        super().__init__()
        self.auth_token = get_auth_token()

    def get_username_from_url(self, url):
        """Return the channel name from an Odysee URL.

        Takes the text after ``odysee.com/``, strips ``@`` characters from
        both ends and drops anything from the first ``:`` onward.
        """
        return url.split('odysee.com/')[-1].strip('@').split(':')[0]

    @logger.catch
    def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
        """Yield a ``ScraperResult`` for each video of ``channel`` and each of its comments.

        Iteration stops at the first video created at or before
        ``since.date`` (when ``since`` is given).
        """
        username = self.get_username_from_url(channel.url)
        scraper = OdyseeChannelScraper(channel_name=username, auth_token=self.auth_token)

        # Normalise to UTC-aware so the comparison cannot mix naive and aware values.
        cutoff = None if since is None else since.date.replace(tzinfo=timezone.utc)

        for video in scraper.get_all_videos():
            created = video.created.replace(tzinfo=timezone.utc)
            if cutoff is not None and created <= cutoff:
                break

            url = video.streaming_url
            archived_urls = {} if url is None else {url: None}

            raw_comment_info_list = get_all_comments(video_id=video.claim_id)
            all_comments = (
                process_raw_comment_info(raw_comment_info)
                for raw_comment_info in raw_comment_info_list)

            yield ScraperResult(
                scraper=self.__version__,
                platform="Odysee",
                channel=channel.id,
                platform_id=video.claim_id,
                date=created,
                date_archived=datetime.now(timezone.utc),
                raw_data=json.dumps(video.__dict__, default=str),
                archived_urls=archived_urls,
                media_archived=None)

            for comment in all_comments:
                yield ScraperResult(
                    scraper=self.__version__,
                    platform="Odysee",
                    channel=channel.id,
                    platform_id=comment.claim_id,
                    date=comment.created.replace(tzinfo=timezone.utc),
                    # UTC-aware, matching every other timestamp in this class.
                    date_archived=datetime.now(timezone.utc),
                    raw_data=json.dumps(comment.__dict__, default=str),
                    archived_urls={},
                    # Comments carry no media, so they are marked archived immediately.
                    media_archived=datetime.now(timezone.utc))

    @logger.catch
    def archive_files(self, result: ScraperResult) -> ScraperResult:
        """Archive every not-yet-archived URL in ``result.archived_urls``.

        A HEAD request decides the download path: URLs answering with an HTML
        content type go through ``m3u8_url_to_blob``, all others through
        ``url_to_blob``. ``result`` is updated in place and returned.
        """
        for url in result.archived_urls:
            if result.archived_urls[url] is not None:
                continue

            r = requests.head(url)
            # .get() avoids a KeyError when the server omits Content-Type.
            if r.headers.get('Content-Type') == 'text/html; charset=utf-8':
                media_blob, content_type, key = self.m3u8_url_to_blob(url)
            else:
                media_blob, content_type, key = self.url_to_blob(url)

            result.archived_urls[url] = self.archive_blob(media_blob, content_type, key)

        result.media_archived = datetime.now(timezone.utc)
        return result

    def can_handle(self, channel):
        """Return True if ``channel`` is on the "Odysee" platform."""
        return channel.platform == "Odysee" and self.get_username_from_url(channel.url) is not None

    def url_to_key(self, url: str, content_type: str) -> str:
        """Build a storage key from ``url``'s second-to-last path segment.

        The extension is the subtype of ``content_type`` (the text after the
        last ``/``).
        """
        key = urlparse(url).path.split('/')[-2]
        ext = content_type.split('/')[-1]
        return f'{key}.{ext}'

    @logger.catch
    def get_profile(self, channel: Channel) -> RawChannelInfo:
        """Fetch the channel entity of ``channel`` and wrap it as ``RawChannelInfo``."""
        username = self.get_username_from_url(channel.url)
        scraper = OdyseeChannelScraper(channel_name=username, auth_token=self.auth_token)
        profile = scraper.get_entity().__dict__

        return RawChannelInfo(
            scraper=self.__version__,
            platform=channel.platform,
            channel=channel.id,
            raw_data=json.dumps(profile, default=str),
            date_archived=datetime.now(timezone.utc))

View File

@@ -1,108 +0,0 @@
from datetime import datetime, timezone
from typing import Generator
from urllib.parse import urlparse, parse_qs
from snscrape.modules.twitter import TwitterProfileScraper, TwitterUserScraper, Video, Gif, Photo
from loguru import logger
import json
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper, ChannelDoesNotExistError
class TwitterScraper(Scraper):
    """An implementation of a Scraper for Twitter, using the snscrape library."""

    __version__ = "TwitterScraper 0.0.0"

    @staticmethod
    def _media_url(media):
        """Return the URL to archive for one snscrape media object, or ``None``.

        Videos use the variant with the highest bitrate, GIFs their first
        variant and photos their full-size URL.
        """
        if isinstance(media, Video):
            # default=None keeps a video without any bitrate-tagged variant
            # from raising ValueError and aborting the whole scrape.
            best = max(
                (v for v in media.variants if v.bitrate),
                key=lambda v: v.bitrate,
                default=None)
            return None if best is None else best.url
        if isinstance(media, Gif):
            return media.variants[0].url if media.variants else None
        if isinstance(media, Photo):
            return media.fullUrl
        return None

    @logger.catch
    def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
        """Yield a ``ScraperResult`` for each tweet of ``channel``.

        The profile is looked up by numeric ``platform_id`` when set,
        otherwise by ``screenname``. Media of the tweet itself, its retweeted
        tweet and its quoted tweet are all collected for archiving.
        """
        if channel.platform_id:
            identifier = int(channel.platform_id)
        else:
            identifier = channel.screenname

        scraper = TwitterProfileScraper(identifier)
        cutoff = None if since is None else since.date.replace(tzinfo=timezone.utc)

        first = True
        for tweet in scraper.get_items():
            if cutoff is not None and tweet.date.replace(tzinfo=timezone.utc) <= cutoff:
                # with TwitterProfileScraper, the first tweet could be an old pinned tweet
                if first:
                    first = False
                    continue
                break

            media_list = []
            if tweet.media:
                media_list += tweet.media
            for related in (tweet.retweetedTweet, tweet.quotedTweet):
                if related and getattr(related, 'media', None):
                    media_list += related.media

            archived_urls = {}
            for media in media_list:
                url = self._media_url(media)
                if url is None:
                    logger.warning(f"Could not get media URL of {media}")
                elif url not in archived_urls:
                    archived_urls[url] = None

            yield ScraperResult(
                scraper=self.__version__,
                platform="Twitter",
                channel=channel.id,
                platform_id=tweet.id,
                date=tweet.date,
                date_archived=datetime.now(timezone.utc),
                raw_data=tweet.json(),
                archived_urls=archived_urls,
                media_archived=None)

    def can_handle(self, channel):
        """Return True for a "Twitter" channel that has a platform ID or screen name."""
        return channel.platform == "Twitter" and bool(channel.platform_id or channel.screenname)

    def url_to_key(self, url: str, content_type: str) -> str:
        """Build a storage key from the last path segment of ``url``.

        ``.jpg`` or ``.png`` is appended when the ``format`` query parameter
        names that type; otherwise the segment is used as-is.
        ``content_type`` is not consulted.
        """
        parsed_url = urlparse(url)
        formats = parse_qs(parsed_url.query).get('format', [])

        # TODO might require additional statements for other media formats
        if 'jpg' in formats:
            ext = '.jpg'
        elif 'png' in formats:
            ext = '.png'
        else:
            ext = ''

        return parsed_url.path.split('/')[-1] + ext

    @logger.catch
    def get_profile(self, channel: Channel) -> RawChannelInfo:
        """Fetch the user entity for ``channel.screenname``.

        Raises:
            ChannelDoesNotExistError: if snscrape returns no entity.
        """
        scraper = TwitterUserScraper(channel.screenname)
        entity = scraper._get_entity()

        if entity is None:
            raise ChannelDoesNotExistError(channel.url)

        return RawChannelInfo(
            scraper=self.__version__,
            platform=channel.platform,
            channel=channel.id,
            raw_data=json.dumps(entity.__dict__, default=str),
            date_archived=datetime.now(timezone.utc))

View File

@@ -1,107 +0,0 @@
from datetime import datetime, timezone
from typing import Generator
from urllib.parse import urlparse
import json
import re
from snscrape.modules.vkontakte import VKontakteUserScraper
from loguru import logger
from yt_dlp.extractor.vk import VKIE
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
class VkontakteScraper(Scraper):
    """An implementation of a Scraper for Vkontakte, using the snscrape library."""

    __version__ = "VkontakteScraper 0.0.1"

    def get_username_from_url(self, url):
        """Return the part of ``url`` after ``https://vk.com/``.

        Raises:
            IndexError: if ``url`` does not contain that prefix.
        """
        return url.split('https://vk.com/')[1]

    @staticmethod
    def _post_date(post):
        """Return ``post.date`` truncated to midnight of its day, as UTC-aware."""
        return datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc)

    @logger.catch
    def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
        """Yield a ``ScraperResult`` for each post of ``channel``.

        The largest variant of every photo and the post's video URL (if any)
        are collected for archiving.
        """
        username = self.get_username_from_url(channel.url)
        scraper = VKontakteUserScraper(username)
        cutoff = None if since is None else since.date.replace(tzinfo=timezone.utc)

        first = True
        for post in scraper.get_items():
            posted = self._post_date(post)
            if cutoff is not None and posted <= cutoff:
                # with VKontakteUserScraper, the first post could be an old pinned post
                if first:
                    first = False
                    continue
                break

            archived_urls = {}
            for photo in post.photos or []:
                # default=None keeps a photo without variants from raising ValueError.
                variant = max(
                    photo.variants,
                    key=lambda v: v.width * v.height,
                    default=None)
                if variant is not None and variant.url is not None:
                    archived_urls[variant.url] = None

            if post.video:
                archived_urls[post.video.url] = None

            yield ScraperResult(
                scraper=self.__version__,
                platform="VK",
                channel=channel.id,
                platform_id=post.url.split('/')[-1],
                date=posted,
                date_archived=datetime.now(timezone.utc),
                raw_data=post.json(),
                archived_urls=archived_urls,
                media_archived=None)

    @logger.catch
    def archive_files(self, result: ScraperResult) -> ScraperResult:
        """Archive every not-yet-archived URL in ``result.archived_urls``.

        URLs matching yt_dlp's VK video pattern go through
        ``ytdlp_url_to_blob``; all others through ``url_to_blob``.
        ``result`` is updated in place and returned.
        """
        for url in result.archived_urls:
            if result.archived_urls[url] is not None:
                continue

            if re.match(VKIE._VALID_URL, url):
                # Uses regex from yt_dlp to verify VK video URL
                media_blob, content_type, key = self.ytdlp_url_to_blob(url)
            else:
                media_blob, content_type, key = self.url_to_blob(url)

            result.archived_urls[url] = self.archive_blob(media_blob, content_type, key)

        result.media_archived = datetime.now(timezone.utc)
        return result

    def can_handle(self, channel):
        """Return True if ``channel`` is on the "VK" platform."""
        return channel.platform == "VK"

    def url_to_key(self, url: str, content_type: str) -> str:
        """Build a storage key from the path of ``url``.

        ``.jpg`` paths use their last two segments joined by ``_``; anything
        else uses the last segment with ``.mp4`` appended. ``content_type``
        is not consulted.
        """
        path = urlparse(url).path
        if path.endswith('.jpg'):
            return '_'.join(path.split('/')[-2:])
        return path.split('/')[-1] + '.mp4'

    @logger.catch
    def get_profile(self, channel: Channel) -> RawChannelInfo:
        """Fetch the user entity of ``channel`` and wrap it as ``RawChannelInfo``."""
        username = self.get_username_from_url(channel.url)
        scraper = VKontakteUserScraper(username)
        profile = scraper._get_entity().__dict__

        return RawChannelInfo(
            scraper=self.__version__,
            platform=channel.platform,
            channel=channel.id,
            raw_data=json.dumps(profile),
            date_archived=datetime.now(timezone.utc))

View File

@@ -1,154 +0,0 @@
from datetime import datetime, timezone
import json
from typing import Generator
import tempfile
from pathlib import Path
import os
import yt_dlp
from loguru import logger
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper import Scraper
class YoutubeScraper(Scraper):
    """An implementation of a Scraper for Youtube, using yt_dlp."""

    __version__ = "YoutubeScraper 0.0.1"

    # Read once at class-definition time from YOUTUBE_COOKIESTRING; literal
    # "\n" and "\t" escape sequences in the variable become real newlines/tabs.
    # A missing variable raises KeyError when this module is imported.
    cookiestring = os.environ["YOUTUBE_COOKIESTRING"].replace(r'\n', '\n').replace(r'\t', '\t')
    cookiefilename = 'cookiefile.txt'

    def _write_cookiefile(self, temp_dir):
        """Write ``cookiestring`` to ``temp_dir``/``cookiefilename`` and return the path."""
        cookiefile = Path(temp_dir) / self.cookiefilename
        with open(cookiefile, 'w') as f:
            f.write(self.cookiestring)
        return cookiefile

    @staticmethod
    def _download_opts(temp_dir, cookiefile):
        """Return the yt_dlp options shared by listing and downloading."""
        return {
            "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
            "merge_output_format": "mp4",
            "outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
            "quiet": True,
            "verbose": False,
            "retries": 5,
            "cookiefile": cookiefile}

    @staticmethod
    def _upload_date(video):
        """Parse ``video['upload_date']`` (``YYYYMMDD``) into a UTC-aware ``datetime``."""
        return datetime.strptime(video['upload_date'], '%Y%m%d').replace(tzinfo=timezone.utc)

    @logger.catch
    def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
        """Yield a ``ScraperResult`` for each video of ``channel`` uploaded after ``since``.

        Only metadata is fetched here (``download=False``); the video file is
        retrieved later by ``archive_files``.

        Raises:
            yt_dlp.utils.DownloadError: if metadata extraction fails.
        """
        # Both bounds are UTC-aware so they compare cleanly with _upload_date().
        if since is None:
            since_date = datetime.min.replace(tzinfo=timezone.utc)
            start_date = None
        else:
            since_date = since.date.replace(tzinfo=timezone.utc)
            start_date = since.date.strftime('%Y%m%d')

        with tempfile.TemporaryDirectory() as temp_dir:
            cookiefile = self._write_cookiefile(temp_dir)

            ydl_opts = self._download_opts(temp_dir, cookiefile)
            ydl_opts["daterange"] = yt_dlp.utils.DateRange(start=start_date)

            ydl = yt_dlp.YoutubeDL(ydl_opts)
            meta = ydl.extract_info(channel.url, download=False)

            # Materialise the selection here so the temporary directory is
            # released before the first result is yielded.
            valid_videos = [
                (video, self._upload_date(video))
                for video in meta['entries']]
            valid_videos = [
                (video, uploaded)
                for video, uploaded in valid_videos
                if since_date < uploaded]

        for video, uploaded in valid_videos:
            yield ScraperResult(
                scraper=self.__version__,
                platform="Youtube",
                channel=channel.id,
                platform_id=video["id"],
                date=uploaded,
                date_archived=datetime.now(timezone.utc),
                raw_data=json.dumps(video, default=str),
                archived_urls={video['webpage_url']: None},
                media_archived=None)

    def can_handle(self, channel):
        """Return True for a "Youtube" channel that has a URL."""
        return channel.platform == "Youtube" and bool(channel.url)

    @logger.catch
    def archive_files(self, result: ScraperResult) -> ScraperResult:
        """Download and archive every not-yet-archived URL in ``result.archived_urls``.

        Each video is downloaded into its own temporary directory and
        archived as ``video/mp4`` under the downloaded file's name. A URL for
        which nothing was downloaded is logged and left unarchived.
        ``result`` is updated in place and returned.

        Raises:
            yt_dlp.utils.DownloadError: if a download fails.
        """
        for url in result.archived_urls:
            if result.archived_urls[url] is not None:
                continue

            media_blob = None
            key = None

            with tempfile.TemporaryDirectory() as temp_dir:
                cookiefile = self._write_cookiefile(temp_dir)
                ydl = yt_dlp.YoutubeDL(self._download_opts(temp_dir, cookiefile))

                # download() takes a list of URLs; wrap the single URL explicitly.
                ydl.download([url])

                files = [file for file in os.listdir(temp_dir) if file != self.cookiefilename]
                if len(files) != 1:
                    logger.warning(f'{len(files)} files downloaded for video: {url}')

                if files:
                    key = files[0]
                    with open(Path(temp_dir, key), 'rb') as f:
                        media_blob = f.read()

            if media_blob is not None:
                result.archived_urls[url] = self.archive_blob(media_blob, 'video/mp4', key)

        result.media_archived = datetime.now(timezone.utc)
        return result

    @logger.catch
    def get_profile(self, channel: Channel) -> RawChannelInfo:
        """Fetch channel-level metadata for ``channel`` without resolving its videos.

        Raises:
            yt_dlp.utils.DownloadError: if metadata extraction fails.
        """
        ydl_opts = {
            "quiet": True,
            "verbose": False,
            "retries": 5}
        ydl = yt_dlp.YoutubeDL(ydl_opts)

        meta = ydl.extract_info(channel.url, process=False)
        # The per-video entries are not part of the profile; tolerate their absence.
        meta.pop('entries', None)

        return RawChannelInfo(
            scraper=self.__version__,
            platform=channel.platform,
            channel=channel.id,
            raw_data=json.dumps(meta),
            date_archived=datetime.now(timezone.utc))

View File

@@ -1,7 +1,5 @@
from .base import ETLController
from .twitter import TwitterTransformer
from .bitchute import BitchuteTransformer
from .telegram_telethon import TelegramTelethonTransformer
from .rumble import RumbleTransformer
from .gettr import GettrTransformer
from .vkontakte import VkontakteTransformer
from .gettr import GettrTransformer

View File

@@ -1,137 +0,0 @@
import json
from loguru import logger
from typing import Generator, Union, Callable
import dateutil.parser
from datetime import datetime, timezone
from cisticola.transformer.base import Transformer
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
class TwitterTransformer(Transformer):
    """Transformer that turns raw ``TwitterScraper`` output into posts, channels and media."""

    __version__ = "TwitterTransformer 0.0.1"

    def can_handle(self, data: ScraperResult) -> bool:
        """Return True if ``data`` was produced by any version of ``TwitterScraper``."""
        return data.scraper.split(' ')[0] == "TwitterScraper"

    def process_media(self, tweet, post_id, data):
        """Yield an ``Image`` or ``Video`` for each archived media item of ``tweet``.

        Args:
            tweet: tweet dict as serialised by snscrape.
            post_id: ID of the post the media rows are attached to.
            data: the ``ScraperResult`` whose ``archived_urls`` maps original
                media URLs to their archived locations.

        Media whose URL cannot be determined, or which is absent from
        ``data.archived_urls``, is logged and skipped.
        """
        for media in tweet.get('media') or []:
            media_type = media["_type"]
            orig = None

            if media_type == "snscrape.modules.twitter.Photo":
                orig = media["fullUrl"]
            elif media_type == "snscrape.modules.twitter.Gif":
                variants = media["variants"]
                orig = variants[0]["url"] if variants else None
            elif media_type == "snscrape.modules.twitter.Video":
                # default=None keeps a video without any bitrate-tagged
                # variant from raising ValueError.
                best = max(
                    (v for v in media["variants"] if v["bitrate"]),
                    key=lambda v: v["bitrate"],
                    default=None)
                if best is not None:
                    orig = best["url"]

            if orig is None:
                logger.warning(f"No media URL found for {media}")
            elif orig not in data.archived_urls:
                logger.info("Media discovered but not archived")
            else:
                new = data.archived_urls[orig]
                if media_type == "snscrape.modules.twitter.Photo":
                    yield Image(url=new, post=post_id, raw_id=data.id, original_url=orig)
                else:
                    yield Video(url=new, post=post_id, raw_id=data.id, original_url=orig)

    def transform_info(self, data: RawChannelInfo, insert: Callable, session, channel=None) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``ChannelInfo`` from raw Twitter profile data and pass it to ``insert``.

        Nothing is yielded or returned; the result is handed to ``insert`` only.
        """
        raw = json.loads(data.raw_data)

        transformed = ChannelInfo(
            raw_channel_info_id=data.id,
            channel=data.channel,
            platform_id=raw['id'],
            platform=data.platform,
            scraper=data.scraper,
            transformer=self.__version__,
            screenname=raw['username'],
            name=raw['displayname'],
            description=raw['rawDescription'],
            description_url=raw['linkUrl'],
            description_location=raw['location'],
            followers=raw['followersCount'],
            following=raw['friendsCount'],
            verified=raw['verified'],
            date_created=dateutil.parser.parse(raw['created']),
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc)
        )

        insert(transformed)

    def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``Post`` from a raw tweet and pass it to ``insert_post``.

        For a retweeted and/or quoted tweet, the original author's channel,
        the original post and its archived media are stored via ``insert``,
        and the new post's ``forwarded_from``/``reply_to`` point at them.
        When both are present the quoted tweet is processed last and its
        values win. Nothing is yielded or returned.
        """
        raw = json.loads(data.raw_data)

        transformed = Post(
            raw_id=data.id,
            platform_id=raw['id'],
            scraper=data.scraper,
            transformer=self.__version__,
            platform=data.platform,
            channel=data.channel,
            date=dateutil.parser.parse(raw['date']),
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc),
            url=raw['url'],
            content=raw['content'],
            author_id=raw['user']['id'],
            author_username=raw['user']['username'])

        def subtweet(tweet):
            """Store the embedded tweet's author, post and media; link ``transformed`` to them."""
            channel = Channel(
                name=tweet['user']['displayname'],
                platform_id=tweet['user']['id'],
                platform=data.platform,
                url=tweet['user']['url'],
                screenname=tweet['user']['username'],
                category='forwarded',
                source=self.__version__
            )
            channel = insert(channel)

            original = Post(
                raw_id=data.id,
                platform_id=tweet['id'],
                scraper=data.scraper,
                transformer=self.__version__,
                platform=data.platform,
                channel=channel.id,
                date=dateutil.parser.parse(tweet['date']),
                date_archived=data.date_archived,
                date_transformed=datetime.now(timezone.utc),
                url=tweet['url'],
                content=tweet['content'],
                author_id=tweet['user']['id'],
                author_username=tweet['user']['username']
            )
            original = insert(original)

            transformed.forwarded_from = channel.id
            transformed.reply_to = original.id

            for m in self.process_media(tweet, original.id, data):
                insert(m)

        if raw['retweetedTweet'] is not None:
            subtweet(raw['retweetedTweet'])

        if raw['quotedTweet'] is not None:
            subtweet(raw['quotedTweet'])

        insert_post(transformed)

View File

@@ -1,74 +0,0 @@
import json
from loguru import logger
from typing import Generator, Union, Callable
import dateutil.parser
from datetime import datetime, timezone
from sqlalchemy import func
from cisticola.transformer.base import Transformer
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
class VkontakteTransformer(Transformer):
    """Transformer that turns raw ``VkontakteScraper`` output into posts and channel info."""

    __version__ = "VkontakteTransformer 0.0.1"

    def can_handle(self, data: ScraperResult) -> bool:
        """Return True if ``data`` was produced by any version of ``VkontakteScraper``."""
        scraper_name = data.scraper.split(' ')[0]
        return scraper_name == "VkontakteScraper"

    def transform_info(self, data: RawChannelInfo, insert: Callable, session, channel=None) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``ChannelInfo`` from raw VK profile data and pass it to ``insert``.

        The username doubles as the platform ID, ``following`` is fixed at
        ``-1``, and location and creation date are left unset. Nothing is
        yielded or returned.
        """
        profile = json.loads(data.raw_data)

        follower_count = None
        if profile['followers']:
            follower_count = int(profile['followers'])

        info = ChannelInfo(
            raw_channel_info_id=data.id,
            channel=data.channel,
            platform_id=profile['username'],
            platform=data.platform,
            scraper=data.scraper,
            transformer=self.__version__,
            screenname=profile['username'],
            name=profile['name'],
            description=profile.get('description'),
            description_url=profile.get('websites'),
            description_location=None,
            followers=follower_count,
            following=-1,
            verified=profile['verified'],
            date_archived=data.date_archived,
            date_created=None,
            date_transformed=datetime.now(timezone.utc)
        )

        insert(info)

    def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``Post`` from a raw VK post and pass it to ``insert_post``.

        Empty content becomes ``''`` and falsy outlinks are dropped. Media
        attached to the post is not transformed. Nothing is yielded or
        returned.
        """
        payload = json.loads(data.raw_data)

        if payload['content']:
            text = payload['content']
        else:
            text = ''

        if payload['outlinks']:
            links = [link for link in payload["outlinks"] if link]
        else:
            links = []

        post = Post(
            raw_id=data.id,
            platform_id=data.platform_id,
            scraper=data.scraper,
            transformer=self.__version__,
            platform=data.platform,
            channel=data.channel,
            date=data.date,
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc),
            url=payload['url'],
            content=text,
            author_id=None,
            author_username=None,
            outlinks=links,
        )

        insert_post(post)

View File

@@ -5,7 +5,7 @@ HIDE_COOKIESTRING=" :exclude-members: cookiestring"
REPLACE_MAXDEPTH="s/ :maxdepth: 4/ :maxdepth: 1/g"
# Remove display of ``cookiestring`` class variable, otherwise Sphinx generates docs containing the value of your cookiestring, based on your ``YOUTUBE_COOKIESTRING`` environment variable
for file in cisticola.scraper.base.rst cisticola.scraper.rumble.rst cisticola.scraper.youtube.rst
for file in cisticola.scraper.base.rst cisticola.scraper.rumble.rst
do
echo "$HIDE_COOKIESTRING" >> $RST_SOURCE_DIR/$file;
done

View File

@@ -1,7 +0,0 @@
cisticola.scraper.gab module
============================

.. automodule:: cisticola.scraper.gab
   :members:
   :undoc-members:
   :show-inheritance:

View File

@@ -1,7 +0,0 @@
cisticola.scraper.instagram module
==================================

.. automodule:: cisticola.scraper.instagram
   :members:
   :undoc-members:
   :show-inheritance:

View File

@@ -1,7 +0,0 @@
cisticola.scraper.odysee module
===============================

.. automodule:: cisticola.scraper.odysee
   :members:
   :undoc-members:
   :show-inheritance:

View File

@@ -14,12 +14,6 @@ Submodules
cisticola.scraper.base
cisticola.scraper.bitchute
cisticola.scraper.gab
cisticola.scraper.gettr
cisticola.scraper.instagram
cisticola.scraper.odysee
cisticola.scraper.rumble
cisticola.scraper.telegram_telethon
cisticola.scraper.twitter
cisticola.scraper.vkontakte
cisticola.scraper.youtube

View File

@@ -1,7 +0,0 @@
cisticola.scraper.twitter module
================================

.. automodule:: cisticola.scraper.twitter
   :members:
   :undoc-members:
   :show-inheritance:

View File

@@ -1,7 +0,0 @@
cisticola.scraper.vkontakte module
==================================

.. automodule:: cisticola.scraper.vkontakte
   :members:
   :undoc-members:
   :show-inheritance:

View File

@@ -1,8 +0,0 @@
cisticola.scraper.youtube module
================================

.. automodule:: cisticola.scraper.youtube
   :members:
   :undoc-members:
   :show-inheritance:
   :exclude-members: cookiestring

View File

@@ -17,5 +17,3 @@ Submodules
cisticola.transformer.gettr
cisticola.transformer.rumble
cisticola.transformer.telegram_telethon
cisticola.transformer.twitter
cisticola.transformer.vkontakte

View File

@@ -1,7 +0,0 @@
cisticola.transformer.twitter module
====================================

.. automodule:: cisticola.transformer.twitter
   :members:
   :undoc-members:
   :show-inheritance:

View File

@@ -1,7 +0,0 @@
cisticola.transformer.vkontakte module
======================================

.. automodule:: cisticola.transformer.vkontakte
   :members:
   :undoc-members:
   :show-inheritance:

View File

@@ -34,7 +34,7 @@ If you do not already have a Telegram application, you can create one by followi
To initialize a Telegram session, run the following script from the package's root directory using the command-line:
.. bash::
.. code-block:: console
python telethon_session_init.py
@@ -43,13 +43,13 @@ Documentation
The *cisticola* application uses Sphinx_ to generate and display its documentation. To build the documentation in the HTML format, run the following command from the ``docs/`` directory:
.. code-block::
.. code-block:: console
pipenv run make html
For developers, if changes are made to the package structure or additional modules are created, you can update the Sphinx source ``*.rst`` files by running the following command from the ``docs/`` directory:
.. code-block::
.. code-block:: console
pipenv run make apidoc