mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-07 19:08:35 +03:00
removed broken scrapers and added basic README
This commit is contained in:
@@ -1,4 +1,10 @@
|
||||
Cisticola
|
||||
==========
|
||||
|
||||
The *cisticola* application enables users to easily collect, process, and analyze large-scale data from several social media platforms.
|
||||
|
||||
It scrapes raw data by coordinating with a set of platform-specific scrapers, archives media attachments, and stores the data in a SQL database.
|
||||
|
||||
For more information about the structure of Cisticola, as well as installation and deployment instructions, see the documentation.
|
||||
|
||||

|
||||
|
||||
@@ -1,12 +1,6 @@
|
||||
from cisticola.utils import make_request
|
||||
from .base import Scraper, ScraperController, ChannelDoesNotExistError
|
||||
from .bitchute import BitchuteScraper
|
||||
from .gab import GabScraper
|
||||
from .gettr import GettrScraper
|
||||
from .instagram import InstagramScraper
|
||||
from .odysee import OdyseeScraper
|
||||
from .rumble import RumbleScraper
|
||||
from .telegram_telethon import TelegramTelethonScraper
|
||||
from .twitter import TwitterScraper
|
||||
from .vkontakte import VkontakteScraper
|
||||
from .youtube import YoutubeScraper
|
||||
from .telegram_telethon import TelegramTelethonScraper
|
||||
@@ -1,108 +0,0 @@
|
||||
from datetime import datetime, timezone, date
|
||||
import json
|
||||
from typing import Generator
|
||||
import os
|
||||
from loguru import logger
|
||||
|
||||
from gabber.client import Client, GAB_API_BASE_URL
|
||||
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
class GabScraper(Scraper):
    """An implementation of a Scraper for Gab, using the gabber library.

    A new gabber ``Client`` is created for every ``get_posts`` /
    ``get_profile`` call, with credentials read from the ``GAB_USER`` and
    ``GAB_PASS`` environment variables.
    """

    __version__ = "GabScraper 0.0.0"

    def get_username_from_url(self, url):
        """Return whatever follows ``https://gab.com/`` in ``url``.

        When the prefix is not present the whole ``url`` is returned, so the
        result is never ``None``.
        """
        return url.split('https://gab.com/')[-1]

    def get_group_id_from_url(self, url):
        """Return the last ``/``-separated segment of ``url`` as an ``int``.

        Raises:
            ValueError: if that segment is not an integer literal.
        """
        return int(url.split('/')[-1])

    def _make_client(self):
        """Create a gabber client from the ``GAB_USER``/``GAB_PASS`` variables."""
        return Client(
            username=os.environ['GAB_USER'],
            password=os.environ['GAB_PASS'],
            threads=25)

    @staticmethod
    def _is_group_url(url):
        """Return True when the second-to-last ``/`` segment of ``url`` is ``groups``."""
        return url.split('/')[-2] == 'groups'

    @staticmethod
    def _fetch_account(client, username):
        """Return the decoded ``account_by_username`` response for ``username``."""
        return client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json()

    @staticmethod
    def _parse_created_at(raw_date):
        """Parse a post's ``created_at`` string and tag the result as UTC."""
        return datetime.fromisoformat(raw_date.replace("Z", "+00:00")).replace(tzinfo=timezone.utc)

    @staticmethod
    def _add_attachment_urls(archived_urls, attachments):
        """Register every attachment URL in ``archived_urls`` with value ``None``.

        Videos are keyed by ``source_mp4``, everything else by ``url``.
        ``attachments`` may be ``None`` (key missing from the post), in which
        case nothing is added.
        """
        for attachment in attachments or []:
            url_key = 'source_mp4' if attachment.get('type') == 'video' else 'url'
            archived_urls[attachment[url_key]] = None

    @logger.catch
    def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
        """Yield one ``ScraperResult`` per post of a Gab account or group.

        Args:
            channel: channel whose ``url`` points at a Gab account, or at a
                group (``.../groups/<id>``).
            since: when given, iteration stops at the first post whose
                creation date is not later than ``since.date``.
        """
        client = self._make_client()

        if self._is_group_url(channel.url):
            group_id = self.get_group_id_from_url(url=channel.url)
            scraper = client.pull_group_posts(
                id=group_id,
                depth=float('inf'))
        else:
            username = self.get_username_from_url(channel.url)
            user_id = int(self._fetch_account(client, username)['id'])
            scraper = client.pull_statuses(
                id=user_id,
                created_after=date.min,
                replies=False)

        for post in scraper:
            post_date = self._parse_created_at(post['created_at'])

            # Both sides are forced to UTC so naive and aware values compare.
            if since is not None and post_date <= since.date.replace(tzinfo=timezone.utc):
                break

            archived_urls = {}
            self._add_attachment_urls(archived_urls, post.get('media_attachments'))

            # A reblog carries the attachments of the post it repeats.
            if post.get('reblog') is not None:
                self._add_attachment_urls(archived_urls, post['reblog'].get('media_attachments'))

            yield ScraperResult(
                scraper=self.__version__,
                platform="Gab",
                channel=channel.id,
                platform_id=post['id'],
                date=post_date,
                date_archived=datetime.now(timezone.utc),
                raw_data=json.dumps(post),
                archived_urls=archived_urls,
                media_archived=None)

    def can_handle(self, channel: Channel) -> bool:
        """Return True for channels on the ``Gab`` platform, False otherwise."""
        if channel.platform != "Gab":
            return False
        return self.get_username_from_url(channel.url) is not None

    @logger.catch
    def get_profile(self, channel: Channel) -> RawChannelInfo:
        """Fetch the raw profile of a Gab group or account.

        Returns:
            A ``RawChannelInfo`` whose ``raw_data`` is the JSON-encoded group
            (``pull_group``) or account (``account_by_username``) record.
        """
        client = self._make_client()

        if self._is_group_url(channel.url):
            group_id = self.get_group_id_from_url(url=channel.url)
            profile = client.pull_group(id=group_id)
        else:
            username = self.get_username_from_url(channel.url)
            profile = self._fetch_account(client, username)

        return RawChannelInfo(scraper=self.__version__,
            platform=channel.platform,
            channel=channel.id,
            raw_data=json.dumps(profile),
            date_archived=datetime.now(timezone.utc))
|
||||
@@ -1,126 +0,0 @@
|
||||
from typing import Generator, List
|
||||
from datetime import datetime, timezone
|
||||
import os
|
||||
import json
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
from loguru import logger
|
||||
import instaloader
|
||||
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
# Root URL of the Instagram website; profile and post URLs are built on it.
BASE_URL = 'https://www.instagram.com/'

# File extension -> MIME content type.
# NOTE(review): not referenced anywhere in the visible code - confirm it is still needed.
CONTENT_TYPES = {
    'jpg' : 'image/jpeg',
    'mp4' : 'video/mp4'}
|
||||
|
||||
class InstagramScraper(Scraper):
    """An implementation of a Scraper for Instagram, using the instaloader library.

    Every ``get_posts`` / ``get_profile`` call logs in with the
    ``INSTAGRAM_USERNAME`` and ``INSTAGRAM_PASSWORD`` environment variables.
    """

    __version__ = "InstagramScraper 0.0.0"

    def get_username_from_url(self, url):
        """Return the part of ``url`` after ``BASE_URL`` with slashes stripped.

        Raises:
            IndexError: if ``url`` does not contain ``BASE_URL``.
        """
        return url.split(BASE_URL)[1].strip('/')

    def _load_profile(self, username):
        """Log in and return the ``instaloader.Profile`` for ``username``."""
        loader = instaloader.Instaloader(
            quiet=True,
            download_comments=False,
            save_metadata=False)

        loader.login(
            user=os.environ['INSTAGRAM_USERNAME'],
            passwd=os.environ['INSTAGRAM_PASSWORD'])

        return instaloader.Profile.from_username(
            context=loader.context,
            username=username)

    @logger.catch
    def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
        """Yield a ``ScraperResult`` for every post, followed by its comments.

        Args:
            channel: channel whose ``url`` starts with ``BASE_URL``.
            since: when given, iteration stops at the first post whose date
                is not later than ``since.date``.
        """
        username = self.get_username_from_url(channel.url)
        profile = self._load_profile(username)

        for post in profile.get_posts():
            # Force both sides to UTC so a naive post date can be compared
            # with an aware ``since.date`` (and vice versa) without raising.
            if since is not None and (
                    post.date_utc.replace(tzinfo=timezone.utc)
                    <= since.date.replace(tzinfo=timezone.utc)):
                break

            post_url = f'{BASE_URL}p/{post.shortcode}/'
            archived_urls = get_archived_urls_from_post(post=post)

            yield ScraperResult(
                scraper=self.__version__,
                platform="Instagram",
                channel=channel.id,
                platform_id=post.mediaid,
                date=post.date_utc,
                date_archived=datetime.now(timezone.utc),
                raw_data=json.dumps(post._asdict(), default=str),
                archived_urls=archived_urls,
                media_archived=None)

            for comment in post.get_comments():
                comment_dict = comment._asdict()
                # Tag the record so it can be told apart from a post later.
                comment_dict['post_url'] = post_url
                comment_dict['is_comment'] = True

                # NOTE(review): comments reuse the parent post's media id as
                # platform_id - confirm this is intended before changing it.
                yield ScraperResult(
                    scraper=self.__version__,
                    platform="Instagram",
                    channel=channel.id,
                    platform_id=post.mediaid,
                    date=comment.created_at_utc,
                    date_archived=datetime.now(timezone.utc),
                    raw_data=json.dumps(comment_dict, default=str),
                    archived_urls={},
                    # Comments carry no media, so they count as archived already.
                    media_archived=datetime.now(timezone.utc))

    def can_handle(self, channel):
        """Return True for Instagram channels whose URL contains ``BASE_URL``."""
        if channel.platform != "Instagram":
            return False
        try:
            return self.get_username_from_url(channel.url) is not None
        except IndexError:
            # URL does not contain BASE_URL, so no username can be extracted.
            return False

    @logger.catch
    def get_profile(self, channel: Channel) -> RawChannelInfo:
        """Fetch the raw profile, extended with follower and followee counts.

        Returns:
            A ``RawChannelInfo`` whose ``raw_data`` is the JSON-encoded
            profile dictionary.
        """
        username = self.get_username_from_url(channel.url)
        user_profile = self._load_profile(username)

        profile = user_profile._asdict()
        profile['followers'] = user_profile.followers
        profile['followees'] = user_profile.followees

        return RawChannelInfo(scraper=self.__version__,
            platform=channel.platform,
            channel=channel.id,
            raw_data=json.dumps(profile),
            date_archived=datetime.now(timezone.utc))
|
||||
|
||||
def get_archived_urls_from_post(post: "instaloader.Post") -> dict:
    """Map every media URL of ``post`` to ``None`` (meaning "not archived yet").

    The URLs are read from the post's raw ``_node`` dictionary:

    * ``GraphImage``   -> its ``display_url``
    * ``GraphVideo``   -> its ``video_url``
    * ``GraphSidecar`` -> the ``display_url`` of every child node

    Returns:
        A dict keyed by media URL, in the order listed above, with every
        value set to ``None``.

    Raises:
        NotImplementedError: for any other ``__typename``.
    """
    node = post._node
    typename = node['__typename']

    if typename == 'GraphImage':
        urls = [node['display_url']]
    elif typename == 'GraphVideo':
        urls = [node['video_url']]
    elif typename == 'GraphSidecar':
        urls = [edge['node']['display_url'] for edge in node['edge_sidecar_to_children']['edges']]
    else:
        raise NotImplementedError(f'post of type {typename} is currently not supported.')

    return dict.fromkeys(urls)
|
||||
@@ -1,110 +0,0 @@
|
||||
from datetime import datetime, timezone
|
||||
import json
|
||||
from typing import Generator
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
from loguru import logger
|
||||
|
||||
from polyphemus.base import OdyseeChannelScraper, process_raw_comment_info
|
||||
from polyphemus.api import get_auth_token, get_all_comments
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
class OdyseeScraper(Scraper):
    """An implementation of a Scraper for Odysee, using the polyphemus library.

    An auth token is requested once, when the scraper is constructed, and
    reused for every channel lookup.
    """

    __version__ = "OdyseeScraper 0.0.0"

    def __init__(self):
        super().__init__()
        self.auth_token = get_auth_token()

    def get_username_from_url(self, url):
        """Return the channel name embedded in an Odysee URL.

        Takes what follows ``odysee.com/``, strips ``@`` from both ends and
        drops everything from the first ``:`` on.
        """
        return url.split('odysee.com/')[-1].strip('@').split(':')[0]

    @logger.catch
    def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
        """Yield a ``ScraperResult`` for every video, followed by its comments.

        Args:
            channel: channel whose ``url`` contains the Odysee channel name.
            since: when given, iteration stops at the first video whose
                creation date is not later than ``since.date``.
        """
        username = self.get_username_from_url(channel.url)
        scraper = OdyseeChannelScraper(channel_name=username, auth_token=self.auth_token)

        for video in scraper.get_all_videos():
            video_date = video.created.replace(tzinfo=timezone.utc)

            # Both sides are forced to UTC so naive and aware values compare.
            if since is not None and video_date <= since.date.replace(tzinfo=timezone.utc):
                break

            url = video.streaming_url
            archived_urls = {} if url is None else {url: None}

            # Comments are requested before the video is yielded, but only
            # processed lazily while they are being yielded below.
            raw_comment_info_list = get_all_comments(video_id=video.claim_id)
            all_comments = (process_raw_comment_info(raw_comment_info) for raw_comment_info in raw_comment_info_list)

            yield ScraperResult(
                scraper=self.__version__,
                platform="Odysee",
                channel=channel.id,
                platform_id=video.claim_id,
                date=video_date,
                date_archived=datetime.now(timezone.utc),
                raw_data=json.dumps(video.__dict__, default=str),
                archived_urls=archived_urls,
                media_archived=None)

            for comment in all_comments:
                yield ScraperResult(
                    scraper=self.__version__,
                    platform="Odysee",
                    channel=channel.id,
                    platform_id=comment.claim_id,
                    date=comment.created.replace(tzinfo=timezone.utc),
                    # Timezone-aware, like every other timestamp in this class.
                    date_archived=datetime.now(timezone.utc),
                    raw_data=json.dumps(comment.__dict__, default=str),
                    archived_urls={},
                    # Comments carry no media, so they count as archived already.
                    media_archived=datetime.now(timezone.utc))

    @logger.catch
    def archive_files(self, result: ScraperResult) -> ScraperResult:
        """Archive every not-yet-archived URL of ``result``.

        A HEAD request decides how each URL is fetched: a response served as
        ``text/html; charset=utf-8`` goes through ``m3u8_url_to_blob``,
        anything else (including a response without a ``Content-Type``
        header) through ``url_to_blob``.

        Returns:
            ``result`` with ``archived_urls`` filled in and ``media_archived``
            set to the current UTC time.
        """
        for url in result.archived_urls:
            if result.archived_urls[url] is not None:
                continue

            r = requests.head(url)
            # .get(): a missing header must not abort archiving with KeyError.
            if r.headers.get('Content-Type') == 'text/html; charset=utf-8':
                media_blob, content_type, key = self.m3u8_url_to_blob(url)
            else:
                media_blob, content_type, key = self.url_to_blob(url)

            result.archived_urls[url] = self.archive_blob(media_blob, content_type, key)

        result.media_archived = datetime.now(timezone.utc)
        return result

    def can_handle(self, channel):
        """Return True for channels on the ``Odysee`` platform, False otherwise."""
        if channel.platform != "Odysee":
            return False
        return self.get_username_from_url(channel.url) is not None

    def url_to_key(self, url: str, content_type: str) -> str:
        """Build ``<second-to-last path segment>.<content subtype>`` for ``url``."""
        key = urlparse(url).path.split('/')[-2]
        ext = content_type.split('/')[-1]
        return f'{key}.{ext}'

    @logger.catch
    def get_profile(self, channel: Channel) -> RawChannelInfo:
        """Fetch the raw channel entity and wrap it in a ``RawChannelInfo``."""
        username = self.get_username_from_url(channel.url)
        scraper = OdyseeChannelScraper(channel_name=username, auth_token=self.auth_token)
        profile = scraper.get_entity().__dict__

        return RawChannelInfo(scraper=self.__version__,
            platform=channel.platform,
            channel=channel.id,
            raw_data=json.dumps(profile, default=str),
            date_archived=datetime.now(timezone.utc))
|
||||
@@ -1,108 +0,0 @@
|
||||
from datetime import datetime, timezone
|
||||
from typing import Generator
|
||||
from urllib.parse import urlparse, parse_qs
|
||||
from snscrape.modules.twitter import TwitterProfileScraper, TwitterUserScraper, Video, Gif, Photo
|
||||
from loguru import logger
|
||||
import json
|
||||
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper, ChannelDoesNotExistError
|
||||
|
||||
class TwitterScraper(Scraper):
    """An implementation of a Scraper for Twitter, using the snscrape library."""

    __version__ = "TwitterScraper 0.0.0"

    @staticmethod
    def _media_url(media):
        """Return the URL to archive for one media object, or ``None``.

        Videos use the variant with the highest bitrate, GIFs their first
        variant and photos their ``fullUrl``. ``None`` is returned (after a
        warning) for unknown media types and for videos that have no variant
        with a bitrate.
        """
        url = None
        if isinstance(media, Video):
            # default=None: an empty candidate list must not raise ValueError
            # and abort the whole scrape.
            variant = max(
                (v for v in media.variants if v.bitrate),
                key=lambda v: v.bitrate,
                default=None)
            if variant is not None:
                url = variant.url
        elif isinstance(media, Gif):
            url = media.variants[0].url
        elif isinstance(media, Photo):
            url = media.fullUrl

        if url is None:
            logger.warning(f"Could not get media URL of {media}")
        return url

    @logger.catch
    def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
        """Yield one ``ScraperResult`` per tweet of the channel's profile.

        The profile is identified by ``channel.platform_id`` (as an int) when
        set, otherwise by ``channel.screenname``.

        Args:
            channel: the Twitter channel to scrape.
            since: when given, iteration stops at the first tweet that is not
                newer than ``since.date``. The very first tweet is exempt: it
                is skipped instead, because it may be an old pinned tweet.
        """
        if channel.platform_id:
            identifier = int(channel.platform_id)
        else:
            identifier = channel.screenname

        scraper = TwitterProfileScraper(identifier)

        for index, tweet in enumerate(scraper.get_items()):
            if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
                # with TwitterProfileScraper, the first tweet could be an old pinned tweet
                if index == 0:
                    continue
                break

            # Media of the tweet itself plus that of a retweeted/quoted tweet.
            media_list = []
            if tweet.media:
                media_list += tweet.media
            if tweet.retweetedTweet and hasattr(tweet.retweetedTweet, 'media') and tweet.retweetedTweet.media:
                media_list += tweet.retweetedTweet.media
            if tweet.quotedTweet and hasattr(tweet.quotedTweet, 'media') and tweet.quotedTweet.media:
                media_list += tweet.quotedTweet.media

            archived_urls = {}
            for media in media_list:
                url = self._media_url(media)
                if url is not None and url not in archived_urls:
                    archived_urls[url] = None

            yield ScraperResult(
                scraper=self.__version__,
                platform="Twitter",
                channel=channel.id,
                platform_id=tweet.id,
                date=tweet.date,
                date_archived=datetime.now(timezone.utc),
                raw_data=tweet.json(),
                archived_urls=archived_urls,
                media_archived=None)

    def can_handle(self, channel):
        """Return True for Twitter channels that have a platform id or screen name."""
        return channel.platform == "Twitter" and bool(channel.platform_id or channel.screenname)

    def url_to_key(self, url: str, content_type: str) -> str:
        """Return the last path segment of ``url`` as the storage key.

        ``.jpg`` or ``.png`` is appended when the URL's ``format`` query
        parameter names that format; any other URL (e.g. one whose path
        already ends in ``.mp4``) gets no extra extension.
        """
        parsed_url = urlparse(url)
        formats = parse_qs(parsed_url.query).get('format', [])

        # TODO might require additional statements for other media formats
        if 'jpg' in formats:
            ext = '.jpg'
        elif 'png' in formats:
            ext = '.png'
        else:
            ext = ''

        return parsed_url.path.split('/')[-1] + ext

    @logger.catch
    def get_profile(self, channel: Channel) -> RawChannelInfo:
        """Fetch the raw user entity for ``channel.screenname``.

        Raises:
            ChannelDoesNotExistError: if snscrape returns no entity.
        """
        scraper = TwitterUserScraper(channel.screenname)
        entity = scraper._get_entity()

        if entity is None:
            raise ChannelDoesNotExistError(channel.url)

        return RawChannelInfo(scraper=self.__version__,
            platform=channel.platform,
            channel=channel.id,
            raw_data=json.dumps(entity.__dict__, default=str),
            date_archived=datetime.now(timezone.utc))
|
||||
@@ -1,107 +0,0 @@
|
||||
from datetime import datetime, timezone
|
||||
from typing import Generator
|
||||
from urllib.parse import urlparse
|
||||
import json
|
||||
import re
|
||||
|
||||
from snscrape.modules.vkontakte import VKontakteUserScraper
|
||||
from loguru import logger
|
||||
from yt_dlp.extractor.vk import VKIE
|
||||
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
class VkontakteScraper(Scraper):
    """An implementation of a Scraper for Vkontakte, using the snscrape library."""

    __version__ = "VkontakteScraper 0.0.1"

    def get_username_from_url(self, url):
        """Return the part of ``url`` that follows ``https://vk.com/``.

        Raises:
            IndexError: if ``url`` does not contain that prefix.
        """
        return url.split('https://vk.com/')[1]

    @staticmethod
    def _post_date(post):
        """Return the post's calendar date as a UTC datetime at midnight.

        Going through the ordinal discards any time-of-day information.
        """
        return datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc)

    @logger.catch
    def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
        """Yield one ``ScraperResult`` per post of the VK user.

        Args:
            channel: channel whose ``url`` starts with ``https://vk.com/``.
            since: when given, iteration stops at the first post that is not
                newer than ``since.date``. The very first post is exempt: it
                is skipped instead, because it may be an old pinned post.
        """
        username = self.get_username_from_url(channel.url)
        scraper = VKontakteUserScraper(username)

        for index, post in enumerate(scraper.get_items()):
            post_date = self._post_date(post)

            if since is not None and post_date <= since.date.replace(tzinfo=timezone.utc):
                # with VKontakteUserScraper, the first post could be an old pinned post
                if index == 0:
                    continue
                break

            archived_urls = {}

            if post.photos:
                for photo in post.photos:
                    # Archive only the variant with the largest pixel area.
                    variant = max(photo.variants, key=lambda v: v.width * v.height)
                    url = variant.url
                    if url is not None:
                        archived_urls[url] = None

            if post.video:
                archived_urls[post.video.url] = None

            yield ScraperResult(
                scraper=self.__version__,
                platform="VK",
                channel=channel.id,
                platform_id=post.url.split('/')[-1],
                date=post_date,
                date_archived=datetime.now(timezone.utc),
                raw_data=post.json(),
                archived_urls=archived_urls,
                media_archived=None)

    @logger.catch
    def archive_files(self, result: ScraperResult) -> ScraperResult:
        """Archive every not-yet-archived URL of ``result``.

        URLs matching yt_dlp's VK video pattern are fetched with
        ``ytdlp_url_to_blob``, all others with ``url_to_blob``.

        Returns:
            ``result`` with ``archived_urls`` filled in and ``media_archived``
            set to the current UTC time.
        """
        for url in result.archived_urls:
            if result.archived_urls[url] is not None:
                continue

            # Uses regex from yt_dlp to verify VK video URL
            if re.match(VKIE._VALID_URL, url):
                media_blob, content_type, key = self.ytdlp_url_to_blob(url)
            else:
                media_blob, content_type, key = self.url_to_blob(url)

            result.archived_urls[url] = self.archive_blob(media_blob, content_type, key)

        result.media_archived = datetime.now(timezone.utc)
        return result

    def can_handle(self, channel):
        """Return True for channels on the ``VK`` platform, False otherwise."""
        return channel.platform == "VK"

    def url_to_key(self, url: str, content_type: str) -> str:
        """Derive the storage key from the path of ``url``.

        ``.jpg`` paths use their last two segments joined by ``_``; any other
        path uses its last segment with ``.mp4`` appended.
        """
        path = urlparse(url).path
        if path.endswith('.jpg'):
            return '_'.join(path.split('/')[-2:])
        return path.split('/')[-1] + '.mp4'

    @logger.catch
    def get_profile(self, channel: Channel) -> RawChannelInfo:
        """Fetch the raw user entity and wrap it in a ``RawChannelInfo``."""
        username = self.get_username_from_url(channel.url)
        scraper = VKontakteUserScraper(username)

        profile = scraper._get_entity().__dict__

        return RawChannelInfo(scraper=self.__version__,
            platform=channel.platform,
            channel=channel.id,
            # default=str only kicks in for values json cannot encode itself.
            raw_data=json.dumps(profile, default=str),
            date_archived=datetime.now(timezone.utc))
|
||||
@@ -1,154 +0,0 @@
|
||||
from datetime import datetime, timezone
|
||||
import json
|
||||
from typing import Generator
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
import os
|
||||
|
||||
import yt_dlp
|
||||
from loguru import logger
|
||||
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper import Scraper
|
||||
|
||||
class YoutubeScraper(Scraper):
    """An implementation of a Scraper for Youtube, using yt-dlp.

    Requests that need authentication use a cookie file which is written, for
    the duration of each call, from the ``YOUTUBE_COOKIESTRING`` environment
    variable.
    """

    __version__ = "YoutubeScraper 0.0.1"

    # Read when the class body executes, so YOUTUBE_COOKIESTRING must be set
    # before this module is imported. Literal "\n" / "\t" sequences in the
    # variable are turned into real newlines / tabs.
    cookiestring = os.environ["YOUTUBE_COOKIESTRING"].replace(r'\n', '\n').replace(r'\t', '\t')
    cookiefilename = 'cookiefile.txt'

    def _write_cookiefile(self, directory):
        """Write ``cookiestring`` to ``cookiefilename`` in ``directory``; return its path."""
        cookiefile = Path(directory)/self.cookiefilename
        with open(cookiefile, 'w') as f:
            f.write(self.cookiestring)
        return cookiefile

    @staticmethod
    def _download_options(temp_dir, cookiefile):
        """Return the yt-dlp options shared by listing and downloading."""
        return {
            "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
            "merge_output_format": "mp4",
            "outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
            "quiet": True,
            "verbose": False,
            "retries": 5,
            "cookiefile": cookiefile}

    @logger.catch
    def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
        """Yield one ``ScraperResult`` per video of ``channel.url``.

        Args:
            channel: channel whose ``url`` yt-dlp can list.
            since: when given, only videos whose upload date (midnight UTC of
                the upload day) is later than ``since.date`` are yielded.

        Raises:
            yt_dlp.utils.DownloadError: if the channel cannot be listed.
        """
        # Everything is compared as UTC so that an aware ``since.date`` does
        # not clash with the naive dates produced by ``strptime``.
        if since is None:
            since_date = datetime.min.replace(tzinfo=timezone.utc)
            start_date = None
        else:
            since_date = since.date.replace(tzinfo=timezone.utc)
            start_date = since.date.strftime('%Y%m%d')

        with tempfile.TemporaryDirectory() as temp_dir:
            cookiefile = self._write_cookiefile(temp_dir)

            ydl_opts = self._download_options(temp_dir, cookiefile)
            ydl_opts["daterange"] = yt_dlp.utils.DateRange(start=start_date)

            ydl = yt_dlp.YoutubeDL(ydl_opts)
            meta = ydl.extract_info(
                channel.url,
                download=False)

            # Parse every upload date up front, so a malformed entry fails
            # before anything has been yielded.
            dated_videos = [
                (datetime.strptime(video['upload_date'], '%Y%m%d').replace(tzinfo=timezone.utc), video)
                for video in meta['entries']]

            for upload_date, video in dated_videos:
                if not since_date < upload_date:
                    continue

                yield ScraperResult(
                    scraper=self.__version__,
                    platform="Youtube",
                    channel=channel.id,
                    platform_id=video["id"],
                    date=upload_date,
                    date_archived=datetime.now(timezone.utc),
                    raw_data=json.dumps(video, default=str),
                    archived_urls={video['webpage_url']: None},
                    media_archived=None)

    def can_handle(self, channel):
        """Return True for Youtube channels that have a URL, False otherwise."""
        return channel.platform == "Youtube" and bool(channel.url)

    @logger.catch
    def archive_files(self, result: ScraperResult) -> ScraperResult:
        """Download and archive every not-yet-archived video of ``result``.

        Each video is downloaded into its own temporary directory, read into
        memory and handed to ``archive_blob`` as ``video/mp4``.

        Returns:
            ``result`` with ``archived_urls`` filled in and ``media_archived``
            set to the current UTC time.

        Raises:
            yt_dlp.utils.DownloadError: if a download fails.
            IndexError: if a download produced no file.
        """
        for url in result.archived_urls:
            if result.archived_urls[url] is not None:
                continue

            media_blob = None

            with tempfile.TemporaryDirectory() as temp_dir:
                cookiefile = self._write_cookiefile(temp_dir)

                ydl = yt_dlp.YoutubeDL(self._download_options(temp_dir, cookiefile))
                # download() is documented to take a list of URLs.
                ydl.download([url])

                # The cookie file lives in the same directory; ignore it.
                files = [file for file in os.listdir(temp_dir) if file != self.cookiefilename]
                if len(files) != 1:
                    logger.warning(f'{len(files)} files downloaded for video: {url}')
                key = files[0]
                with open(Path(temp_dir, key), 'rb') as f:
                    media_blob = f.read()

            if media_blob is not None:
                content_type = 'video/mp4'
                archived_url = self.archive_blob(media_blob, content_type, key)
                result.archived_urls[url] = archived_url

        result.media_archived = datetime.now(timezone.utc)
        return result

    @logger.catch
    def get_profile(self, channel: Channel) -> RawChannelInfo:
        """Fetch the channel metadata, without its list of entries.

        Raises:
            yt_dlp.utils.DownloadError: if the metadata cannot be extracted.
        """
        ydl_opts = {
            "quiet": True,
            "verbose": False,
            "retries": 5}

        ydl = yt_dlp.YoutubeDL(ydl_opts)

        meta = ydl.extract_info(
            channel.url,
            process=False)
        # The entries are dropped from the profile; tolerate their absence.
        meta.pop('entries', None)

        return RawChannelInfo(scraper=self.__version__,
            platform=channel.platform,
            channel=channel.id,
            raw_data=json.dumps(meta),
            date_archived=datetime.now(timezone.utc))
|
||||
@@ -1,7 +1,5 @@
|
||||
from .base import ETLController
|
||||
from .twitter import TwitterTransformer
|
||||
from .bitchute import BitchuteTransformer
|
||||
from .telegram_telethon import TelegramTelethonTransformer
|
||||
from .rumble import RumbleTransformer
|
||||
from .gettr import GettrTransformer
|
||||
from .vkontakte import VkontakteTransformer
|
||||
from .gettr import GettrTransformer
|
||||
@@ -1,137 +0,0 @@
|
||||
import json
|
||||
from loguru import logger
|
||||
from typing import Generator, Union, Callable
|
||||
import dateutil.parser
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from cisticola.transformer.base import Transformer
|
||||
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
|
||||
|
||||
class TwitterTransformer(Transformer):
    """Transformer for the results produced by ``TwitterScraper``."""

    __version__ = "TwitterTransformer 0.0.1"

    def can_handle(self, data: ScraperResult) -> bool:
        """Return True when ``data.scraper`` starts with the word ``TwitterScraper``."""
        return data.scraper.split(' ')[0] == "TwitterScraper"

    @staticmethod
    def _original_media_url(media):
        """Return the source URL of one serialized media dict, or ``None``.

        Mirrors the scraper: photos use ``fullUrl``, GIFs their first variant
        and videos the variant with the highest bitrate. ``None`` is returned
        for unknown types and for videos without any variant with a bitrate.
        """
        media_type = media["_type"]
        if media_type == "snscrape.modules.twitter.Photo":
            return media["fullUrl"]
        if media_type == "snscrape.modules.twitter.Gif":
            return media["variants"][0]["url"]
        if media_type == "snscrape.modules.twitter.Video":
            # default=None: an empty candidate list must not raise ValueError.
            variant = max(
                (v for v in media["variants"] if v["bitrate"]),
                key=lambda v: v["bitrate"],
                default=None)
            return None if variant is None else variant["url"]
        return None

    def process_media(self, tweet, post_id, data):
        """Yield an ``Image`` or ``Video`` for every archived media of ``tweet``.

        Args:
            tweet: decoded tweet dictionary; its ``media`` entry is read.
            post_id: id of the post the media objects are attached to.
            data: the ``ScraperResult`` whose ``archived_urls`` maps original
                URLs to archived ones.

        Media without a usable URL, or whose URL is not a key of
        ``data.archived_urls``, are logged and skipped.
        """
        if not tweet['media']:
            return

        for media in tweet['media']:
            orig = self._original_media_url(media)

            if orig is None:
                logger.warning(f"No media URL found for {media}")
            elif orig not in data.archived_urls:
                logger.info("Media discovered but not archived")
            else:
                new = data.archived_urls[orig]

                # Photos become images; GIFs and videos become videos.
                if media["_type"] == "snscrape.modules.twitter.Photo":
                    yield Image(url=new, post=post_id, raw_id=data.id, original_url=orig)
                else:
                    yield Video(url=new, post=post_id, raw_id=data.id, original_url=orig)

    def transform_info(self, data: RawChannelInfo, insert: Callable, session, channel=None) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``ChannelInfo`` from the raw profile and pass it to ``insert``.

        Despite the annotation nothing is yielded; the method returns ``None``.
        """
        raw = json.loads(data.raw_data)

        transformed = ChannelInfo(
            raw_channel_info_id=data.id,
            channel=data.channel,
            platform_id=raw['id'],
            platform=data.platform,
            scraper=data.scraper,
            transformer=self.__version__,
            screenname=raw['username'],
            name=raw['displayname'],
            description=raw['rawDescription'],
            description_url=raw['linkUrl'],
            description_location=raw['location'],
            followers=raw['followersCount'],
            following=raw['friendsCount'],
            verified=raw['verified'],
            date_created=dateutil.parser.parse(raw['created']),
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc)
        )

        insert(transformed)

    def _insert_subtweet(self, tweet, transformed, data, insert):
        """Insert the author channel, post and media of a retweeted/quoted tweet.

        ``transformed`` is linked to the inserted records through its
        ``forwarded_from`` and ``reply_to`` attributes.
        """
        channel = insert(Channel(
            name=tweet['user']['displayname'],
            platform_id=tweet['user']['id'],
            platform=data.platform,
            url=tweet['user']['url'],
            screenname=tweet['user']['username'],
            category='forwarded',
            source=self.__version__
        ))

        original = insert(Post(
            raw_id=data.id,
            platform_id=tweet['id'],
            scraper=data.scraper,
            transformer=self.__version__,
            platform=data.platform,
            channel=channel.id,
            date=dateutil.parser.parse(tweet['date']),
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc),
            url=tweet['url'],
            content=tweet['content'],
            author_id=tweet['user']['id'],
            author_username=tweet['user']['username']
        ))

        transformed.forwarded_from = channel.id
        transformed.reply_to = original.id

        for m in self.process_media(tweet, original.id, data):
            insert(m)

    def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]:
        """Turn one scraped tweet into a ``Post`` and hand it to ``insert_post``.

        A retweeted and/or quoted tweet is inserted first (channel, post and
        media) and linked from the main post; when both are present the
        quoted tweet's links overwrite the retweeted tweet's.

        Despite the annotation nothing is yielded; the method returns ``None``.
        """
        raw = json.loads(data.raw_data)

        transformed = Post(
            raw_id=data.id,
            platform_id=raw['id'],
            scraper=data.scraper,
            transformer=self.__version__,
            platform=data.platform,
            channel=data.channel,
            date=dateutil.parser.parse(raw['date']),
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc),
            url=raw['url'],
            content=raw['content'],
            author_id=raw['user']['id'],
            author_username=raw['user']['username'])

        if raw['retweetedTweet'] is not None:
            self._insert_subtweet(raw['retweetedTweet'], transformed, data, insert)

        if raw['quotedTweet'] is not None:
            self._insert_subtweet(raw['quotedTweet'], transformed, data, insert)

        # NOTE(review): media attached to the main tweet itself is never
        # passed to process_media here - confirm whether that is intended.
        insert_post(transformed)
|
||||
@@ -1,74 +0,0 @@
|
||||
import json
|
||||
from loguru import logger
|
||||
from typing import Generator, Union, Callable
|
||||
import dateutil.parser
|
||||
from datetime import datetime, timezone
|
||||
from sqlalchemy import func
|
||||
|
||||
from cisticola.transformer.base import Transformer
|
||||
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
|
||||
|
||||
class VkontakteTransformer(Transformer):
    """Transformer for the results produced by ``VkontakteScraper``."""

    __version__ = "VkontakteTransformer 0.0.1"

    def can_handle(self, data: ScraperResult) -> bool:
        """Return True when ``data.scraper`` starts with the word ``VkontakteScraper``."""
        scraper_name = data.scraper.split(' ')[0]
        return scraper_name == "VkontakteScraper"

    def transform_info(self, data: RawChannelInfo, insert: Callable, session, channel=None) -> Generator[Union[Post, Channel, Media], None, None]:
        """Build a ``ChannelInfo`` from the raw profile and pass it to ``insert``.

        The username doubles as platform id, ``following`` is always ``-1``,
        and location and creation date are left empty. Nothing is yielded.
        """
        profile = json.loads(data.raw_data)

        insert(ChannelInfo(
            raw_channel_info_id=data.id,
            channel=data.channel,
            platform_id=profile['username'],
            platform=data.platform,
            scraper=data.scraper,
            transformer=self.__version__,
            screenname=profile['username'],
            name=profile['name'],
            description=profile.get('description'),
            description_url=profile.get('websites'),
            description_location=None,
            # A falsy follower count is stored as None.
            followers=int(profile['followers']) if profile['followers'] else None,
            following=-1,
            verified=profile['verified'],
            date_archived=data.date_archived,
            date_created=None,
            date_transformed=datetime.now(timezone.utc),
        ))

    def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]:
        """Turn one scraped VK post into a ``Post`` and hand it to ``insert_post``.

        Ids and dates come from ``data``; URL, text and outlinks come from
        the decoded ``raw_data``. Author fields are left empty, and media is
        not transformed. Nothing is yielded.
        """
        post = json.loads(data.raw_data)

        insert_post(Post(
            raw_id=data.id,
            platform_id=data.platform_id,
            scraper=data.scraper,
            transformer=self.__version__,
            platform=data.platform,
            channel=data.channel,
            date=data.date,
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc),
            url=post['url'],
            # Missing text is stored as an empty string.
            content=post['content'] or '',
            author_id=None,
            author_username=None,
            # Drop empty entries from the outlink list.
            outlinks=[link for link in post["outlinks"] if link] if post['outlinks'] else [],
        ))
|
||||
@@ -5,7 +5,7 @@ HIDE_COOKIESTRING=" :exclude-members: cookiestring"
|
||||
REPLACE_MAXDEPTH="s/ :maxdepth: 4/ :maxdepth: 1/g"
|
||||
|
||||
# Remove display of ``cookiestring`` class variable, otherwise Sphinx generates docs containing the value of your cookiestring, based on your ``YOUTUBE_COOKIESTRING`` environment variable
|
||||
for file in cisticola.scraper.base.rst cisticola.scraper.rumble.rst cisticola.scraper.youtube.rst
|
||||
for file in cisticola.scraper.base.rst cisticola.scraper.rumble.rst
|
||||
do
|
||||
echo "$HIDE_COOKIESTRING" >> $RST_SOURCE_DIR/$file;
|
||||
done
|
||||
|
||||
@@ -1,7 +0,0 @@
|
||||
cisticola.scraper.gab module
|
||||
============================
|
||||
|
||||
.. automodule:: cisticola.scraper.gab
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
@@ -1,7 +0,0 @@
|
||||
cisticola.scraper.instagram module
|
||||
==================================
|
||||
|
||||
.. automodule:: cisticola.scraper.instagram
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
@@ -1,7 +0,0 @@
|
||||
cisticola.scraper.odysee module
|
||||
===============================
|
||||
|
||||
.. automodule:: cisticola.scraper.odysee
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
@@ -14,12 +14,6 @@ Submodules
|
||||
|
||||
cisticola.scraper.base
|
||||
cisticola.scraper.bitchute
|
||||
cisticola.scraper.gab
|
||||
cisticola.scraper.gettr
|
||||
cisticola.scraper.instagram
|
||||
cisticola.scraper.odysee
|
||||
cisticola.scraper.rumble
|
||||
cisticola.scraper.telegram_telethon
|
||||
cisticola.scraper.twitter
|
||||
cisticola.scraper.vkontakte
|
||||
cisticola.scraper.youtube
|
||||
|
||||
@@ -1,7 +0,0 @@
|
||||
cisticola.scraper.twitter module
|
||||
================================
|
||||
|
||||
.. automodule:: cisticola.scraper.twitter
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
@@ -1,7 +0,0 @@
|
||||
cisticola.scraper.vkontakte module
|
||||
==================================
|
||||
|
||||
.. automodule:: cisticola.scraper.vkontakte
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
@@ -1,8 +0,0 @@
|
||||
cisticola.scraper.youtube module
|
||||
================================
|
||||
|
||||
.. automodule:: cisticola.scraper.youtube
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
:exclude-members: cookiestring
|
||||
@@ -17,5 +17,3 @@ Submodules
|
||||
cisticola.transformer.gettr
|
||||
cisticola.transformer.rumble
|
||||
cisticola.transformer.telegram_telethon
|
||||
cisticola.transformer.twitter
|
||||
cisticola.transformer.vkontakte
|
||||
|
||||
@@ -1,7 +0,0 @@
|
||||
cisticola.transformer.twitter module
|
||||
====================================
|
||||
|
||||
.. automodule:: cisticola.transformer.twitter
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
@@ -1,7 +0,0 @@
|
||||
cisticola.transformer.vkontakte module
|
||||
======================================
|
||||
|
||||
.. automodule:: cisticola.transformer.vkontakte
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
@@ -34,7 +34,7 @@ If you do not already have a Telegram application, you can create one by followi
|
||||
|
||||
To initialize a Telegram session, run the following script from the package's root directory using the command-line:
|
||||
|
||||
.. bash::
|
||||
.. code-block:: console
|
||||
|
||||
bash telethon_session_init.py
|
||||
|
||||
@@ -43,13 +43,13 @@ Documentation
|
||||
|
||||
The *cisticola* application uses Sphinx_ to generate and display its documentation. To build the documentation in the HTML format, run the following command from the ``docs/`` directory:
|
||||
|
||||
.. code-block::
|
||||
.. code-block:: console
|
||||
|
||||
pipenv run make html
|
||||
|
||||
For developers, if changes are made to the package structure or additional modules are created, you can update the Sphinx source ``*.rst`` files by running the following command from the ``docs/`` directory:
|
||||
|
||||
.. code-block::
|
||||
.. code-block:: console
|
||||
|
||||
pipenv run make apidoc
|
||||
|
||||
|
||||
Reference in New Issue
Block a user