Use new RawChannelInfo class

This commit is contained in:
Logan Williams
2022-03-31 15:17:25 +02:00
parent 61c99d33f6
commit 2dc9213d64
17 changed files with 1698 additions and 79 deletions

View File

@@ -23,6 +23,8 @@ gspread = "*"
cryptg = "*" cryptg = "*"
gabber = {git = "https://github.com/stanfordio/gabber.git"} gabber = {git = "https://github.com/stanfordio/gabber.git"}
psycopg2-binary = "*" psycopg2-binary = "*"
tqdm = "*"
ratelimit = "*"
[dev-packages] [dev-packages]
pytest = "*" pytest = "*"
@@ -34,7 +36,7 @@ sphinx = "*"
sphinx_rtd_theme = "*" sphinx_rtd_theme = "*"
[requires] [requires]
python_version = "3.9" python_version = "3.8"
[pipenv] [pipenv]
allow_prereleases = true allow_prereleases = true

1532
Pipfile.lock generated Normal file

File diff suppressed because it is too large Load Diff

33
app.py
View File

@@ -3,6 +3,8 @@ from loguru import logger
import gspread import gspread
from sqlalchemy import create_engine from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker from sqlalchemy.orm import sessionmaker
import os
import time
from cisticola.base import Channel, mapper_registry from cisticola.base import Channel, mapper_registry
from cisticola.scraper import ( from cisticola.scraper import (
@@ -19,7 +21,7 @@ from cisticola.scraper import (
def sync_channels(args): def sync_channels(args):
logger.info("Synchronizing channels") logger.info("Synchronizing channels")
session = get_db_session(args) session = get_db_session()
gc = gspread.service_account(filename='service_account.json') gc = gspread.service_account(filename='service_account.json')
@@ -29,6 +31,7 @@ def sync_channels(args):
row = 2 row = 2
for c in channels: for c in channels:
logger.info(c)
del c['id'] del c['id']
del c['followers'] del c['followers']
@@ -43,20 +46,29 @@ def sync_channels(args):
# check to see if this already exists, # check to see if this already exists,
channel = session.query(Channel).filter_by(platform_id=None if c['platform_id'] == '' else c['platform_id'], platform=c['platform'], url=c['url']).first() platform_id = None
if c['platform_id'] != '':
platform_id = c['platform_id']
channel = session.query(Channel).filter_by(platform_id=platform_id, platform=c['platform'], url=c['url']).first()
logger.info(channel)
if not channel: if not channel:
channel = Channel(**c, source='researcher') channel = Channel(**c, source='researcher')
logger.debug(f"{channel} does not exist, adding")
session.add(channel) session.add(channel)
session.flush() session.flush()
session.commit()
wks.update_cell(row, 1, channel.id) wks.update_cell(row, 1, channel.id)
time.sleep(1)
row += 1 row += 1
session.commit() session.commit()
def get_db_session(args): def get_db_session():
engine = create_engine(args.db) engine = create_engine(os.environ['DB'])
session_generator = sessionmaker() session_generator = sessionmaker()
session_generator.configure(bind=engine) session_generator.configure(bind=engine)
@@ -64,8 +76,8 @@ def get_db_session(args):
return session return session
def get_scraper_controller(args): def get_scraper_controller():
engine = create_engine(args.db) engine = create_engine(os.environ['DB'])
controller = ScraperController() controller = ScraperController()
controller.connect_to_db(engine) controller.connect_to_db(engine)
@@ -90,8 +102,8 @@ def archive_media(args):
controller = get_scraper_controller(args) controller = get_scraper_controller(args)
controller.archive_unarchived_media() controller.archive_unarchived_media()
def init_db(args): def init_db():
engine = create_engine(args.db) engine = create_engine(os.environ['DB'])
mapper_registry.metadata.create_all(bind=engine) mapper_registry.metadata.create_all(bind=engine)
if __name__ == '__main__': if __name__ == '__main__':
@@ -99,14 +111,13 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser(description = 'Cisticola command line tools') parser = argparse.ArgumentParser(description = 'Cisticola command line tools')
parser.add_argument('command', type=str, help='Command to run: "sync-channels", "scrape-channels", or "archive-media"') parser.add_argument('command', type=str, help='Command to run: "sync-channels", "scrape-channels", or "archive-media"')
parser.add_argument('--db', type=str, help='[*] Sqlalchemy database string, eg, "sqlite:///cisticola.db"')
parser.add_argument('--gsheet', type=str, help='[sync-channels] URL of Google Sheet to synchronize') parser.add_argument('--gsheet', type=str, help='[sync-channels] URL of Google Sheet to synchronize')
parser.add_argument('--media', action='store_true', help='[scrape-channels] Add this flag to media') parser.add_argument('--media', action='store_true', help='[scrape-channels] Add this flag to media')
args = parser.parse_args() args = parser.parse_args()
if args.command == 'init-db': if args.command == 'init-db':
init_db(args) init_db()
elif args.command == 'sync-channels': elif args.command == 'sync-channels':
sync_channels(args) sync_channels(args)
elif args.command == 'scrape-channels': elif args.command == 'scrape-channels':

View File

@@ -34,7 +34,7 @@ class ScraperResult:
date: datetime date: datetime
#: JSON dump of dict that contains all data scraped for the post. #: JSON dump of dict that contains all data scraped for the post.
raw_data: str raw_posts: str
#: Datetime (relative to UTC) that the scraped post was archived at. #: Datetime (relative to UTC) that the scraped post was archived at.
date_archived: datetime date_archived: datetime
@@ -44,7 +44,7 @@ class ScraperResult:
#: Has the media in this post been archived? #: Has the media in this post been archived?
media_archived: bool media_archived: bool
@dataclass @dataclass
class Channel: class Channel:
"""Information about a specific channel to be scraped. """Information about a specific channel to be scraped.
@@ -89,11 +89,31 @@ class Channel:
def hydrate(self): def hydrate(self):
pass pass
@dataclass
class RawChannelInfo:
    """Minimally processed channel profile information returned by a scraper.

    Scrapers build one of these in ``get_profile``; the scraped profile is
    stored as a JSON string in ``raw_data`` rather than as individual fields.
    """

    #: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
    scraper: str

    #: Name of platform from which the channel info was scraped, e.g. ``"Twitter"``.
    platform: str

    #: Foreign key of the channel (``channels.id``) that this info was scraped from.
    channel: int

    #: JSON dump of the profile data scraped for the channel.
    raw_data: str

    #: Datetime (relative to UTC) at which the channel info was archived.
    date_archived: datetime
@dataclass @dataclass
class Post: class Post:
"""An object with fields for columns in the analysis table""" """An object with fields for columns in the analysis table"""
#: ID number of the scraped post in the ``raw_data`` table #: ID number of the scraped post in the ``raw_posts`` table
raw_id: int raw_id: int
#: Platform specific post ID #: Platform specific post ID
@@ -144,7 +164,7 @@ class Media:
"""Base class for organizing information about a media file. """Base class for organizing information about a media file.
""" """
#: ID number of the media's corresponding scraped post in the ``raw_data`` table. #: ID number of the media's corresponding scraped post in the ``raw_posts`` table.
raw_id: int raw_id: int
#: ID number of the media's corresponding scraped post in the ``analysis`` table. #: ID number of the media's corresponding scraped post in the ``analysis`` table.
@@ -221,7 +241,7 @@ class Video(Media):
mapper_registry = registry() mapper_registry = registry()
raw_data_table = Table('raw_data', mapper_registry.metadata, raw_posts_table = Table('raw_posts', mapper_registry.metadata,
Column('id', Integer, primary_key=True, Column('id', Integer, primary_key=True,
autoincrement=True), autoincrement=True),
Column('scraper', String), Column('scraper', String),
@@ -229,15 +249,23 @@ raw_data_table = Table('raw_data', mapper_registry.metadata,
Column('channel', Integer, ForeignKey('channels.id')), Column('channel', Integer, ForeignKey('channels.id')),
Column('platform_id', String), Column('platform_id', String),
Column('date', DateTime), Column('date', DateTime),
Column('raw_data', String), Column('raw_posts', String),
Column('date_archived', DateTime), Column('date_archived', DateTime),
Column('archived_urls', JSON), Column('archived_urls', JSON),
Column('media_archived', Boolean)) Column('media_archived', Boolean))
# Table of raw channel profile records; mapped imperatively to
# ``RawChannelInfo`` via ``mapper_registry``. ``raw_data`` holds the profile
# as a JSON string and ``channel`` references ``channels.id``.
raw_channel_info_table = Table('raw_channel_info', mapper_registry.metadata,
                               Column('id', Integer, primary_key=True),
                               Column('scraper', String),
                               Column('platform', String),
                               Column('channel', Integer, ForeignKey('channels.id')),
                               Column('raw_data', String),
                               Column('date_archived', DateTime))
channel_table = Table('channels', mapper_registry.metadata, channel_table = Table('channels', mapper_registry.metadata,
Column('id', Integer, primary_key=True, autoincrement=True), Column('id', Integer, primary_key=True, autoincrement=True),
Column('name', String), Column('name', String),
Column('platform_id', Integer), Column('platform_id', String),
Column('category', String), Column('category', String),
Column('platform', String), Column('platform', String),
Column('url', String), Column('url', String),
@@ -253,7 +281,7 @@ channel_table = Table('channels', mapper_registry.metadata,
post_table = Table('posts', mapper_registry.metadata, post_table = Table('posts', mapper_registry.metadata,
Column('id', Integer, primary_key=True, Column('id', Integer, primary_key=True,
autoincrement=True), autoincrement=True),
Column('raw_id', Integer, ForeignKey('raw_data.id')), Column('raw_id', Integer, ForeignKey('raw_posts.id')),
Column('platform_id', Integer), Column('platform_id', Integer),
Column('scraper', String), Column('scraper', String),
Column('transformer', String), Column('transformer', String),
@@ -273,7 +301,7 @@ media_table = Table('media', mapper_registry.metadata,
Column('id', Integer, primary_key=True, Column('id', Integer, primary_key=True,
autoincrement=True), autoincrement=True),
Column('type', String), Column('type', String),
Column('raw_id', Integer, ForeignKey('raw_data.id')), Column('raw_id', Integer, ForeignKey('raw_posts.id')),
Column('post', Integer, ForeignKey('posts.id')), Column('post', Integer, ForeignKey('posts.id')),
Column('url', String), Column('url', String),
Column('original_url', String), Column('original_url', String),
@@ -282,7 +310,8 @@ media_table = Table('media', mapper_registry.metadata,
mapper_registry.map_imperatively(Post, post_table) mapper_registry.map_imperatively(Post, post_table)
mapper_registry.map_imperatively(Channel, channel_table) mapper_registry.map_imperatively(Channel, channel_table)
mapper_registry.map_imperatively(ScraperResult, raw_data_table) mapper_registry.map_imperatively(ScraperResult, raw_posts_table)
mapper_registry.map_imperatively(RawChannelInfo, raw_channel_info_table)
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media') mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image') mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video') mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')

View File

@@ -9,7 +9,7 @@ from typing import Generator
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from cisticola.base import Channel, ScraperResult from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper from cisticola.scraper.base import Scraper
class BitchuteScraper(Scraper): class BitchuteScraper(Scraper):
@@ -57,7 +57,7 @@ class BitchuteScraper(Scraper):
platform_id=post['id'], platform_id=post['id'],
date=datetime.fromtimestamp(post['timestamp']), date=datetime.fromtimestamp(post['timestamp']),
date_archived=datetime.now(timezone.utc), date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post), raw_posts=json.dumps(post),
archived_urls=archived_urls, archived_urls=archived_urls,
media_archived=archive_media) media_archived=archive_media)
@@ -65,7 +65,7 @@ class BitchuteScraper(Scraper):
if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None: if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
return True return True
def get_profile(self, channel: Channel) -> dict: def get_profile(self, channel: Channel) -> RawChannelInfo:
base_url = channel.url base_url = channel.url
@@ -106,8 +106,12 @@ class BitchuteScraper(Scraper):
'subscribers': counts['subscriber_count'], 'subscribers': counts['subscriber_count'],
'views': int(counts['about_view_count'].split(' ')[0])} 'views': int(counts['about_view_count'].split(' ')[0])}
return profile
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def strip_tags(html, convert_newlines=True): def strip_tags(html, convert_newlines=True):

View File

@@ -5,7 +5,7 @@ import os
from gabber.client import Client, GAB_API_BASE_URL from gabber.client import Client, GAB_API_BASE_URL
from cisticola.base import Channel, ScraperResult from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper from cisticola.scraper.base import Scraper
class GabScraper(Scraper): class GabScraper(Scraper):
@@ -80,7 +80,7 @@ class GabScraper(Scraper):
platform_id=post['id'], platform_id=post['id'],
date=datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc), date=datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc), date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post), raw_posts=json.dumps(post),
archived_urls=archived_urls, archived_urls=archived_urls,
media_archived=archive_media) media_archived=archive_media)
@@ -88,7 +88,7 @@ class GabScraper(Scraper):
if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None: if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None:
return True return True
def get_profile(self, channel: Channel) -> dict: def get_profile(self, channel: Channel) -> RawChannelInfo:
client = Client( client = Client(
username = os.environ['GAB_USER'], username = os.environ['GAB_USER'],
@@ -106,4 +106,8 @@ class GabScraper(Scraper):
profile = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json() profile = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json()
return profile return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -5,7 +5,7 @@ from urllib.parse import urlparse
from gogettr import PublicClient from gogettr import PublicClient
from cisticola.base import Channel, ScraperResult from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper from cisticola.scraper.base import Scraper
class GettrScraper(Scraper): class GettrScraper(Scraper):
@@ -58,7 +58,7 @@ class GettrScraper(Scraper):
platform_id=post['_id'], platform_id=post['_id'],
date=datetime.fromtimestamp(post['cdate']/1000.), date=datetime.fromtimestamp(post['cdate']/1000.),
date_archived=datetime.now(timezone.utc), date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post), raw_posts=json.dumps(post),
archived_urls=archived_urls, archived_urls=archived_urls,
media_archived=archive_media) media_archived=archive_media)
@@ -71,9 +71,13 @@ class GettrScraper(Scraper):
key = urlparse(url).path.split('/')[-2] + ext key = urlparse(url).path.split('/')[-2] + ext
return key return key
def get_profile(self, channel: Channel) -> dict: def get_profile(self, channel: Channel) -> RawChannelInfo:
client = client = PublicClient() client = client = PublicClient()
username = self.get_username_from_url(channel.url) username = self.get_username_from_url(channel.url)
profile = client.user_info(username) profile = client.user_info(username)
return profile return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -8,7 +8,7 @@ from pathlib import Path
from loguru import logger from loguru import logger
import instaloader import instaloader
from cisticola.base import Channel, ScraperResult from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper from cisticola.scraper.base import Scraper
BASE_URL = 'https://www.instagram.com/' BASE_URL = 'https://www.instagram.com/'
@@ -79,7 +79,7 @@ class InstagramScraper(Scraper):
platform_id=post.mediaid, platform_id=post.mediaid,
date=post.date_utc, date=post.date_utc,
date_archived=datetime.now(timezone.utc), date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post._asdict(), default=str), raw_posts=json.dumps(post._asdict(), default=str),
archived_urls=archived_urls, archived_urls=archived_urls,
media_archived=archive_media) media_archived=archive_media)
@@ -96,7 +96,7 @@ class InstagramScraper(Scraper):
platform_id=post.mediaid, platform_id=post.mediaid,
date=comment.created_at_utc, date=comment.created_at_utc,
date_archived=datetime.now(timezone.utc), date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(comment_dict, default=str), raw_posts=json.dumps(comment_dict, default=str),
archived_urls={}, archived_urls={},
media_archived=archive_media) media_archived=archive_media)
@@ -104,7 +104,7 @@ class InstagramScraper(Scraper):
if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None: if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None:
return True return True
def get_profile(self, channel: Channel) -> dict: def get_profile(self, channel: Channel) -> RawChannelInfo:
username = self.get_username_from_url(channel.url) username = self.get_username_from_url(channel.url)
@@ -125,4 +125,8 @@ class InstagramScraper(Scraper):
profile['followers'] = user_profile.followers profile['followers'] = user_profile.followers
profile['followees'] = user_profile.followees profile['followees'] = user_profile.followees
return profile return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -8,7 +8,7 @@ from loguru import logger
from polyphemus.base import OdyseeChannel from polyphemus.base import OdyseeChannel
from polyphemus.api import get_auth_token from polyphemus.api import get_auth_token
from cisticola.base import Channel, ScraperResult from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper from cisticola.scraper.base import Scraper
class OdyseeScraper(Scraper): class OdyseeScraper(Scraper):
@@ -60,7 +60,7 @@ class OdyseeScraper(Scraper):
platform_id=video.info['claim_id'], platform_id=video.info['claim_id'],
date=datetime.fromtimestamp(video.info['created']), date=datetime.fromtimestamp(video.info['created']),
date_archived=datetime.now(timezone.utc), date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(video.info), raw_posts=json.dumps(video.info),
archived_urls=archived_urls, archived_urls=archived_urls,
media_archived=archive_media) media_archived=archive_media)
@@ -73,7 +73,7 @@ class OdyseeScraper(Scraper):
platform_id=comment.info['claim_id'], platform_id=comment.info['claim_id'],
date=datetime.fromtimestamp(comment.info['created']), date=datetime.fromtimestamp(comment.info['created']),
date_archived=datetime.now(), date_archived=datetime.now(),
raw_data=json.dumps(comment.info), raw_posts=json.dumps(comment.info),
archived_urls={}, archived_urls={},
media_archived=True) media_archived=True)
@@ -87,10 +87,14 @@ class OdyseeScraper(Scraper):
return f'{key}.{ext}' return f'{key}.{ext}'
def get_profile(self, channel: Channel) -> dict: def get_profile(self, channel: Channel) -> RawChannelInfo:
username = self.get_username_from_url(channel.url) username = self.get_username_from_url(channel.url)
odysee_channel = OdyseeChannel(channel_name = username, auth_token = self.auth_token) odysee_channel = OdyseeChannel(channel_name = username, auth_token = self.auth_token)
profile = odysee_channel.info profile = odysee_channel.info
return profile return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -5,7 +5,7 @@ from urllib.parse import urlparse
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from cisticola.base import Channel, ScraperResult from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper import Scraper, make_request from cisticola.scraper import Scraper, make_request
BASE_URL = 'https://rumble.com' BASE_URL = 'https://rumble.com'
@@ -39,7 +39,7 @@ class RumbleScraper(Scraper):
platform_id=post['media_url'].split('/')[-2], platform_id=post['media_url'].split('/')[-2],
date=post['datetime'].replace(tzinfo=timezone.utc), date=post['datetime'].replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc), date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post, default = str), raw_posts=json.dumps(post, default = str),
archived_urls=archived_urls, archived_urls=archived_urls,
media_archived=archive_media) media_archived=archive_media)
@@ -52,11 +52,15 @@ class RumbleScraper(Scraper):
if channel.platform == "Rumble" and channel.url is not None: if channel.platform == "Rumble" and channel.url is not None:
return True return True
def get_profile(self, channel: Channel) -> dict: def get_profile(self, channel: Channel) -> RawChannelInfo:
profile = get_channel_profile(url = channel.url) profile = get_channel_profile(url = channel.url)
return profile return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
@@ -128,6 +132,7 @@ def get_channel_profile(url):
'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None, 'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None,
'cover': cover_soup.get('src') if cover_soup else None, 'cover': cover_soup.get('src') if cover_soup else None,
'subscribers': soup.find('span', {'class' : 'subscribe-button-count'}).text} 'subscribers': soup.find('span', {'class' : 'subscribe-button-count'}).text}
return profile return profile
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -1,10 +1,10 @@
from typing import Generator from typing import Generator
from datetime import datetime, timezone from datetime import datetime, timezone
import json
import snscrape.modules import snscrape.modules
from loguru import logger from loguru import logger
from cisticola.base import Channel, ScraperResult from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper from cisticola.scraper.base import Scraper
class TelegramSnscrapeScraper(Scraper): class TelegramSnscrapeScraper(Scraper):
@@ -49,15 +49,20 @@ class TelegramSnscrapeScraper(Scraper):
platform_id=post.url, platform_id=post.url,
date=post.date, date=post.date,
date_archived=datetime.now(timezone.utc), date_archived=datetime.now(timezone.utc),
raw_data=post.json(), raw_posts=post.json(),
archived_urls=archived_urls, archived_urls=archived_urls,
media_archived=archive_media media_archived=archive_media
) )
def get_profile(self, channel: Channel) -> dict: def get_profile(self, channel: Channel) -> RawChannelInfo:
scr = snscrape.modules.telegram.TelegramChannelScraper( scr = snscrape.modules.telegram.TelegramChannelScraper(
channel.screenname) channel.screenname)
profile = scr._get_entity().__dict__ profile = scr._get_entity().__dict__
return profile
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -11,7 +11,7 @@ from telethon.sync import TelegramClient
from telethon.tl.functions.channels import GetFullChannelRequest from telethon.tl.functions.channels import GetFullChannelRequest
from telethon.tl import types from telethon.tl import types
from cisticola.base import Channel, ScraperResult from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper from cisticola.scraper.base import Scraper
MEDIA_TYPES = ['photo', 'video', 'document', 'webpage'] MEDIA_TYPES = ['photo', 'video', 'document', 'webpage']
@@ -44,7 +44,7 @@ class TelegramTelethonScraper(Scraper):
key = list(result.archived_urls.keys())[0] key = list(result.archived_urls.keys())[0]
if result.archived_urls[key] is None: if result.archived_urls[key] is None:
raw = json.loads(result.raw_data) raw = json.loads(result.raw_posts)
message = client.get_messages(raw['peer_id']['channel_id'], ids=[raw['id']]) message = client.get_messages(raw['peer_id']['channel_id'], ids=[raw['id']])
@@ -141,11 +141,11 @@ class TelegramTelethonScraper(Scraper):
platform_id=post_url, platform_id=post_url,
date=post.date.replace(tzinfo=timezone.utc), date=post.date.replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc), date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post.to_dict(), default=str), raw_posts=json.dumps(post.to_dict(), default=str),
archived_urls=archived_urls, archived_urls=archived_urls,
media_archived=archive_media) media_archived=archive_media)
def get_profile(self, channel: Channel) -> dict: def get_profile(self, channel: Channel) -> RawChannelInfo:
username = self.get_username_from_url(channel.url) username = self.get_username_from_url(channel.url)
@@ -157,4 +157,8 @@ class TelegramTelethonScraper(Scraper):
full_channel = client(GetFullChannelRequest(channel = username)) full_channel = client(GetFullChannelRequest(channel = username))
profile = full_channel.__dict__ profile = full_channel.__dict__
return profile return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -1,11 +1,11 @@
from datetime import datetime, timezone from datetime import datetime, timezone
from typing import Generator from typing import Generator
from urllib.parse import urlparse, parse_qs from urllib.parse import urlparse, parse_qs
from snscrape.modules.twitter import TwitterProfileScraper, TwitterUserScraper, Video, Gif, Photo from snscrape.modules.twitter import TwitterProfileScraper, TwitterUserScraper, Video, Gif, Photo
from loguru import logger from loguru import logger
import json
from cisticola.base import Channel, ScraperResult from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper, ChannelDoesNotExistError from cisticola.scraper.base import Scraper, ChannelDoesNotExistError
class TwitterScraper(Scraper): class TwitterScraper(Scraper):
@@ -66,7 +66,7 @@ class TwitterScraper(Scraper):
platform_id=tweet.id, platform_id=tweet.id,
date=tweet.date, date=tweet.date,
date_archived=datetime.now(timezone.utc), date_archived=datetime.now(timezone.utc),
raw_data=tweet.json(), raw_posts=tweet.json(),
archived_urls=archived_urls, archived_urls=archived_urls,
media_archived=archive_media) media_archived=archive_media)
@@ -91,7 +91,7 @@ class TwitterScraper(Scraper):
key = parsed_url.path.split('/')[-1] + ext key = parsed_url.path.split('/')[-1] + ext
return key return key
def get_profile(self, channel: Channel) -> dict: def get_profile(self, channel: Channel) -> RawChannelInfo:
scraper = TwitterUserScraper(channel.screenname) scraper = TwitterUserScraper(channel.screenname)
entity = scraper._get_entity() entity = scraper._get_entity()
@@ -99,4 +99,8 @@ class TwitterScraper(Scraper):
if entity is None: if entity is None:
raise ChannelDoesNotExistError(channel.url) raise ChannelDoesNotExistError(channel.url)
else: else:
return entity.__dict__ return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(entity.__dict__),
date_archived=datetime.now(timezone.utc))

View File

@@ -1,11 +1,10 @@
from datetime import datetime, timezone from datetime import datetime, timezone
from typing import Generator from typing import Generator
from urllib.parse import urlparse from urllib.parse import urlparse
from snscrape.modules.vkontakte import VKontakteUserScraper from snscrape.modules.vkontakte import VKontakteUserScraper
from loguru import logger from loguru import logger
from cisticola.base import Channel, ScraperResult from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper from cisticola.scraper.base import Scraper
class VkontakteScraper(Scraper): class VkontakteScraper(Scraper):
@@ -62,7 +61,7 @@ class VkontakteScraper(Scraper):
platform_id=post.url.split('/')[-1], platform_id=post.url.split('/')[-1],
date=datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc), date=datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc), date_archived=datetime.now(timezone.utc),
raw_data=post.json(), raw_posts=post.json(),
archived_urls=archived_urls, archived_urls=archived_urls,
media_archived=archive_media) media_archived=archive_media)
@@ -80,10 +79,15 @@ class VkontakteScraper(Scraper):
return key return key
def get_profile(self, channel: Channel) -> dict: def get_profile(self, channel: Channel) -> RawChannelInfo:
username = self.get_username_from_url(channel.url) username = self.get_username_from_url(channel.url)
scraper = VKontakteUserScraper(username) scraper = VKontakteUserScraper(username)
profile = scraper._get_entity().__dict__ profile = scraper._get_entity().__dict__
return profile
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -2,10 +2,9 @@ from datetime import datetime, timezone
import json import json
from typing import Generator from typing import Generator
import tempfile import tempfile
import yt_dlp import yt_dlp
from cisticola.base import Channel, ScraperResult from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper import Scraper from cisticola.scraper import Scraper
class YoutubeScraper(Scraper): class YoutubeScraper(Scraper):
@@ -71,7 +70,7 @@ class YoutubeScraper(Scraper):
platform_id=video_id, platform_id=video_id,
date=datetime.strptime(video['upload_date'], '%Y%m%d').replace(tzinfo=timezone.utc), date=datetime.strptime(video['upload_date'], '%Y%m%d').replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc), date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(video, default = str), raw_posts=json.dumps(video, default = str),
archived_urls=archived_urls, archived_urls=archived_urls,
media_archived=archive_media) media_archived=archive_media)
@@ -79,8 +78,7 @@ class YoutubeScraper(Scraper):
if channel.platform == "Youtube" and channel.url: if channel.platform == "Youtube" and channel.url:
return True return True
def get_profile(self, channel: Channel) -> dict: def get_profile(self, channel: Channel) -> RawChannelInfo:
ydl_opts = {} ydl_opts = {}
ydl = yt_dlp.YoutubeDL(ydl_opts) ydl = yt_dlp.YoutubeDL(ydl_opts)
@@ -89,7 +87,12 @@ class YoutubeScraper(Scraper):
meta = ydl.extract_info( meta = ydl.extract_info(
channel.url, channel.url,
process=False) process=False)
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(meta),
date_archived=datetime.now(timezone.utc))
except yt_dlp.utils.DownloadError as e: except yt_dlp.utils.DownloadError as e:
raise e raise e
return meta

View File

@@ -20,7 +20,7 @@ class BitchuteTransformer(Transformer):
return False return False
def transform_media(self, data: ScraperResult, transformed: Post) -> Generator[Media, None, None]: def transform_media(self, data: ScraperResult, transformed: Post) -> Generator[Media, None, None]:
raw = json.loads(data.raw_data) raw = json.loads(data.raw_posts)
orig = raw['video_url'] orig = raw['video_url']
new = data.archived_urls[orig] new = data.archived_urls[orig]
@@ -30,7 +30,7 @@ class BitchuteTransformer(Transformer):
yield m yield m
def transform(self, data: ScraperResult) -> Post: def transform(self, data: ScraperResult) -> Post:
raw = json.loads(data.raw_data) raw = json.loads(data.raw_posts)
soup = BeautifulSoup(raw['body'], features = 'html.parser') soup = BeautifulSoup(raw['body'], features = 'html.parser')
content = soup.find_all('p')[-1].text content = soup.find_all('p')[-1].text

View File

@@ -47,7 +47,7 @@ class TwitterTransformer(Transformer):
def transform(self, data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]: def transform(self, data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data) raw = json.loads(data.raw_posts)
transformed = Post( transformed = Post(
raw_id=data.id, raw_id=data.id,