Use new RawChannelInfo class

This commit is contained in:
Logan Williams
2022-03-31 15:17:25 +02:00
parent 61c99d33f6
commit 2dc9213d64
17 changed files with 1698 additions and 79 deletions

View File

@@ -34,7 +34,7 @@ class ScraperResult:
date: datetime
#: JSON dump of dict that contains all data scraped for the post.
raw_data: str
raw_posts: str
#: Datetime (relative to UTC) that the scraped post was archived at.
date_archived: datetime
@@ -44,7 +44,7 @@ class ScraperResult:
#: Has the media in this post been archived?
media_archived: bool
@dataclass
class Channel:
"""Information about a specific channel to be scraped.
@@ -89,11 +89,31 @@ class Channel:
def hydrate(self):
pass
@dataclass
class RawChannelInfo:
"""A minimally processed channel profile, as returned by a scraper's ``get_profile`` method.
"""
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
scraper: str
#: Name of platform from which result was scraped, e.g. ``"Twitter"``.
platform: str
#: Foreign key (``channels.id``) of the channel that this profile was scraped from.
channel: int
#: JSON dump of dict that contains all profile data scraped for the channel.
raw_data: str
#: Datetime (relative to UTC) that the channel profile was archived at.
date_archived: datetime
@dataclass
class Post:
"""An object with fields for columns in the analysis table"""
#: ID number of the scraped post in the ``raw_data`` table
#: ID number of the scraped post in the ``raw_posts`` table
raw_id: int
#: Platform specific post ID
@@ -144,7 +164,7 @@ class Media:
"""Base class for organizing information about a media file.
"""
#: ID number of the media's corresponding scraped post in the ``raw_data`` table.
#: ID number of the media's corresponding scraped post in the ``raw_posts`` table.
raw_id: int
#: ID number of the media's corresponding scraped post in the ``analysis`` table.
@@ -221,7 +241,7 @@ class Video(Media):
mapper_registry = registry()
raw_data_table = Table('raw_data', mapper_registry.metadata,
raw_posts_table = Table('raw_posts', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('scraper', String),
@@ -229,15 +249,23 @@ raw_data_table = Table('raw_data', mapper_registry.metadata,
Column('channel', Integer, ForeignKey('channels.id')),
Column('platform_id', String),
Column('date', DateTime),
Column('raw_data', String),
Column('raw_posts', String),
Column('date_archived', DateTime),
Column('archived_urls', JSON),
Column('media_archived', Boolean))
raw_channel_info_table = Table('raw_channel_info', mapper_registry.metadata,
Column('id', Integer, primary_key=True),
Column('scraper', String),
Column('platform', String),
Column('channel', Integer, ForeignKey('channels.id')),
Column('raw_data', String),
Column('date_archived', DateTime))
channel_table = Table('channels', mapper_registry.metadata,
Column('id', Integer, primary_key=True, autoincrement=True),
Column('name', String),
Column('platform_id', Integer),
Column('platform_id', String),
Column('category', String),
Column('platform', String),
Column('url', String),
@@ -253,7 +281,7 @@ channel_table = Table('channels', mapper_registry.metadata,
post_table = Table('posts', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('raw_id', Integer, ForeignKey('raw_data.id')),
Column('raw_id', Integer, ForeignKey('raw_posts.id')),
Column('platform_id', Integer),
Column('scraper', String),
Column('transformer', String),
@@ -273,7 +301,7 @@ media_table = Table('media', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('type', String),
Column('raw_id', Integer, ForeignKey('raw_data.id')),
Column('raw_id', Integer, ForeignKey('raw_posts.id')),
Column('post', Integer, ForeignKey('posts.id')),
Column('url', String),
Column('original_url', String),
@@ -282,7 +310,8 @@ media_table = Table('media', mapper_registry.metadata,
mapper_registry.map_imperatively(Post, post_table)
mapper_registry.map_imperatively(Channel, channel_table)
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
mapper_registry.map_imperatively(ScraperResult, raw_posts_table)
mapper_registry.map_imperatively(RawChannelInfo, raw_channel_info_table)
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')

View File

@@ -9,7 +9,7 @@ from typing import Generator
import requests
from bs4 import BeautifulSoup
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
class BitchuteScraper(Scraper):
@@ -57,7 +57,7 @@ class BitchuteScraper(Scraper):
platform_id=post['id'],
date=datetime.fromtimestamp(post['timestamp']),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post),
raw_posts=json.dumps(post),
archived_urls=archived_urls,
media_archived=archive_media)
@@ -65,7 +65,7 @@ class BitchuteScraper(Scraper):
if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
return True
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
base_url = channel.url
@@ -106,8 +106,12 @@ class BitchuteScraper(Scraper):
'subscribers': counts['subscriber_count'],
'views': int(counts['about_view_count'].split(' ')[0])}
return profile
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def strip_tags(html, convert_newlines=True):

View File

@@ -5,7 +5,7 @@ import os
from gabber.client import Client, GAB_API_BASE_URL
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
class GabScraper(Scraper):
@@ -80,7 +80,7 @@ class GabScraper(Scraper):
platform_id=post['id'],
date=datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post),
raw_posts=json.dumps(post),
archived_urls=archived_urls,
media_archived=archive_media)
@@ -88,7 +88,7 @@ class GabScraper(Scraper):
if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None:
return True
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
client = Client(
username = os.environ['GAB_USER'],
@@ -106,4 +106,8 @@ class GabScraper(Scraper):
profile = client._get(GAB_API_BASE_URL + f"/account_by_username/{username}").json()
return profile
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -5,7 +5,7 @@ from urllib.parse import urlparse
from gogettr import PublicClient
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
class GettrScraper(Scraper):
@@ -58,7 +58,7 @@ class GettrScraper(Scraper):
platform_id=post['_id'],
date=datetime.fromtimestamp(post['cdate']/1000.),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post),
raw_posts=json.dumps(post),
archived_urls=archived_urls,
media_archived=archive_media)
@@ -71,9 +71,13 @@ class GettrScraper(Scraper):
key = urlparse(url).path.split('/')[-2] + ext
return key
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
client = client = PublicClient()
username = self.get_username_from_url(channel.url)
profile = client.user_info(username)
return profile
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -8,7 +8,7 @@ from pathlib import Path
from loguru import logger
import instaloader
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
BASE_URL = 'https://www.instagram.com/'
@@ -79,7 +79,7 @@ class InstagramScraper(Scraper):
platform_id=post.mediaid,
date=post.date_utc,
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post._asdict(), default=str),
raw_posts=json.dumps(post._asdict(), default=str),
archived_urls=archived_urls,
media_archived=archive_media)
@@ -96,7 +96,7 @@ class InstagramScraper(Scraper):
platform_id=post.mediaid,
date=comment.created_at_utc,
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(comment_dict, default=str),
raw_posts=json.dumps(comment_dict, default=str),
archived_urls={},
media_archived=archive_media)
@@ -104,7 +104,7 @@ class InstagramScraper(Scraper):
if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None:
return True
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
username = self.get_username_from_url(channel.url)
@@ -125,4 +125,8 @@ class InstagramScraper(Scraper):
profile['followers'] = user_profile.followers
profile['followees'] = user_profile.followees
return profile
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -8,7 +8,7 @@ from loguru import logger
from polyphemus.base import OdyseeChannel
from polyphemus.api import get_auth_token
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
class OdyseeScraper(Scraper):
@@ -60,7 +60,7 @@ class OdyseeScraper(Scraper):
platform_id=video.info['claim_id'],
date=datetime.fromtimestamp(video.info['created']),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(video.info),
raw_posts=json.dumps(video.info),
archived_urls=archived_urls,
media_archived=archive_media)
@@ -73,7 +73,7 @@ class OdyseeScraper(Scraper):
platform_id=comment.info['claim_id'],
date=datetime.fromtimestamp(comment.info['created']),
date_archived=datetime.now(),
raw_data=json.dumps(comment.info),
raw_posts=json.dumps(comment.info),
archived_urls={},
media_archived=True)
@@ -87,10 +87,14 @@ class OdyseeScraper(Scraper):
return f'{key}.{ext}'
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
username = self.get_username_from_url(channel.url)
odysee_channel = OdyseeChannel(channel_name = username, auth_token = self.auth_token)
profile = odysee_channel.info
return profile
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -5,7 +5,7 @@ from urllib.parse import urlparse
from bs4 import BeautifulSoup
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper import Scraper, make_request
BASE_URL = 'https://rumble.com'
@@ -39,7 +39,7 @@ class RumbleScraper(Scraper):
platform_id=post['media_url'].split('/')[-2],
date=post['datetime'].replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post, default = str),
raw_posts=json.dumps(post, default = str),
archived_urls=archived_urls,
media_archived=archive_media)
@@ -52,11 +52,15 @@ class RumbleScraper(Scraper):
if channel.platform == "Rumble" and channel.url is not None:
return True
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
profile = get_channel_profile(url = channel.url)
return profile
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
@@ -128,6 +132,7 @@ def get_channel_profile(url):
'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None,
'cover': cover_soup.get('src') if cover_soup else None,
'subscribers': soup.find('span', {'class' : 'subscribe-button-count'}).text}
return profile
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -1,10 +1,10 @@
from typing import Generator
from datetime import datetime, timezone
import json
import snscrape.modules
from loguru import logger
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
class TelegramSnscrapeScraper(Scraper):
@@ -49,15 +49,20 @@ class TelegramSnscrapeScraper(Scraper):
platform_id=post.url,
date=post.date,
date_archived=datetime.now(timezone.utc),
raw_data=post.json(),
raw_posts=post.json(),
archived_urls=archived_urls,
media_archived=archive_media
)
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
scr = snscrape.modules.telegram.TelegramChannelScraper(
channel.screenname)
profile = scr._get_entity().__dict__
return profile
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -11,7 +11,7 @@ from telethon.sync import TelegramClient
from telethon.tl.functions.channels import GetFullChannelRequest
from telethon.tl import types
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
MEDIA_TYPES = ['photo', 'video', 'document', 'webpage']
@@ -44,7 +44,7 @@ class TelegramTelethonScraper(Scraper):
key = list(result.archived_urls.keys())[0]
if result.archived_urls[key] is None:
raw = json.loads(result.raw_data)
raw = json.loads(result.raw_posts)
message = client.get_messages(raw['peer_id']['channel_id'], ids=[raw['id']])
@@ -141,11 +141,11 @@ class TelegramTelethonScraper(Scraper):
platform_id=post_url,
date=post.date.replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post.to_dict(), default=str),
raw_posts=json.dumps(post.to_dict(), default=str),
archived_urls=archived_urls,
media_archived=archive_media)
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
username = self.get_username_from_url(channel.url)
@@ -157,4 +157,8 @@ class TelegramTelethonScraper(Scraper):
full_channel = client(GetFullChannelRequest(channel = username))
profile = full_channel.__dict__
return profile
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -1,11 +1,11 @@
from datetime import datetime, timezone
from typing import Generator
from urllib.parse import urlparse, parse_qs
from snscrape.modules.twitter import TwitterProfileScraper, TwitterUserScraper, Video, Gif, Photo
from loguru import logger
import json
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper, ChannelDoesNotExistError
class TwitterScraper(Scraper):
@@ -66,7 +66,7 @@ class TwitterScraper(Scraper):
platform_id=tweet.id,
date=tweet.date,
date_archived=datetime.now(timezone.utc),
raw_data=tweet.json(),
raw_posts=tweet.json(),
archived_urls=archived_urls,
media_archived=archive_media)
@@ -91,7 +91,7 @@ class TwitterScraper(Scraper):
key = parsed_url.path.split('/')[-1] + ext
return key
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
scraper = TwitterUserScraper(channel.screenname)
entity = scraper._get_entity()
@@ -99,4 +99,8 @@ class TwitterScraper(Scraper):
if entity is None:
raise ChannelDoesNotExistError(channel.url)
else:
return entity.__dict__
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(emtity.__dict__),
date_archived=datetime.now(timezone.utc))

View File

@@ -1,11 +1,10 @@
import json
from datetime import datetime, timezone
from typing import Generator
from urllib.parse import urlparse
from snscrape.modules.vkontakte import VKontakteUserScraper
from loguru import logger
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
class VkontakteScraper(Scraper):
@@ -62,7 +61,7 @@ class VkontakteScraper(Scraper):
platform_id=post.url.split('/')[-1],
date=datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=post.json(),
raw_posts=post.json(),
archived_urls=archived_urls,
media_archived=archive_media)
@@ -80,10 +79,15 @@ class VkontakteScraper(Scraper):
return key
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
username = self.get_username_from_url(channel.url)
scraper = VKontakteUserScraper(username)
profile = scraper._get_entity().__dict__
return profile
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))

View File

@@ -2,10 +2,9 @@ from datetime import datetime, timezone
import json
from typing import Generator
import tempfile
import yt_dlp
from cisticola.base import Channel, ScraperResult
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper import Scraper
class YoutubeScraper(Scraper):
@@ -71,7 +70,7 @@ class YoutubeScraper(Scraper):
platform_id=video_id,
date=datetime.strptime(video['upload_date'], '%Y%m%d').replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(video, default = str),
raw_posts=json.dumps(video, default = str),
archived_urls=archived_urls,
media_archived=archive_media)
@@ -79,8 +78,7 @@ class YoutubeScraper(Scraper):
if channel.platform == "Youtube" and channel.url:
return True
def get_profile(self, channel: Channel) -> dict:
def get_profile(self, channel: Channel) -> RawChannelInfo:
ydl_opts = {}
ydl = yt_dlp.YoutubeDL(ydl_opts)
@@ -89,7 +87,12 @@ class YoutubeScraper(Scraper):
meta = ydl.extract_info(
channel.url,
process=False)
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(meta),
date_archived=datetime.now(timezone.utc))
except yt_dlp.utils.DownloadError as e:
raise e
return meta

View File

@@ -20,7 +20,7 @@ class BitchuteTransformer(Transformer):
return False
def transform_media(self, data: ScraperResult, transformed: Post) -> Generator[Media, None, None]:
raw = json.loads(data.raw_data)
raw = json.loads(data.raw_posts)
orig = raw['video_url']
new = data.archived_urls[orig]
@@ -30,7 +30,7 @@ class BitchuteTransformer(Transformer):
yield m
def transform(self, data: ScraperResult) -> Post:
raw = json.loads(data.raw_data)
raw = json.loads(data.raw_posts)
soup = BeautifulSoup(raw['body'], features = 'html.parser')
content = soup.find_all('p')[-1].text

View File

@@ -47,7 +47,7 @@ class TwitterTransformer(Transformer):
def transform(self, data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
raw = json.loads(data.raw_posts)
transformed = Post(
raw_id=data.id,