formatted with black, added pre-commit hook, pegged typing_extensions package version to fix spaCy issue

This commit is contained in:
Tristan Lee
2023-08-04 14:51:00 -05:00
parent 070ee3391d
commit fab65a5d67
25 changed files with 3043 additions and 2176 deletions

10
.github/workflows/black.yml vendored Normal file
View File

@@ -0,0 +1,10 @@
name: Lint
on: [push]
jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: psf/black@stable

6
.pre-commit-config.yaml Normal file
View File

@@ -0,0 +1,6 @@
repos:
- repo: https://github.com/psf/black
rev: 22.3.0
hooks:
- id: black
language_version: python3.9

View File

@@ -24,6 +24,8 @@ ratelimit = "*"
pytz = "*"
langdetect = "*"
spacy = "==3.2.4"
# Temporary fix for https://github.com/explosion/spaCy/issues/12659
typing_extensions = "==4.4.0"
ocrd-pyexiftool = "*"
filelock = "*"
telethon = "*"
@@ -38,6 +40,7 @@ pytest-metadata = "*"
black = "*"
Sphinx = "*"
sphinx-rtd-theme = "*"
pre-commit = "*"
[requires]
python_version = "3.9"

2318
Pipfile.lock generated

File diff suppressed because it is too large Load Diff

92
app.py
View File

@@ -10,7 +10,6 @@ import sys
from cisticola.base import mapper_registry
from cisticola.scraper import (
ScraperController,
# VkontakteScraper,
TelegramTelethonScraper,
GettrScraper,
BitchuteScraper,
@@ -22,11 +21,11 @@ from cisticola.transformer import (
GettrTransformer,
RumbleTransformer,
BitchuteTransformer,
# VkontakteTransformer,
)
from sync_with_gsheet import sync_channels
def get_db_session():
engine = create_engine(os.environ["DB"])
@@ -48,16 +47,18 @@ def get_scraper_controller(args):
else:
telethon_session_name = None
scrapers = [ #VkontakteScraper(),
TelegramTelethonScraper(telethon_session_name = telethon_session_name),
scrapers = [ # VkontakteScraper(),
TelegramTelethonScraper(telethon_session_name=telethon_session_name),
GettrScraper(),
BitchuteScraper(),
RumbleScraper()]
RumbleScraper(),
]
controller.register_scrapers(scrapers)
return controller
def get_transformer_controller(args):
engine = create_engine(os.environ["DB"])
@@ -69,11 +70,12 @@ def get_transformer_controller(args):
else:
telethon_session_name = None
transformers = [ #VkontakteTransformer(),
TelegramTelethonTransformer(telethon_session_name = telethon_session_name),
transformers = [ # VkontakteTransformer(),
TelegramTelethonTransformer(telethon_session_name=telethon_session_name),
GettrTransformer(),
BitchuteTransformer(),
RumbleTransformer()]
RumbleTransformer(),
]
controller.register_transformers(transformers)
@@ -86,12 +88,14 @@ def scrape_channels(args):
controller = get_scraper_controller(args)
controller.scrape_all_channels()
def scrape_channels_old(args):
logger.info(f"Scraping old posts from channels")
controller = get_scraper_controller(args)
controller.scrape_all_channels(fetch_old=True)
def scrape_channel_info(args):
logger.info(f"Scraping channel info")
@@ -103,12 +107,13 @@ def archive_media(args):
logger.info(f"Archiving unarchived media")
controller = get_scraper_controller(args)
if args.chronological:
controller.archive_unarchived_media(chronological=True)
else:
controller.archive_unarchived_media()
def transform(args):
logger.info(f"Transforming untransformed posts")
@@ -118,9 +123,10 @@ def transform(args):
min_date = datetime.datetime.fromisoformat(args.min_date)
else:
min_date = datetime.datetime(1970, 1, 1)
controller.transform_all_untransformed(min_date=min_date)
def transform_info(args):
logger.info(f"Transforming untransformed channel info")
@@ -129,12 +135,14 @@ def transform_info(args):
# sync_channels(args, get_db_session())
def transform_media(args):
logger.info(f"Transforming untransformed channel media")
controller = get_transformer_controller(args)
controller.transform_all_untransformed_media()
def init_db():
engine = create_engine(os.environ["DB"])
mapper_registry.metadata.create_all(bind=engine)
@@ -162,29 +170,77 @@ if __name__ == "__main__":
if args.command == "init-db":
init_db()
elif args.command == "sync-channels":
logger.add("logs/sync-channels.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip")
logger.add(
"logs/sync-channels.log",
level="DEBUG",
rotation="100 MB",
retention="2 weeks",
compression="zip",
)
sync_channels(args, get_db_session())
elif args.command == "scrape-channels":
logger.add("logs/scrape-channels.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip")
logger.add(
"logs/scrape-channels.log",
level="DEBUG",
rotation="100 MB",
retention="2 weeks",
compression="zip",
)
scrape_channels(args)
elif args.command == "scrape-channels-old":
logger.add("logs/scrape-channels-old.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip")
logger.add(
"logs/scrape-channels-old.log",
level="DEBUG",
rotation="100 MB",
retention="2 weeks",
compression="zip",
)
scrape_channels_old(args)
elif args.command == "archive-media":
logger.add("logs/archive-media.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip")
logger.add(
"logs/archive-media.log",
level="DEBUG",
rotation="100 MB",
retention="2 weeks",
compression="zip",
)
archive_media(args)
elif args.command == "channel-info":
logger.add("logs/channel-info.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip")
logger.add(
"logs/channel-info.log",
level="DEBUG",
rotation="100 MB",
retention="2 weeks",
compression="zip",
)
scrape_channel_info(args)
elif args.command == "transform":
logger.add("logs/transform.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip")
logger.add(
"logs/transform.log",
level="DEBUG",
rotation="100 MB",
retention="2 weeks",
compression="zip",
)
logger.add("logs/transform_trace.log", level="TRACE", retention="7 days")
transform(args)
elif args.command == "transform-info":
logger.add("logs/transform-info.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip")
logger.add(
"logs/transform-info.log",
level="DEBUG",
rotation="100 MB",
retention="2 weeks",
compression="zip",
)
transform_info(args)
elif args.command == "transform-media":
logger.add("logs/transform-media.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip")
logger.add(
"logs/transform-media.log",
level="DEBUG",
rotation="100 MB",
retention="2 weeks",
compression="zip",
)
transform_media(args)
else:
logger.error(f"Unrecognized command {args.command}")

View File

@@ -1,3 +1,3 @@
from . import base
from . import scraper
from . import base
from . import scraper
from . import transformer

View File

@@ -1,12 +1,22 @@
from typing import List
from dataclasses import dataclass, field
from datetime import datetime
import tempfile
import tempfile
import json
import io
from sqlalchemy.orm import registry
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean, Index
from sqlalchemy import (
Table,
Column,
Integer,
String,
JSON,
DateTime,
ForeignKey,
Boolean,
Index,
)
from sqlalchemy.dialects.postgresql import JSONB
import pytesseract
import PIL
@@ -22,10 +32,10 @@ from .utils import make_request
# Disable decompression bomb check
PIL.Image.MAX_IMAGE_PIXELS = 1024 * 1024 * 256
@dataclass
class ScraperResult:
"""Minimally processed set of information from a scraper about one post
"""
"""Minimally processed set of information from a scraper about one post"""
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
scraper: str
@@ -48,16 +58,16 @@ class ScraperResult:
#: Datetime (relative to UTC) that the scraped post was archived at.
date_archived: datetime
#: Dict in which the keys are the original media URLs from the post, and the corresponding values are the URLs of the archived media files.
#: Dict in which the keys are the original media URLs from the post, and the corresponding values are the URLs of the archived media files.
archived_urls: dict
#: What date was the media archived? (None if not archived)
media_archived: datetime
@dataclass
class Channel:
"""Information about a specific channel to be scraped.
"""
"""Information about a specific channel to be scraped."""
#: Name of channel (different from username because it can be non-unique and contain emojis), e.g. ``T🕊Редакция Президент Гордон🕊"``.
name: str
@@ -76,32 +86,32 @@ class Channel:
#: Screen name/username of channel.
screenname: str
#: 2 digit country code for the country of origin for the channel, e.g. ``"RU"``.
country: str = None
#: Name of influencer, if channel belongs to an influencer that operates on multiple platforms.
#: Name of influencer, if channel belongs to an influencer that operates on multiple platforms.
influencer: str = None
#: Whether or not the channel is publicly-accessible.
#: Whether or not the channel is publicly-accessible.
public: bool = None
#: Whether or not the channel is a chat (i.e. allows users who are not the channel creator to post/message)
chat: bool = None
#: Any other additional notes about the channel.
notes: str = ""
#: Did the channel come from a researcher or a scraping process?
source: str = None
def hydrate(self):
pass
@dataclass
class RawChannelInfo:
"""Minimally processed set of information from a scraper about one channel
"""
"""Minimally processed set of information from a scraper about one channel"""
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
scraper: str
@@ -118,10 +128,10 @@ class RawChannelInfo:
#: Datetime (relative to UTC) that the scraped post was archived at.
date_archived: datetime
@dataclass
class ChannelInfo:
"""A processed set of information about a channel.
"""
"""A processed set of information about a channel."""
# Foreign key from the raw_channel_info table
raw_channel_info_id: int
@@ -154,20 +164,22 @@ class ChannelInfo:
#: Datetime (relative to UTC) that the scraped channel info was archived at.
date_archived: datetime
#: Datetime (UTC) that the scraped channel info was transformed at.
date_transformed: datetime
def hydrate(self):
pass
nlp_en = spacy.load('en_core_web_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
nlp_de = spacy.load('de_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
nlp_it = spacy.load('it_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
nlp_fr = spacy.load('fr_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
nlp_ru = spacy.load('ru_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
nlp_nl = spacy.load('nl_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
nlp_xx = spacy.load('xx_ent_wiki_sm')
nlp_en = spacy.load("en_core_web_sm", disable=["parser", "tok2vec", "attribute_ruler"])
nlp_de = spacy.load("de_core_news_sm", disable=["parser", "tok2vec", "attribute_ruler"])
nlp_it = spacy.load("it_core_news_sm", disable=["parser", "tok2vec", "attribute_ruler"])
nlp_fr = spacy.load("fr_core_news_sm", disable=["parser", "tok2vec", "attribute_ruler"])
nlp_ru = spacy.load("ru_core_news_sm", disable=["parser", "tok2vec", "attribute_ruler"])
nlp_nl = spacy.load("nl_core_news_sm", disable=["parser", "tok2vec", "attribute_ruler"])
nlp_xx = spacy.load("xx_ent_wiki_sm")
@dataclass
class Post:
@@ -175,7 +187,7 @@ class Post:
#: ID number of the scraped post in the ``raw_posts`` table
raw_id: int
#: Platform specific post ID
platform_id: str
@@ -199,16 +211,16 @@ class Post:
#: Datetime (UTC) that the scraped post was transformed at.
date_transformed: datetime
#: URL of the original post
url: str
#: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``.
author_id: str
#: Username of author who made post.
author_username: str
#: Text of the original post
content: str
@@ -232,7 +244,7 @@ class Post:
#: The ID of the Channel that the post was forwarded or quoted from
forwarded_from: int = None
#: The ID of the Post that this Post is a reply to
reply_to: int = None
@@ -258,21 +270,25 @@ class Post:
URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""
# replace is here in order to prevent catastrophic backtracking
urls = re.findall(URL_REGEX, self.content.replace("::::::::", "").replace("........", ""))
urls = re.findall(
URL_REGEX, self.content.replace("::::::::", "").replace("........", "")
)
self.outlinks += urls
self.outlinks = list(set(outlink for outlink in self.outlinks))
self.outlinks = list(set(outlink for outlink in self.outlinks))
HASHTAG_REGEX = r"(?:^|\s)[#]{1}(\w+)"
hashtags = re.findall(HASHTAG_REGEX, self.content)
self.hashtags += hashtags
self.hashtags = list(set(hashtag.lower() for hashtag in self.hashtags))
# regex patterns for finding crypto addresses
BTC_REGEX = r'\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b'
ETHER_REGEX = r'(0x[a-fA-F0-9]{40})'
self.cryptocurrency_addresses = [m[0] for m in re.findall(BTC_REGEX, self.content)] + re.findall(ETHER_REGEX, self.content)
BTC_REGEX = r"\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b"
ETHER_REGEX = r"(0x[a-fA-F0-9]{40})"
self.cryptocurrency_addresses = [
m[0] for m in re.findall(BTC_REGEX, self.content)
] + re.findall(ETHER_REGEX, self.content)
try:
self.detected_language = detect(self.content)
@@ -287,39 +303,56 @@ class Post:
def hydrate_spacy(self):
ner_only = False
if self.detected_language == 'en':
if self.detected_language == "en":
nlp = nlp_en
elif self.detected_language == 'de':
elif self.detected_language == "de":
nlp = nlp_de
elif self.detected_language == 'it':
elif self.detected_language == "it":
nlp = nlp_it
elif self.detected_language == 'fr':
elif self.detected_language == "fr":
nlp = nlp_fr
elif self.detected_language == 'ru':
elif self.detected_language == "ru":
nlp = nlp_ru
elif self.detected_language == 'nl':
elif self.detected_language == "nl":
nlp = nlp_nl
else:
nlp = nlp_xx
ner_only = True
doc = nlp(self.content)
if not ner_only:
punctuation = ['?',':','!',',','.',';','|','(',')','--','#','=','+']
tokens = [t.lemma_ for t in doc if not t.is_stop and t.lemma_ not in punctuation]
self.normalized_content = ' '.join(tokens)
else:
self.normalized_content = ''
self.named_entities = [{'text': ent.text, 'type': ent.label_} for ent in doc.ents]
if not ner_only:
punctuation = [
"?",
":",
"!",
",",
".",
";",
"|",
"(",
")",
"--",
"#",
"=",
"+",
]
tokens = [
t.lemma_ for t in doc if not t.is_stop and t.lemma_ not in punctuation
]
self.normalized_content = " ".join(tokens)
else:
self.normalized_content = ""
self.named_entities = [
{"text": ent.text, "type": ent.label_} for ent in doc.ents
]
@dataclass
class Media:
"""Base class for organizing information about a media file.
"""
"""Base class for organizing information about a media file."""
#: ID number of the media's corresponding scraped post in the ``raw_posts`` table.
raw_id: int
@@ -355,16 +388,14 @@ class Media:
exif: str = None
def get_blob(self):
"""Download media file as bytes blob.
"""
"""Download media file as bytes blob."""
blob = make_request(self.url)
return blob.content
@logger.catch
def hydrate(self, blob = None):
"""Download media file as bytes blob and extract data from content.
"""
def hydrate(self, blob=None):
"""Download media file as bytes blob and extract data from content."""
if blob is None:
blob = self.get_blob()
@@ -372,8 +403,7 @@ class Media:
self.hydrate_exif(blob)
def hydrate_exif(self, blob):
"""Extract Exif metadata from bytes blob.
"""
"""Extract Exif metadata from bytes blob."""
with tempfile.NamedTemporaryFile() as temp_file:
temp_file.write(blob)
@@ -382,17 +412,17 @@ class Media:
exif = et.get_metadata(temp_file.name)
self.exif = json.dumps(exif)
@dataclass
class Image(Media):
"""Class for organizing information about an image file.
"""
"""Class for organizing information about an image file."""
#: Extracted OCR content from image
ocr: str = None
@logger.catch
def hydrate(self, blob=None):
"""Download image file as bytes blob and extract Exif and OCR content
"""Download image file as bytes blob and extract Exif and OCR content
from the image.
"""
@@ -403,143 +433,180 @@ class Image(Media):
self.hydrate_ocr(blob)
def hydrate_ocr(self, blob):
"""Extract OCR (optical character recognition) data from image bytes blob.
"""
"""Extract OCR (optical character recognition) data from image bytes blob."""
image = PIL.Image.open(io.BytesIO(blob))
self.ocr = pytesseract.image_to_string(image)
@dataclass
class Video(Media):
"""Class for organizing information about an video file.
"""
"""Class for organizing information about an video file."""
pass
@dataclass
class Audio(Media):
"""Class for organizing information about an audio file.
"""
"""Class for organizing information about an audio file."""
pass
mapper_registry = registry()
raw_posts_table = Table('raw_posts', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('scraper', String),
Column('platform', String),
Column('channel', Integer, ForeignKey('channels.id'), index=True),
Column('platform_id', String, index=True),
Column('date', DateTime, index=True),
Column('raw_data', String),
Column('date_archived', DateTime, index=True),
Column('archived_urls', JSON),
Column('media_archived', DateTime, index=True))
raw_posts_table = Table(
"raw_posts",
mapper_registry.metadata,
Column("id", Integer, primary_key=True, autoincrement=True),
Column("scraper", String),
Column("platform", String),
Column("channel", Integer, ForeignKey("channels.id"), index=True),
Column("platform_id", String, index=True),
Column("date", DateTime, index=True),
Column("raw_data", String),
Column("date_archived", DateTime, index=True),
Column("archived_urls", JSON),
Column("media_archived", DateTime, index=True),
)
raw_channel_info_table = Table('raw_channel_info', mapper_registry.metadata,
Column('id', Integer, primary_key=True),
Column('scraper', String),
Column('platform', String),
Column('channel', Integer, ForeignKey('channels.id'), index=True),
Column('raw_data', String),
Column('date_archived', DateTime, index=True))
raw_channel_info_table = Table(
"raw_channel_info",
mapper_registry.metadata,
Column("id", Integer, primary_key=True),
Column("scraper", String),
Column("platform", String),
Column("channel", Integer, ForeignKey("channels.id"), index=True),
Column("raw_data", String),
Column("date_archived", DateTime, index=True),
)
channel_info_table = Table('channel_info', mapper_registry.metadata,
Column('id', Integer, primary_key=True, autoincrement=True),
Column('raw_channel_info_id', Integer, ForeignKey('raw_channel_info.id'), index=True),
Column('channel', Integer, ForeignKey('channels.id'), index=True),
Column('platform_id', String),
Column('scraper', String),
Column('transformer', String),
Column('platform', String),
Column('screenname', String),
Column('name', String),
Column('description', String),
Column('description_url', String),
Column('description_location', String),
Column('followers', Integer),
Column('following', Integer),
Column('verified', Boolean),
Column('date_created', DateTime),
Column('date_archived', DateTime, index=True),
Column('date_transformed', DateTime, index=True),
)
channel_info_table = Table(
"channel_info",
mapper_registry.metadata,
Column("id", Integer, primary_key=True, autoincrement=True),
Column(
"raw_channel_info_id", Integer, ForeignKey("raw_channel_info.id"), index=True
),
Column("channel", Integer, ForeignKey("channels.id"), index=True),
Column("platform_id", String),
Column("scraper", String),
Column("transformer", String),
Column("platform", String),
Column("screenname", String),
Column("name", String),
Column("description", String),
Column("description_url", String),
Column("description_location", String),
Column("followers", Integer),
Column("following", Integer),
Column("verified", Boolean),
Column("date_created", DateTime),
Column("date_archived", DateTime, index=True),
Column("date_transformed", DateTime, index=True),
)
channel_table = Table('channels', mapper_registry.metadata,
Column('id', Integer, primary_key=True, autoincrement=True),
Column('name', String),
Column('platform_id', String),
Column('category', String),
Column('platform', String),
Column('url', String),
Column('screenname', String),
Column('country', JSONB, index = True),
Column('influencer', String),
Column('public', Boolean),
Column('chat', Boolean),
Column('notes', String),
Column('source', String)
)
channel_table = Table(
"channels",
mapper_registry.metadata,
Column("id", Integer, primary_key=True, autoincrement=True),
Column("name", String),
Column("platform_id", String),
Column("category", String),
Column("platform", String),
Column("url", String),
Column("screenname", String),
Column("country", JSONB, index=True),
Column("influencer", String),
Column("public", Boolean),
Column("chat", Boolean),
Column("notes", String),
Column("source", String),
)
post_table = Table('posts', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('raw_id', Integer, ForeignKey('raw_posts.id'), index=True),
Column('platform_id', String, index=True),
Column('scraper', String),
Column('transformer', String),
Column('platform', String),
Column('channel', Integer, ForeignKey('channels.id'), index=True),
Column('date', DateTime, index=True),
Column('date_archived', DateTime, index=True),
Column('date_transformed', DateTime, index=True),
Column('url', String),
Column('author_id', String),
Column('author_username', String),
Column('content', String),
Column('forwarded_from', Integer, ForeignKey('channels.id'), index=True),
Column('reply_to', Integer, ForeignKey('posts.id'), index=True),
Column('named_entities', JSON),
Column('cryptocurrency_addresses', JSON),
Column('hashtags', JSON),
Column('outlinks', JSON),
Column('mentions', JSON),
Column('likes', Integer),
Column('forwards', Integer),
Column('views', Integer),
Column('video_title', String),
Column('video_duration', Integer),
Column('detected_language', String, index = True),
Column('normalized_content', String)
)
post_table = Table(
"posts",
mapper_registry.metadata,
Column("id", Integer, primary_key=True, autoincrement=True),
Column("raw_id", Integer, ForeignKey("raw_posts.id"), index=True),
Column("platform_id", String, index=True),
Column("scraper", String),
Column("transformer", String),
Column("platform", String),
Column("channel", Integer, ForeignKey("channels.id"), index=True),
Column("date", DateTime, index=True),
Column("date_archived", DateTime, index=True),
Column("date_transformed", DateTime, index=True),
Column("url", String),
Column("author_id", String),
Column("author_username", String),
Column("content", String),
Column("forwarded_from", Integer, ForeignKey("channels.id"), index=True),
Column("reply_to", Integer, ForeignKey("posts.id"), index=True),
Column("named_entities", JSON),
Column("cryptocurrency_addresses", JSON),
Column("hashtags", JSON),
Column("outlinks", JSON),
Column("mentions", JSON),
Column("likes", Integer),
Column("forwards", Integer),
Column("views", Integer),
Column("video_title", String),
Column("video_duration", Integer),
Column("detected_language", String, index=True),
Column("normalized_content", String),
)
posts_forwarded_from_channel_index = Index('posts_channel_forwarded_from_idx', post_table.c.channel, post_table.c.forwarded_from)
posts_forwarded_from_channel_index = Index(
"posts_channel_forwarded_from_idx",
post_table.c.channel,
post_table.c.forwarded_from,
)
media_table = Table('media', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('type', String),
Column('raw_id', Integer, ForeignKey('raw_posts.id'), index=True),
Column('post', Integer, ForeignKey('posts.id'), index=True),
Column('url', String),
Column('original_url', String),
Column('exif', String),
Column('ocr', String),
Column('date', DateTime, index=True),
Column('date_archived', DateTime, index=True),
Column('date_transformed', DateTime, index=True),
Column('scraper', String),
Column('transformer', String)
)
media_table = Table(
"media",
mapper_registry.metadata,
Column("id", Integer, primary_key=True, autoincrement=True),
Column("type", String),
Column("raw_id", Integer, ForeignKey("raw_posts.id"), index=True),
Column("post", Integer, ForeignKey("posts.id"), index=True),
Column("url", String),
Column("original_url", String),
Column("exif", String),
Column("ocr", String),
Column("date", DateTime, index=True),
Column("date_archived", DateTime, index=True),
Column("date_transformed", DateTime, index=True),
Column("scraper", String),
Column("transformer", String),
)
mapper_registry.map_imperatively(Post, post_table)
mapper_registry.map_imperatively(Channel, channel_table)
mapper_registry.map_imperatively(ScraperResult, raw_posts_table)
mapper_registry.map_imperatively(RawChannelInfo, raw_channel_info_table)
mapper_registry.map_imperatively(ChannelInfo, channel_info_table)
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')
mapper_registry.map_imperatively(Audio, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='audio')
mapper_registry.map_imperatively(
Media, media_table, polymorphic_on="type", polymorphic_identity="media"
)
mapper_registry.map_imperatively(
Image,
media_table,
inherits=Media,
polymorphic_on="type",
polymorphic_identity="image",
)
mapper_registry.map_imperatively(
Video,
media_table,
inherits=Media,
polymorphic_on="type",
polymorphic_identity="video",
)
mapper_registry.map_imperatively(
Audio,
media_table,
inherits=Media,
polymorphic_on="type",
polymorphic_identity="audio",
)

View File

@@ -3,4 +3,4 @@ from .base import Scraper, ScraperController, ChannelDoesNotExistError
from .bitchute import BitchuteScraper
from .gettr import GettrScraper
from .rumble import RumbleScraper
from .telegram_telethon import TelegramTelethonScraper
from .telegram_telethon import TelegramTelethonScraper

View File

@@ -18,50 +18,54 @@ from sqlalchemy import nullsfirst
from cisticola.base import Channel, RawChannelInfo, ScraperResult, mapper_registry
from cisticola.utils import make_request
class Scraper:
"""Base class for defining platform-specific scrapers for scraping all posts
from a given channel on that specific platform.
"""Base class for defining platform-specific scrapers for scraping all posts
from a given channel on that specific platform.
"""
__version__ = "Scraper 0.0.0"
cookiestring = os.environ["YOUTUBE_COOKIESTRING"].replace(r'\n', '\n').replace(r'\t', '\t')
cookiefilename = 'cookiefile.txt'
cookiestring = (
os.environ["YOUTUBE_COOKIESTRING"].replace(r"\n", "\n").replace(r"\t", "\t")
)
cookiefilename = "cookiefile.txt"
def __init__(self):
# Initialize client to transfer files to the storage archive
self.s3_client = boto3.client(
service_name='s3',
region_name=os.environ['DO_SPACES_REGION'],
service_name="s3",
region_name=os.environ["DO_SPACES_REGION"],
endpoint_url=f'https://{os.environ["DO_SPACES_REGION"]}.digitaloceanspaces.com',
aws_access_key_id=os.environ['DO_SPACES_KEY'],
aws_secret_access_key=os.environ['DO_SPACES_SECRET'])
# Define request headers (necessary to bypass scraping protection
aws_access_key_id=os.environ["DO_SPACES_KEY"],
aws_secret_access_key=os.environ["DO_SPACES_SECRET"],
)
# Define request headers (necessary to bypass scraping protection
# for several platform scrapers)
self.headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'}
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"
}
def __str__(self):
return self.__version__
def get_username_from_url(self, url: str) -> str:
"""Extract a channel's username from its URL.
"""Extract a channel's username from its URL.
Parameters
----------
url: str
URL of the channel on a given platform
e.g. ``"https://twitter.com/EliotHiggins"``
Returns
-------
username: str
Extracted username of the channel.
e.g. ``"EliotHiggins"``
"""
raise NotImplementedError
def url_to_key(self, url: str, content_type: str) -> str:
@@ -70,21 +74,21 @@ class Scraper:
Parameters
---------
url: str
URL of original post.
URL of original post.
e.g. ``"https://twitter.com/bellingcat/status/1503397267675533313"``
content_type: str
Content-Type of media.
Content-Type of media.
e.g. ``"image/jpeg"``
Returns
-------
key: str
Unique identifier for the media file from a specified post based on
the original post URL and the media's Content-Type.
Unique identifier for the media file from a specified post based on
the original post URL and the media's Content-Type.
"""
key = urlparse(url).path.split('/')[-1]
return key
key = urlparse(url).path.split("/")[-1]
return key
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
"""Download media file from a specified media file URL.
@@ -92,7 +96,7 @@ class Scraper:
Parameters
---------
url: str
URL of media file from original post.
URL of media file from original post.
e.g. ``"https://pbs.twimg.com/media/FN0j0dYWUAcQxfK?format=png&name=medium"``
key: str or None
Pre-defined unique identifier for the media file.
@@ -100,18 +104,18 @@ class Scraper:
Returns
-------
blob: bytes
Raw bytes of the downloaded media file.
Raw bytes of the downloaded media file.
content_type: str
Content-Type of media.
Content-Type of media.
e.g. ``"image/jpeg"``.
key: str
Unique identifier for the media file.
"""
r = make_request(url, headers = self.headers)
r = make_request(url, headers=self.headers)
blob = r.content
content_type = r.headers.get('Content-Type')
content_type = r.headers.get("Content-Type")
if key is None:
key = self.url_to_key(url, content_type)
@@ -119,13 +123,13 @@ class Scraper:
return blob, content_type, key
def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
"""Download media file from a specified media URL, where the media file
"""Download media file from a specified media URL, where the media file
is formatted as an m3u8 playlist, which is then decoded to an mp4 file.
Parameters
---------
url: str
URL of m3u8 playlist file from original post.
URL of m3u8 playlist file from original post.
e.g. ``"https://media.gettr.com/group47/origin/2022/03/15/01/cbc436c1-1a1a-4b97-671d-c42109f3ec9b/out.m3u8"``
key: str or None
Pre-defined unique identifier for the media file.
@@ -133,42 +137,41 @@ class Scraper:
Returns
-------
blob: bytes
Raw bytes of the downloaded media file.
Raw bytes of the downloaded media file.
content_type: str
Content-Type of media.
Content-Type of media.
e.g. ``"video/mp4"``.
key: str
Unique identifier for the media file.
"""
content_type = 'video/mp4'
ext = '.' + content_type.split('/')[-1]
with tempfile.NamedTemporaryFile(suffix = ext) as temp_file:
content_type = "video/mp4"
ext = "." + content_type.split("/")[-1]
with tempfile.NamedTemporaryFile(suffix=ext) as temp_file:
(
ffmpeg
.input(url)
.output(temp_file.name, vcodec='copy')
.global_args('-loglevel', 'error')
.run(overwrite_output=True))
ffmpeg.input(url)
.output(temp_file.name, vcodec="copy")
.global_args("-loglevel", "error")
.run(overwrite_output=True)
)
temp_file.seek(0)
blob = temp_file.read()
if key is None:
key = self.url_to_key(url = url, content_type = content_type)
key = self.url_to_key(url=url, content_type=content_type)
return blob, content_type, key
def ytdlp_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
"""Download media file from a specified media URL, using a fork of
"""Download media file from a specified media URL, using a fork of
youtube-dl that enables faster downloading.
Parameters
---------
url: str
URL of media file from original post.
URL of media file from original post.
e.g. ``"https://rumble.com/embed/vgt7gh/"``
key: str or None
Pre-defined unique identifier for the media file.
@@ -176,21 +179,21 @@ class Scraper:
Returns
-------
blob: bytes
Raw bytes of the downloaded media file.
Raw bytes of the downloaded media file.
content_type: str
Content-Type of media.
Content-Type of media.
e.g. ``"video/mp4"``.
key: str
Unique identifier for the media file.
"""
content_type = 'video/mp4'
content_type = "video/mp4"
with tempfile.TemporaryDirectory() as temp_dir:
cookiefile = Path(temp_dir)/self.cookiefilename
with open(cookiefile, 'w') as f:
cookiefile = Path(temp_dir) / self.cookiefilename
with open(cookiefile, "w") as f:
f.write(self.cookiestring)
ydl_opts = {
"format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
"merge_output_format": "mp4",
@@ -199,37 +202,39 @@ class Scraper:
"quiet": True,
"verbose": False,
"retries": 5,
"cookiefile": cookiefile}
"cookiefile": cookiefile,
}
ydl = yt_dlp.YoutubeDL(ydl_opts)
try:
meta = ydl.extract_info(
url,
download=True,)
download=True,
)
except yt_dlp.utils.DownloadError as e:
raise e
else:
video_id = meta["id"]
video_ext = meta["ext"]
with open(f"{temp_dir}/{video_id}.{video_ext}", "rb") as f:
blob = f.read()
if key is None:
key = self.url_to_key(url = url, content_type = content_type)
key = self.url_to_key(url=url, content_type=content_type)
return blob, content_type, key
def archive_blob(self, blob: bytes, content_type: str, key: str) -> str:
"""Upload raw bytes of a media file to the storage archive.
"""Upload raw bytes of a media file to the storage archive.
Parameters
----------
blob: bytes
Raw bytes of the media file to be archived.
content_type: str
Content-Type of media.
Content-Type of media.
e.g. ``"video/mp4"``.
key: str
Unique identifier for the media file.
@@ -240,18 +245,22 @@ class Scraper:
URL specifying the file on the storage archive.
"""
filename = self.__version__.replace(' ', '_') + '/' + key
filename = self.__version__.replace(" ", "_") + "/" + key
self.s3_client.upload_fileobj(BytesIO(blob), Bucket=os.environ[
'DO_BUCKET'], Key=filename, ExtraArgs={'ACL': 'public-read', 'ContentType': content_type})
self.s3_client.upload_fileobj(
BytesIO(blob),
Bucket=os.environ["DO_BUCKET"],
Key=filename,
ExtraArgs={"ACL": "public-read", "ContentType": content_type},
)
archived_url = os.environ['DO_URL'] + '/' + filename
archived_url = os.environ["DO_URL"] + "/" + filename
return archived_url
@logger.catch
def archive_files(self, result: ScraperResult) -> ScraperResult:
"""Archive files corresponding to ``archived_url`` dict keys, if the
"""Archive files corresponding to ``archived_url`` dict keys, if the
files have not previously been archived.
Parameters
@@ -280,19 +289,21 @@ class Scraper:
Parameters
----------
channel: Channel
Channel to be scraped.
Channel to be scraped.
Returns
-------
bool
``True`` if the scraper is capable of scraping ``channel``,
``False`` if not.
``False`` if not.
"""
raise NotImplementedError
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
def get_posts(
self, channel: Channel, since: ScraperResult = None
) -> Generator[ScraperResult, None, None]:
"""Scrape all posts from the specified Channel.
Parameters
@@ -300,7 +311,7 @@ class Scraper:
channel: Channel
Channel to be scraped.
since: ScraperResult or None
Most recently scraped ScraperResult from a previous scrape, or
Most recently scraped ScraperResult from a previous scrape, or
``None`` if scraper has not run before.
Yields
@@ -308,7 +319,7 @@ class Scraper:
ScraperResult
Scraper result from a single post/comment from the specified Channel.
"""
raise NotImplementedError
@@ -342,8 +353,7 @@ class ScraperController:
self.scrapers.extend(scrapers)
def remove_all_scrapers(self):
"""Reset the ScraperController so that it doesn't control any scrapers
"""
"""Reset the ScraperController so that it doesn't control any scrapers"""
self.scrapers = []
def scrape_all_channels(self, fetch_old: bool = False):
@@ -362,15 +372,23 @@ class ScraperController:
session = self.session()
# TODO there should be a better/more generic way of selecting scrapeable channels
channels = session.query(Channel).filter((Channel.source=='researcher')|(Channel.source=='snowball_it')|(Channel.source=='snowball_complete')|(Channel.source=='linked_channel')).all()
channels = (
session.query(Channel)
.filter(
(Channel.source == "researcher")
| (Channel.source == "snowball_it")
| (Channel.source == "snowball_complete")
| (Channel.source == "linked_channel")
)
.all()
)
session.close()
return self.scrape_channels(channels, fetch_old=fetch_old)
def scrape_all_channel_info(self):
"""Scrape profile information from all channels in the database.
"""
"""Scrape profile information from all channels in the database."""
if self.session is None:
logger.error("No DB session")
return
@@ -379,17 +397,34 @@ class ScraperController:
# Because of rate limiting, we may not be able to succesfully scrape info for all of these channels.
# This will sort the channels by the least recently scraped.
most_recently_archived = session.query(func.max(RawChannelInfo.date_archived).label("date"), RawChannelInfo.channel.label("channel")).group_by(RawChannelInfo.channel).subquery()
channels = session.query(Channel).\
filter((Channel.source=='researcher')|(Channel.source=='snowball_it')|(Channel.source=='snowball_complete')|(Channel.source=='linked_channel')).\
outerjoin(most_recently_archived, Channel.id == most_recently_archived.c.channel).\
order_by(nullsfirst(most_recently_archived.c.date.asc())).all()
most_recently_archived = (
session.query(
func.max(RawChannelInfo.date_archived).label("date"),
RawChannelInfo.channel.label("channel"),
)
.group_by(RawChannelInfo.channel)
.subquery()
)
channels = (
session.query(Channel)
.filter(
(Channel.source == "researcher")
| (Channel.source == "snowball_it")
| (Channel.source == "snowball_complete")
| (Channel.source == "linked_channel")
)
.outerjoin(
most_recently_archived, Channel.id == most_recently_archived.c.channel
)
.order_by(nullsfirst(most_recently_archived.c.date.asc()))
.all()
)
session.close()
return self.scrape_channel_info(channels)
def scrape_channels(self, channels: List[Channel], fetch_old: bool = False):
"""Scrape all posts from a specified list of channels.
"""Scrape all posts from a specified list of channels.
Parameters
----------
@@ -408,12 +443,17 @@ class ScraperController:
# If any channels are not already in the database, add them
for channel in channels:
platform_id = None
if channel.platform_id not in (None, ''):
if channel.platform_id not in (None, ""):
platform_id = channel.platform_id
channel_in_db = session.query(Channel).filter_by(platform_id=platform_id, platform=channel.platform, url=channel.url).first()
channel_in_db = (
session.query(Channel)
.filter_by(
platform_id=platform_id, platform=channel.platform, url=channel.url
)
.first()
)
if not channel_in_db:
logger.debug(f"{channel} does not exist in database, adding")
@@ -429,13 +469,17 @@ class ScraperController:
handled = True
added = 0
if fetch_old and channel.platform == 'Telegram':
if fetch_old and channel.platform == "Telegram":
# get oldest post (currently only for Telegram)
# TODO fix this so that it doesn't have an explicit check on channel.platform (should be generic)
# TODO implement until on all scrapers
rows = session.query(ScraperResult).where(
ScraperResult.channel == channel.id).order_by(
ScraperResult.date.asc(), ScraperResult.id.desc()).limit(10).all()
rows = (
session.query(ScraperResult)
.where(ScraperResult.channel == channel.id)
.order_by(ScraperResult.date.asc(), ScraperResult.id.desc())
.limit(10)
.all()
)
if len(rows) > 0:
until = rows[0]
@@ -449,9 +493,13 @@ class ScraperController:
# Note: a "bug" in Postgres can cause this query to hang for a really long time
# when searching for a single row, hence the limit(10).all() when we really just need
# the first row.
rows = session.query(ScraperResult).where(
ScraperResult.channel == channel.id).order_by(
ScraperResult.date.desc(), ScraperResult.id.asc()).limit(10).all()
rows = (
session.query(ScraperResult)
.where(ScraperResult.channel == channel.id)
.order_by(ScraperResult.date.desc(), ScraperResult.id.asc())
.limit(10)
.all()
)
if len(rows) > 0:
since = rows[0]
@@ -466,8 +514,7 @@ class ScraperController:
added += 1
session.commit()
logger.info(
f"{scraper} found {added} new posts from {channel}")
logger.info(f"{scraper} found {added} new posts from {channel}")
break
if not handled:
@@ -475,7 +522,7 @@ class ScraperController:
session.close()
def archive_unarchived_media_batch(self, session = None, chronological=False):
def archive_unarchived_media_batch(self, session=None, chronological=False):
"""Archive previously unarchived media URLs from a batch of raw_post rows.
Parameters
@@ -489,11 +536,24 @@ class ScraperController:
if session is None:
session = self.session()
if chronological:
posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).where(ScraperResult.id >= 0).order_by(ScraperResult.date.desc()).limit(5000).all()
posts = (
session.query(ScraperResult)
.where(ScraperResult.media_archived == None)
.where(ScraperResult.id >= 0)
.order_by(ScraperResult.date.desc())
.limit(5000)
.all()
)
else:
# this query is really slow (~2.5 minutes) because of the shuffle. shuffling is so that multiple media archivers could work
# simultaneously with low risk of collision (at least while the number of unarchived items is very large)
posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).order_by(func.random()).limit(5000).all()
posts = (
session.query(ScraperResult)
.where(ScraperResult.media_archived == None)
.order_by(func.random())
.limit(5000)
.all()
)
logger.info(f"Found {len(posts)} posts without media. Archiving now")
@@ -502,23 +562,33 @@ class ScraperController:
for scraper in self.scrapers:
# compare major versions
if post.scraper is not None and scraper.__version__.split('.')[0] == post.scraper.split('.')[0]:
if (
post.scraper is not None
and scraper.__version__.split(".")[0] == post.scraper.split(".")[0]
):
handled = True
logger.debug(f"{scraper} is archiving media for ID {post.id}")
post = scraper.archive_files(post)
if post:
session.query(ScraperResult).where(ScraperResult.id == post.id).update({'archived_urls': post.archived_urls, 'media_archived': post.media_archived})
session.query(ScraperResult).where(
ScraperResult.id == post.id
).update(
{
"archived_urls": post.archived_urls,
"media_archived": post.media_archived,
}
)
session.commit()
break
if not handled:
logger.warning(f"No handler found for post scraped with {post.scraper}")
session.commit()
@logger.catch(reraise = True)
@logger.catch(reraise=True)
def archive_unarchived_media(self, chronological=False):
"""Archive previously unarchived media URLs from all raw_post rows.
@@ -535,11 +605,13 @@ class ScraperController:
session = self.session()
while True:
self.archive_unarchived_media_batch(self, session=session, chronological=chronological)
@logger.catch(reraise = True)
self.archive_unarchived_media_batch(
self, session=session, chronological=chronological
)
@logger.catch(reraise=True)
def scrape_channel_info(self, channels: List[Channel]):
"""Scrape channel info for specified channels.
"""Scrape channel info for specified channels.
Parameters
----------
@@ -571,8 +643,7 @@ class ScraperController:
session.add(info)
session.commit()
logger.info(
f"{scraper} found {info}")
logger.info(f"{scraper} found {info}")
break
except ChannelDoesNotExistError:
logger.warning(f"ChannelDoesNotExist {channel}")
@@ -590,7 +661,7 @@ class ScraperController:
engine: sqlalchemy.engine.Engine
Instance of SQLAlchemy engine to connect to
"""
# create tables
mapper_registry.metadata.create_all(bind=engine)
@@ -599,13 +670,13 @@ class ScraperController:
self.session.configure(bind=self.engine)
def reset_db(self):
"""Drop all data from the connected SQLAlchemy database.
"""
"""Drop all data from the connected SQLAlchemy database."""
close_all_sessions()
mapper_registry.metadata.drop_all(bind=self.engine)
self.connect_to_db(self.engine)
class ChannelDoesNotExistError(Exception):
"""The specified channel does not exist or has been deleted."""

View File

@@ -1,6 +1,6 @@
from datetime import datetime, timezone
import time
import re
import re
from html.parser import HTMLParser
import dateparser
import json
@@ -14,106 +14,130 @@ from loguru import logger
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
class BitchuteScraper(Scraper):
"""An implementation of a Scraper for Bitchute, using classes from the 4cat
library"""
__version__ = "BitchuteScraper 0.0.1"
def get_username_from_url(self, url):
username = url.split('bitchute.com/channel/')[-1].strip('/')
username = url.split("bitchute.com/channel/")[-1].strip("/")
return username
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
def get_posts(
self, channel: Channel, since: ScraperResult = None
) -> Generator[ScraperResult, None, None]:
session = requests.Session()
session.headers.update(self.headers)
request = session.get("https://www.bitchute.com/search")
csrftoken = BeautifulSoup(request.text, 'html.parser').findAll(
"input", {"name": "csrfmiddlewaretoken"})[0].get("value")
csrftoken = (
BeautifulSoup(request.text, "html.parser")
.findAll("input", {"name": "csrfmiddlewaretoken"})[0]
.get("value")
)
time.sleep(0.25)
detail = 'comments'
detail = "comments"
username = self.get_username_from_url(channel.url)
scraper = get_videos_user(session, username, csrftoken, detail)
for post in scraper:
if since is not None and datetime.fromtimestamp(post['timestamp']) <= since.date:
if (
since is not None
and datetime.fromtimestamp(post["timestamp"]) <= since.date
):
break
archived_urls = {}
if 'video_url' in post:
url = post['video_url']
archived_urls[url] = None
if "video_url" in post:
url = post["video_url"]
archived_urls[url] = None
yield ScraperResult(
scraper=self.__version__,
platform="Bitchute",
channel=channel.id,
platform_id=post['id'],
date=datetime.fromtimestamp(post['timestamp']),
platform_id=post["id"],
date=datetime.fromtimestamp(post["timestamp"]),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post),
archived_urls=archived_urls,
media_archived=None)
media_archived=None,
)
def can_handle(self, channel):
if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
if (
channel.platform == "Bitchute"
and self.get_username_from_url(channel.url) is not None
):
return True
@logger.catch
def get_profile(self, channel: Channel) -> RawChannelInfo:
base_url = channel.url
session = requests.session()
response = session.get(base_url)
soup = BeautifulSoup(response.content, 'html.parser')
canonical_url = soup.find('link', {'id' : 'canonical'})['href']
csrftoken = session.cookies['csrftoken']
csrfmiddlewaretoken = soup.find('input', {'name' : 'csrfmiddlewaretoken'})['value']
soup = BeautifulSoup(response.content, "html.parser")
about_soup = soup.find('div', {'id' : 'channel-about'})
info_list = about_soup.find('div', {'class' : 'channel-about-details'}).find_all('p')
description_soup = about_soup.find('div', {'id' : 'channel-description'})
canonical_url = soup.find("link", {"id": "canonical"})["href"]
csrftoken = session.cookies["csrftoken"]
csrfmiddlewaretoken = soup.find("input", {"name": "csrfmiddlewaretoken"})[
"value"
]
headers = {'Referer': base_url}
data = {
'csrftoken': csrftoken,
'csrfmiddlewaretoken': csrfmiddlewaretoken}
about_soup = soup.find("div", {"id": "channel-about"})
info_list = about_soup.find("div", {"class": "channel-about-details"}).find_all(
"p"
)
description_soup = about_soup.find("div", {"id": "channel-description"})
response = session.post(canonical_url + 'counts/', data = data, headers = headers)
headers = {"Referer": base_url}
data = {"csrftoken": csrftoken, "csrfmiddlewaretoken": csrfmiddlewaretoken}
response = session.post(canonical_url + "counts/", data=data, headers=headers)
counts = json.loads(response.text)
owner_soup = soup.find('p', {'class' : 'owner'})
if owner_soup.text == '[email\xa0protected]':
owner_name = decode_cfemail(owner_soup.find('span', {'class': "__cf_email__"})['data-cfemail'])
owner_soup = soup.find("p", {"class": "owner"})
if owner_soup.text == "[email\xa0protected]":
owner_name = decode_cfemail(
owner_soup.find("span", {"class": "__cf_email__"})["data-cfemail"]
)
else:
owner_name = owner_soup.text
profile = {
'description' : description_soup.text.strip(),
'description_links' : [a['href'] for a in description_soup.find_all('a', href = True)],
'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')),
'videos' : int(info_list[1].text.split('videos')[0].strip()),
'owner_url' : soup.find('p', {'class' : 'owner'}).find('a', href = True)['href'],
'owner_name' : owner_name,
'image' : about_soup.find('img', {'alt' : 'Channel Image'}).get('data-src'),
'subscribers': counts['subscriber_count'],
'views': int(counts['about_view_count'].split(' ')[0])}
return RawChannelInfo(scraper=self.__version__,
"description": description_soup.text.strip(),
"description_links": [
a["href"] for a in description_soup.find_all("a", href=True)
],
"created": re.sub(
r"\s", " ", info_list[0].text.split("Created")[1].strip(". ")
),
"videos": int(info_list[1].text.split("videos")[0].strip()),
"owner_url": soup.find("p", {"class": "owner"}).find("a", href=True)[
"href"
],
"owner_name": owner_name,
"image": about_soup.find("img", {"alt": "Channel Image"}).get("data-src"),
"subscribers": counts["subscriber_count"],
"views": int(counts["about_view_count"].split(" ")[0]),
}
return RawChannelInfo(
scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile, default = str),
date_archived=datetime.now(timezone.utc))
raw_data=json.dumps(profile, default=str),
date_archived=datetime.now(timezone.utc),
)
def strip_tags(html, convert_newlines=True):
r"""
Strip HTML from a string
@@ -149,6 +173,7 @@ def strip_tags(html, convert_newlines=True):
stripper.feed(html)
return stripper.get_data()
def request_from_bitchute(session, method, url, headers=None, data=None):
"""
Request something via the BitChute API (or non-API)
@@ -176,7 +201,10 @@ def request_from_bitchute(session, method, url, headers=None, data=None):
raise NotImplemented()
if request.status_code >= 300:
raise ValueError("Response %i from BitChute for URL %s, need to retry" % (request.status_code, url))
raise ValueError(
"Response %i from BitChute for URL %s, need to retry"
% (request.status_code, url)
)
response = request.json()
return response
@@ -193,6 +221,7 @@ def request_from_bitchute(session, method, url, headers=None, data=None):
return response
def append_details(video, detail):
"""
Append extra metadata to video data
@@ -214,7 +243,7 @@ def append_details(video, detail):
"comments": "",
"hashtags": "",
"parent_id": "",
"video_url": ""
"video_url": "",
}
try:
@@ -223,15 +252,23 @@ def append_details(video, detail):
video_session = requests.session()
video_page = video_session.get(video["url"])
if "<h1 class=\"page-title\">Video Restricted</h1>" in video_page.text or \
"<h1 class=\"page-title\">Video Blocked</h1>" in video_page.text or \
"<h1 class=\"page-title\">Channel Blocked</h1>" in video_page.text or \
"<h1 class=\"page-title\">Channel Restricted</h1>" in video_page.text:
if "This video is unavailable as the contents have been deemed potentially illegal" in video_page.text:
if (
'<h1 class="page-title">Video Restricted</h1>' in video_page.text
or '<h1 class="page-title">Video Blocked</h1>' in video_page.text
or '<h1 class="page-title">Channel Blocked</h1>' in video_page.text
or '<h1 class="page-title">Channel Restricted</h1>' in video_page.text
):
if (
"This video is unavailable as the contents have been deemed potentially illegal"
in video_page.text
):
video["category"] = "moderated-illegal"
return (video, [])
elif "Viewing of this video is restricted, as it has been marked as Not Safe For Life" in video_page.text:
elif (
"Viewing of this video is restricted, as it has been marked as Not Safe For Life"
in video_page.text
):
video["category"] = "moderated-nsfl"
return (video, [])
@@ -255,39 +292,47 @@ def append_details(video, detail):
video["category"] = "moderated-other"
return (video, [])
elif "<iframe class=\"rumble\"" in video_page.text:
elif '<iframe class="rumble"' in video_page.text:
# some videos are actually embeds from rumble?
# these are iframes, so at the moment we cannot simply extract
# their info from the page, so we skip them. In the future we
# could add an extra request to get the relevant info, but so
# far the only examples I've seen are actually 'video not found'
video = {
**video,
"category": "error-embed-from-rumble"
}
video = {**video, "category": "error-embed-from-rumble"}
return (video, [])
elif video_page.status_code != 200:
video = {
**video,
"category": "error-%i" % video_page.status_code
}
video = {**video, "category": "error-%i" % video_page.status_code}
return (video, [])
soup = BeautifulSoup(video_page.text, 'html.parser')
video_csfrtoken = soup.findAll("input", {"name": "csrfmiddlewaretoken"})[0].get("value")
soup = BeautifulSoup(video_page.text, "html.parser")
video_csfrtoken = soup.findAll("input", {"name": "csrfmiddlewaretoken"})[0].get(
"value"
)
video["video_url"] = soup.select_one("video#player source").get("src")
video["thumbnail_image"] = soup.select_one("video#player").get("poster")
video["subject"] = soup.select_one("h1#video-title").text
video["author_id"] = soup.select_one("p.owner a").get("href").split("/")[2]
video["author"] = soup.select_one("div.channel-banner p.name a").get("href").split("/")[2]
video["body"] = soup.select_one("div#video-description").encode_contents().decode("utf-8").strip()
video["author"] = (
soup.select_one("div.channel-banner p.name a").get("href").split("/")[2]
)
video["body"] = (
soup.select_one("div#video-description")
.encode_contents()
.decode("utf-8")
.strip()
)
# we need *two more requests* to get the comment count and like/dislike counts
# this seems to be because bitchute uses a third-party comment widget
video_session.headers = {'Referer': video["url"], 'Origin': video["url"]}
counts = request_from_bitchute(video_session, "POST", "https://www.bitchute.com/video/%s/counts/" % video["id"], data={"csrfmiddlewaretoken": video_csfrtoken})
video_session.headers = {"Referer": video["url"], "Origin": video["url"]}
counts = request_from_bitchute(
video_session,
"POST",
"https://www.bitchute.com/video/%s/counts/" % video["id"],
data={"csrfmiddlewaretoken": video_csfrtoken},
)
if detail == "comments":
# if comments are also to be scraped, this is anothe request to make, which returns
@@ -308,7 +353,12 @@ def append_details(video, detail):
comment_count = 0
url = comment_script.split("'")[1]
comment_csrf = comment_script.split("'")[3]
comments_data = request_from_bitchute(video_session, "POST", url + "/api/get_comments/", data={"cf_auth": comment_csrf, "commentCount": 0})
comments_data = request_from_bitchute(
video_session,
"POST",
url + "/api/get_comments/",
data={"cf_auth": comment_csrf, "commentCount": 0},
)
for comment in comments_data:
comment_count += 1
@@ -318,47 +368,61 @@ def append_details(video, detail):
else:
thumbnail_image = ""
comments.append({
"id": comment["id"],
"thread_id": video["id"],
"subject": "",
"body": comment["content"],
"author": comment["fullname"],
"author_id": comment["creator"],
"timestamp": int(dateparser.parse(comment["created"]).timestamp()),
"url": "",
"views": "",
"length": "",
"hashtags": "",
"thumbnail_image": thumbnail_image,
"likes": comment["upvote_count"],
"category": "comment",
"dislikes": "",
"channel_subscribers": "",
"comments": "",
"parent_id": comment.get("parent", "") if "parent" in comment else video["id"],
})
comments.append(
{
"id": comment["id"],
"thread_id": video["id"],
"subject": "",
"body": comment["content"],
"author": comment["fullname"],
"author_id": comment["creator"],
"timestamp": int(
dateparser.parse(comment["created"]).timestamp()
),
"url": "",
"views": "",
"length": "",
"hashtags": "",
"thumbnail_image": thumbnail_image,
"likes": comment["upvote_count"],
"category": "comment",
"dislikes": "",
"channel_subscribers": "",
"comments": "",
"parent_id": comment.get("parent", "")
if "parent" in comment
else video["id"],
}
)
else:
# if we don't need the full comments, we still need another request to get the *amount*
# of comments,
comment_count = request_from_bitchute(video_session, "POST",
comment_count = request_from_bitchute(
video_session,
"POST",
"https://commentfreely.bitchute.com/api/get_comment_count/",
data={"csrfmiddlewaretoken": video_csfrtoken,
"cf_thread": "bc_" + video["id"]})["commentCount"]
data={
"csrfmiddlewaretoken": video_csfrtoken,
"cf_thread": "bc_" + video["id"],
},
)["commentCount"]
except RuntimeError as e:
# we wrap this in one big try-catch because doing it for each request separarely is tedious
# hm... maybe this should be in a helper function
# self.dataset.update_status("Error while interacting with BitChute (%s) - try again later." % e,
# is_final=True)
# self.dataset.update_status("Error while interacting with BitChute (%s) - try again later." % e,
# is_final=True)
return (None, None)
# again, no structured info available for the publication date, but at least we can extract the
# exact day it was uploaded
try:
published = dateparser.parse(
soup.find(class_="video-publish-date").text.split("published at")[1].strip()[:-1])
soup.find(class_="video-publish-date")
.text.split("published at")[1]
.strip()[:-1]
)
except AttributeError as e:
# publication date not on page?
published = None
@@ -373,7 +437,7 @@ def append_details(video, detail):
"comments": comment_count,
"parent_id": "",
"hashtags": ",".join([tag.text for tag in soup.select("#video-hashtags li a")]),
"views": counts["view_count"]
"views": counts["view_count"],
}
if published:
@@ -383,6 +447,7 @@ def append_details(video, detail):
time.sleep(0.25)
return (video, comments)
def get_videos_user(session, user, csrftoken, detail):
"""
Scrape videos for given BitChute user
@@ -397,27 +462,31 @@ def get_videos_user(session, user, csrftoken, detail):
max_items = 100
num_items = 0
offset = 0
base_url = "https://www.bitchute.com/channel/%s/" % user
url = base_url + "extend/"
container = session.get(base_url)
container_soup = BeautifulSoup(container.text, 'html.parser')
headers = {'Referer': base_url, 'Origin': "https://www.bitchute.com/"}
container_soup = BeautifulSoup(container.text, "html.parser")
headers = {"Referer": base_url, "Origin": "https://www.bitchute.com/"}
while True:
post_data = {
"csrfmiddlewaretoken": csrftoken,
"name": "",
"offset": str(offset),
}
post_data = {"csrfmiddlewaretoken": csrftoken, "name": "", "offset": str(offset)}
response = request_from_bitchute(
session, "POST", url, headers=headers, data=post_data
)
response = request_from_bitchute(session, "POST", url, headers=headers, data=post_data)
soup = BeautifulSoup(response["html"], 'html.parser')
soup = BeautifulSoup(response["html"], "html.parser")
videos = soup.select(".channel-videos-container")
comments = []
if len(videos) == 0 or num_items >= max_items:
break
for video_element in videos:
if num_items >= max_items:
@@ -432,16 +501,26 @@ def get_videos_user(session, user, csrftoken, detail):
"id": link["href"].split("/")[-2],
"thread_id": link["href"].split("/")[-2],
"subject": link.text,
"body": strip_tags(video_element.select_one(".channel-videos-text").text),
"body": strip_tags(
video_element.select_one(".channel-videos-text").text
),
"author": container_soup.select_one(".details .name a").text,
"author_id": container_soup.select_one(".details .name a")["href"].split("/")[2],
"author_id": container_soup.select_one(".details .name a")[
"href"
].split("/")[2],
"timestamp": int(
dateparser.parse(
video_element.select_one(".channel-videos-details.text-right.hidden-xs").text).timestamp()),
video_element.select_one(
".channel-videos-details.text-right.hidden-xs"
).text
).timestamp()
),
"url": "https://www.bitchute.com" + link["href"],
"views": video_element.select_one(".video-views").text.strip(),
"length": video_element.select_one(".video-duration").text.strip(),
"thumbnail_image": video_element.select_one(".channel-videos-image img")["src"],
"thumbnail_image": video_element.select_one(
".channel-videos-image img"
)["src"],
}
if detail != "basic":
@@ -456,15 +535,14 @@ def get_videos_user(session, user, csrftoken, detail):
# before the video, which is weird
yield comment
def decode_cfemail(cfemail):
"""https://stackoverflow.com/questions/36911296/scraping-of-protected-email
"""
"""https://stackoverflow.com/questions/36911296/scraping-of-protected-email"""
email = ""
k = int(cfemail[:2], 16)
for i in range(2, len(cfemail)-1, 2):
email += chr(int(cfemail[i:i+2], 16)^k)
for i in range(2, len(cfemail) - 1, 2):
email += chr(int(cfemail[i : i + 2], 16) ^ k)
return email
return email

View File

@@ -9,8 +9,10 @@ from gogettr import PublicClient
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
class GettrScraper(Scraper):
"""An implementation of a Scraper for Gettr, using gogettr library"""
__version__ = "GettrScraper 0.0.1"
def get_username_from_url(self, url):
@@ -21,49 +23,58 @@ class GettrScraper(Scraper):
return username
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
def get_posts(
self, channel: Channel, since: ScraperResult = None
) -> Generator[ScraperResult, None, None]:
client = PublicClient()
username = self.get_username_from_url(channel.url).lower()
scraper = client.user_activity(username=username, type="posts")
for post in scraper:
if since is not None and datetime.fromtimestamp(post['cdate']*0.001) <= since.date:
if (
since is not None
and datetime.fromtimestamp(post["cdate"] * 0.001) <= since.date
):
break
archived_urls = {}
if 'imgs' in post:
for img in post['imgs']:
if "imgs" in post:
for img in post["imgs"]:
url = "https://media.gettr.com/" + img
archived_urls[url] = None
if 'main' in post:
url = "https://media.gettr.com/" + post['main']
if "main" in post:
url = "https://media.gettr.com/" + post["main"]
archived_urls[url] = None
if 'ovid' in post:
url = "https://media.gettr.com/" + post['ovid']
if "ovid" in post:
url = "https://media.gettr.com/" + post["ovid"]
archived_urls[url] = None
yield ScraperResult(
scraper=self.__version__,
platform="Gettr",
channel=channel.id,
platform_id=post['_id'],
date=datetime.fromtimestamp(post['cdate']/1000.),
platform_id=post["_id"],
date=datetime.fromtimestamp(post["cdate"] / 1000.0),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post),
archived_urls=archived_urls,
media_archived=None)
media_archived=None,
)
def can_handle(self, channel):
if channel.platform == "Gettr" and self.get_username_from_url(channel.url) is not None:
if (
channel.platform == "Gettr"
and self.get_username_from_url(channel.url) is not None
):
return True
def url_to_key(self, url: str, content_type: str) -> str:
ext = '.' + content_type.split('/')[-1]
key = urlparse(url).path.split('/')[-2] + ext
return key
ext = "." + content_type.split("/")[-1]
key = urlparse(url).path.split("/")[-2] + ext
return key
@logger.catch
def get_profile(self, channel: Channel) -> RawChannelInfo:
@@ -71,8 +82,10 @@ class GettrScraper(Scraper):
username = self.get_username_from_url(channel.url)
profile = client.user_info(username)
return RawChannelInfo(scraper=self.__version__,
return RawChannelInfo(
scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))
date_archived=datetime.now(timezone.utc),
)

View File

@@ -10,25 +10,32 @@ import os
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper import Scraper, make_request
BASE_URL = 'https://rumble.com'
BASE_URL = "https://rumble.com"
class RumbleScraper(Scraper):
"""An implementation of a Scraper for Rumble, using custom functions"""
__version__ = "RumbleScraper 0.0.2"
cookiestring = os.environ["YOUTUBE_COOKIESTRING"].replace(r'\n', '\n').replace(r'\t', '\t')
cookiefilename = 'cookiefile.txt'
cookiestring = (
os.environ["YOUTUBE_COOKIESTRING"].replace(r"\n", "\n").replace(r"\t", "\t")
)
cookiefilename = "cookiefile.txt"
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
def get_posts(
self, channel: Channel, since: ScraperResult = None
) -> Generator[ScraperResult, None, None]:
scraper = get_channel_videos(channel.url)
for post in scraper:
if since is not None and post['datetime'].replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
if since is not None and post["datetime"].replace(
tzinfo=timezone.utc
) <= since.date.replace(tzinfo=timezone.utc):
break
url = post['media_url']
url = post["media_url"]
archived_urls = {url: None}
@@ -36,17 +43,18 @@ class RumbleScraper(Scraper):
scraper=self.__version__,
platform="Rumble",
channel=channel.id,
platform_id=post['media_url'].split('/')[-2],
date=post['datetime'].replace(tzinfo=timezone.utc),
platform_id=post["media_url"].split("/")[-2],
date=post["datetime"].replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post, default = str),
raw_data=json.dumps(post, default=str),
archived_urls=archived_urls,
media_archived=None)
media_archived=None,
)
def url_to_key(self, url: str, content_type: str) -> str:
ext = '.' + content_type.split('/')[-1]
key = urlparse(url).path.split('/')[-2] + ext
return key
ext = "." + content_type.split("/")[-1]
key = urlparse(url).path.split("/")[-2] + ext
return key
@logger.catch
def archive_files(self, result: ScraperResult) -> ScraperResult:
@@ -65,113 +73,117 @@ class RumbleScraper(Scraper):
@logger.catch
def get_profile(self, channel: Channel) -> RawChannelInfo:
profile = get_channel_profile(url=channel.url)
profile = get_channel_profile(url = channel.url)
return RawChannelInfo(scraper=self.__version__,
return RawChannelInfo(
scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc))
date_archived=datetime.now(timezone.utc),
)
def get_media_url(url):
r = make_request(url = url)
soup = BeautifulSoup(r.content, features = 'html.parser')
script = json.loads(''.join(soup.find('script', {'type':'application/ld+json'}).text))
media_url = script[0]['embedUrl']
r = make_request(url=url)
soup = BeautifulSoup(r.content, features="html.parser")
script = json.loads(
"".join(soup.find("script", {"type": "application/ld+json"}).text)
)
media_url = script[0]["embedUrl"]
return media_url
def process_video(video):
rumble_soup = video.find('span', {'class' : 'video-item--rumbles'})
if rumble_soup is None:
rumbles = '0'
else:
rumbles = rumble_soup['data-value']
view_span = video.find('span', {'class' : 'video-item--views'})
def process_video(video):
rumble_soup = video.find("span", {"class": "video-item--rumbles"})
if rumble_soup is None:
rumbles = "0"
else:
rumbles = rumble_soup["data-value"]
view_span = video.find("span", {"class": "video-item--views"})
if view_span is None:
views = None
else:
views = view_span.get('data-value')
author_a = video.find('a', {'rel': 'author'})
views = view_span.get("data-value")
author_a = video.find("a", {"rel": "author"})
if author_a is None:
author_id = None
author_name = None
else:
author_id = author_a['href'].split('/')[-1]
author_id = author_a["href"].split("/")[-1]
author_name = author_a.text
video_link = BASE_URL + video.find('a', href = True)['href']
r = make_request(url = video_link)
soup = BeautifulSoup(r.content, features = 'html.parser')
content_div = soup.find('div', {'class': 'container content media-description'})
video_link = BASE_URL + video.find("a", href=True)["href"]
r = make_request(url=video_link)
soup = BeautifulSoup(r.content, features="html.parser")
content_div = soup.find("div", {"class": "container content media-description"})
info = {
'title' : video.find('h3').text,
'thumbnail' : video.find('img')['src'],
'link' : video_link,
'views' : views,
'rumbles' : rumbles,
'content': '' if content_div is None else content_div.get_text('\n'),
'duration' : video.find('span', {'class' : 'video-item--duration'})['data-value'],
'datetime' : datetime.fromisoformat(video.find('time')['datetime']),
'author_id': author_id,
'author_name': author_name}
info['media_url'] = get_media_url(info['link'])
"title": video.find("h3").text,
"thumbnail": video.find("img")["src"],
"link": video_link,
"views": views,
"rumbles": rumbles,
"content": "" if content_div is None else content_div.get_text("\n"),
"duration": video.find("span", {"class": "video-item--duration"})["data-value"],
"datetime": datetime.fromisoformat(video.find("time")["datetime"]),
"author_id": author_id,
"author_name": author_name,
}
info["media_url"] = get_media_url(info["link"])
return info
def get_channel_videos(url):
page = 1
channel_url = f'{url}?page='
channel_url = f"{url}?page="
while True:
url = channel_url + str(page)
r = make_request(url = url, break_codes = [404])
r = make_request(url=url, break_codes=[404])
if r.status_code == 404:
break
soup = BeautifulSoup(r.content, features = 'html.parser')
soup = BeautifulSoup(r.content, features="html.parser")
video_list = soup.find_all('li', {'class' : 'video-listing-entry'})
video_list = soup.find_all("li", {"class": "video-listing-entry"})
for video in video_list:
yield process_video(video)
page += 1
def get_channel_profile(url):
channel_url = f"{url}"
r = make_request(url=channel_url)
soup = BeautifulSoup(r.content, features="lxml")
channel_url = f'{url}'
r = make_request(url = channel_url)
soup = BeautifulSoup(r.content, features = 'lxml')
verified_svg = soup.find("h1").find("svg", {"class": "listing-header--verified"})
thumbnail_soup = soup.find("img", {"class": "listing-header--thumb"})
cover_soup = soup.find("img", {"class": "listing-header--backsplash-img"})
verified_svg = soup.find('h1').find('svg', {'class' : 'listing-header--verified'})
thumbnail_soup = soup.find('img', {'class' : 'listing-header--thumb'})
cover_soup = soup.find('img', {'class' : 'listing-header--backsplash-img'})
author_a = soup.find('a', {'rel': 'author'})
author_a = soup.find("a", {"rel": "author"})
if author_a is None:
author_id = None
else:
author_id = author_a['href'].split('/')[-1]
author_id = author_a["href"].split("/")[-1]
profile = {
'name': soup.find('h1').text,
'id': author_id,
'verified': verified_svg is not None,
'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None,
'cover': cover_soup.get('src') if cover_soup else None,
'subscribers': soup.find('span', {'class' : 'subscribe-button-count'}).text}
return profile
"name": soup.find("h1").text,
"id": author_id,
"verified": verified_svg is not None,
"thumbnail": thumbnail_soup.get("src") if thumbnail_soup else None,
"cover": cover_soup.get("src") if cover_soup else None,
"subscribers": soup.find("span", {"class": "subscribe-button-count"}).text,
}
return profile

View File

@@ -7,26 +7,28 @@ from pathlib import Path
import time
from loguru import logger
from telethon.sync import TelegramClient
from telethon.sync import TelegramClient
from telethon.tl.functions.channels import GetFullChannelRequest
from telethon.tl import types
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
MEDIA_TYPES = ['photo', 'video', 'document', 'webpage']
MEDIA_TYPES = ["photo", "video", "document", "webpage"]
class TelegramTelethonScraper(Scraper):
"""An implementation of a Scraper for Telegram, using Telethon library"""
__version__ = "TelegramTelethonScraper 0.0.4"
client = None
def __init__(self, telethon_session_name = None):
def __init__(self, telethon_session_name=None):
super().__init__()
api_id = os.environ['TELEGRAM_API_ID']
api_hash = os.environ['TELEGRAM_API_HASH']
phone = os.environ['TELEGRAM_PHONE']
api_id = os.environ["TELEGRAM_API_ID"]
api_hash = os.environ["TELEGRAM_API_HASH"]
phone = os.environ["TELEGRAM_PHONE"]
if telethon_session_name is None:
telethon_session_name = phone
@@ -40,14 +42,14 @@ class TelegramTelethonScraper(Scraper):
self.client.disconnect()
def get_username_from_url(url):
username = url.split('https://t.me/')[1]
if username.startswith('s/'):
username = username.split('s/')[1]
username = url.split("https://t.me/")[1]
if username.startswith("s/"):
username = username.split("s/")[1]
return username
def get_channel_identifier(channel: Channel):
identifier = channel.platform_id
if identifier is None:
identifier = channel.screenname
if identifier is None:
@@ -63,14 +65,18 @@ class TelegramTelethonScraper(Scraper):
return result
if len(list(result.archived_urls.keys())) != 1:
logger.warning(f"Expected 1 key in archived_urls, found {result.archived_keys}")
logger.warning(
f"Expected 1 key in archived_urls, found {result.archived_keys}"
)
else:
key = list(result.archived_urls.keys())[0]
if result.archived_urls[key] is None:
raw = json.loads(result.raw_data)
message = self.client.get_messages(raw['peer_id']['channel_id'], ids=[raw['id']])
message = self.client.get_messages(
raw["peer_id"]["channel_id"], ids=[raw["id"]]
)
blob = None
output_file_with_ext = None
@@ -81,35 +87,43 @@ class TelegramTelethonScraper(Scraper):
if blob is not None:
# TODO specify Content-Type
archived_url = self.archive_blob(blob = blob, content_type = '', key = output_file_with_ext)
archived_url = self.archive_blob(
blob=blob, content_type="", key=output_file_with_ext
)
result.archived_urls[key] = archived_url
result.media_archived = datetime.now(timezone.utc)
else:
if output_file_with_ext == 'largefile':
logger.info("Because this was a large file, not clearing media data")
if output_file_with_ext == "largefile":
logger.info(
"Because this was a large file, not clearing media data"
)
return result
logger.warning("Downloaded blob was None")
result.archived_urls = {}
result.media_archived = datetime.now(timezone.utc)
return result
def archive_post_media(self, post : types.Message):
def archive_post_media(self, post: types.Message):
if post.media is None:
logger.debug("No media for post")
return None, None
if type(post.media) == types.MessageMediaDocument:
if post.media.document.size/(1024*1024) > 50:
logger.info(f"Skipping archive of large {type(post.media)} with size {post.media.document.size/(1024*1024)} MB")
if post.media.document.size / (1024 * 1024) > 50:
logger.info(
f"Skipping archive of large {type(post.media)} with size {post.media.document.size/(1024*1024)} MB"
)
return (None, "largefile")
logger.debug(f"Archiving {type(post.media)} with size {post.media.document.size/(1024*1024)} MB")
logger.debug(
f"Archiving {type(post.media)} with size {post.media.document.size/(1024*1024)} MB"
)
else:
logger.debug(f"Archiving {type(post.media)}")
key = f'{post.peer_id.channel_id}_{post.id}'
key = f"{post.peer_id.channel_id}_{post.id}"
with tempfile.TemporaryDirectory() as temp_dir:
output_file = Path(temp_dir, key)
@@ -122,8 +136,8 @@ class TelegramTelethonScraper(Scraper):
output_file_with_ext = os.listdir(temp_dir)[0]
filename = Path(temp_dir, output_file_with_ext)
with open(filename, 'rb') as f:
with open(filename, "rb") as f:
blob = f.read()
return (blob, output_file_with_ext)
@@ -132,28 +146,41 @@ class TelegramTelethonScraper(Scraper):
return True
# @logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, until: ScraperResult = None) -> Generator[ScraperResult, None, None]:
def get_posts(
self, channel: Channel, since: ScraperResult = None, until: ScraperResult = None
) -> Generator[ScraperResult, None, None]:
username = TelegramTelethonScraper.get_channel_identifier(channel)
if until is not None:
logger.info(f"Only getting old posts, up to ID {until.platform_id.split('/')[-1]}")
iterator = self.client.iter_messages(username, max_id=int(until.platform_id.split('/')[-1]), wait_time=0, limit=None)
logger.info(
f"Only getting old posts, up to ID {until.platform_id.split('/')[-1]}"
)
iterator = self.client.iter_messages(
username,
max_id=int(until.platform_id.split("/")[-1]),
wait_time=0,
limit=None,
)
else:
iterator = self.client.iter_messages(username)
post = None
for post in iterator:
post_url = f'{channel.url}/{post.id}'
post_url = f"{channel.url}/{post.id}"
logger.trace(f"Archiving post {post_url} from {post.date}")
if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
logger.info(f'Timestamp of post {post} is earlier than the previous archived timestamp {post.date.replace(tzinfo=timezone.utc)}')
if since is not None and post.date.replace(
tzinfo=timezone.utc
) <= since.date.replace(tzinfo=timezone.utc):
logger.info(
f"Timestamp of post {post} is earlier than the previous archived timestamp {post.date.replace(tzinfo=timezone.utc)}"
)
break
archived_urls = {}
media_archived = datetime(1970, 1, 1, 0, 0, 0, 0, tzinfo=timezone.utc)
if post.media is not None:
if post.media is not None:
archived_urls[post_url] = None
media_archived = None
@@ -166,10 +193,18 @@ class TelegramTelethonScraper(Scraper):
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post.to_dict(), default=str),
archived_urls=archived_urls,
media_archived=media_archived)
media_archived=media_archived,
)
if (post is not None and post.id > 1 and since is None) or (post is not None and since is not None and post.date.replace(tzinfo=timezone.utc) > since.date.replace(tzinfo=timezone.utc)):
logger.info(f"Last post ID is {post.id} / {post.date}, since is {since.date if since is not None else None}, until is {until.platform_id if until is not None else None}, starting again")
if (post is not None and post.id > 1 and since is None) or (
post is not None
and since is not None
and post.date.replace(tzinfo=timezone.utc)
> since.date.replace(tzinfo=timezone.utc)
):
logger.info(
f"Last post ID is {post.id} / {post.date}, since is {since.date if since is not None else None}, until is {until.platform_id if until is not None else None}, starting again"
)
new_until = ScraperResult(
scraper=self.__version__,
platform="Telegram",
@@ -179,19 +214,21 @@ class TelegramTelethonScraper(Scraper):
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post.to_dict(), default=str),
archived_urls=archived_urls,
media_archived=media_archived)
media_archived=media_archived,
)
for p in self.get_posts(channel, since=since, until=new_until):
yield p
yield p
@logger.catch
def get_profile(self, channel: Channel) -> RawChannelInfo:
username = TelegramTelethonScraper.get_channel_identifier(channel)
full_channel = self.client(GetFullChannelRequest(channel = username))
full_channel = self.client(GetFullChannelRequest(channel=username))
profile = full_channel.to_dict()
return RawChannelInfo(scraper=self.__version__,
return RawChannelInfo(
scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile, default=str),
date_archived=datetime.now(timezone.utc))
date_archived=datetime.now(timezone.utc),
)

View File

@@ -2,4 +2,4 @@ from .base import ETLController
from .bitchute import BitchuteTransformer
from .telegram_telethon import TelegramTelethonTransformer
from .rumble import RumbleTransformer
from .gettr import GettrTransformer
from .gettr import GettrTransformer

View File

@@ -7,7 +7,18 @@ from sqlalchemy.sql.expression import func
from collections import defaultdict
from datetime import datetime, timezone
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Media, Channel, mapper_registry, Image, Video, Audio
from cisticola.base import (
RawChannelInfo,
ChannelInfo,
ScraperResult,
Post,
Media,
Channel,
mapper_registry,
Image,
Video,
Audio,
)
class Transformer:
@@ -35,7 +46,9 @@ class Transformer:
pass
def transform(data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]:
def transform(
data: ScraperResult, insert: Callable
) -> Generator[Union[Post, Channel, Media], None, None]:
"""Transform a ScraperResult into objects with additional parameters for analysis. This function can
yield multiple objects, as it will find references to quoted/replied posts, media objects, and Channel
objects and provide all of these to be inserted into the database.
@@ -52,8 +65,8 @@ class Transformer:
pass
def transform_media(self, data: ScraperResult, transformed: Post, insert: Callable):
"""Transform a post's media attachment to standard form and insert into database.
"""Transform a post's media attachment to standard form and insert into database.
Parameters
----------
data: cisticola.base.ScraperResult
@@ -62,21 +75,32 @@ class Transformer:
Transformed post data of post that media file was attached to
insert: Callable
A function that either inserts the object into a database or finds an object with the
relevant unique constraints if applicable.
relevant unique constraints if applicable.
"""
for k in data.archived_urls:
if data.archived_urls[k]:
archived_url = data.archived_urls[k]
filename = archived_url.split('/')[-1]
ext = None if '.' not in filename else filename.split('.')[-1].lower()
filename = archived_url.split("/")[-1]
ext = None if "." not in filename else filename.split(".")[-1].lower()
media_kwargs = dict(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k, date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), transformer=self.__version__, scraper=data.scraper, platform=data.platform)
media_kwargs = dict(
url=archived_url,
post=transformed.id,
raw_id=data.id,
original_url=k,
date=data.date,
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc),
transformer=self.__version__,
scraper=data.scraper,
platform=data.platform,
)
if ext in ('mp4', 'mov', 'avi', 'mkv'):
if ext in ("mp4", "mov", "avi", "mkv"):
media_class = Video
elif ext in ('oga', 'mp3', "wav", 'aif', 'aiff', 'aac'):
elif ext in ("oga", "mp3", "wav", "aif", "aiff", "aac"):
media_class = Audio
elif ext in ('jpg', 'jpeg', 'png', 'gif', 'bmp', 'heic', 'tiff'):
elif ext in ("jpg", "jpeg", "png", "gif", "bmp", "heic", "tiff"):
media_class = Image
else:
logger.warning(f"Unknown file extension {ext}")
@@ -149,7 +173,7 @@ class ETLController:
Parameters
----------
obj:
obj:
Instance of ORM-mapped class in the ``cisticola.base`` module to be inserted into the database
session: sqlalchemy.orm.Session
SQLAlchemy Session that interfaces with the database
@@ -182,10 +206,10 @@ class ETLController:
def insert_or_select(self, obj, session, hydrate: bool = True):
"""Insert an object into the database or return an existing object from the database.
Regardless, the resulting object has an `id` attribute that can be referenced later.
Parameters
----------
obj:
obj:
Instance of ORM-mapped class in the ``cisticola.base`` module to be inserted into the database
session: sqlalchemy.orm.Session
SQLAlchemy Session that interfaces with the database
@@ -202,12 +226,32 @@ class ETLController:
# This is using some adhoc unique constraints that might be worth formalizing at some point
if type(obj) == Channel:
instance = session.query(Channel).filter(
(((Channel.url==obj.url)&(Channel.url!='')&(Channel.url is not None)&(Channel.url!='https://t.me/s/'))|
((Channel.platform_id==str(obj.platform_id))&(Channel.platform_id!='')&(Channel.platform_id is not None))|
((Channel.screenname==obj.screenname)&(Channel.screenname!='')&(Channel.screenname is not None)))&
(Channel.platform==obj.platform)).first()
instance = (
session.query(Channel)
.filter(
(
(
(Channel.url == obj.url)
& (Channel.url != "")
& (Channel.url is not None)
& (Channel.url != "https://t.me/s/")
)
| (
(Channel.platform_id == str(obj.platform_id))
& (Channel.platform_id != "")
& (Channel.platform_id is not None)
)
| (
(Channel.screenname == obj.screenname)
& (Channel.screenname != "")
& (Channel.screenname is not None)
)
)
& (Channel.platform == obj.platform)
)
.first()
)
elif type(obj) == Post:
return self.insert_post(obj, session, hydrate)
# instance = session.query(Post).filter_by(platform=obj.platform, platform_id=obj.platform_id).first()
@@ -220,15 +264,15 @@ class ETLController:
# return instance
# instance = session.query(type(obj)).filter_by(original_url=obj.original_url).first()
# # For Media objects we want to duplicate the entry to preserve the relationship with the post.
# # However, we also want to avoid rehydration, hence the code below:
# if instance:
# logger.info(f"Found matching media record, duplicating and inserting for new post")
# session.expunge(instance)
# make_transient(instance)
# instance.id = None
# session.expunge(instance)
# make_transient(instance)
# instance.id = None
# instance.post = obj.post
# instance.raw_id = obj.raw_id
@@ -240,18 +284,23 @@ class ETLController:
logger.info(f"Found matching DB entry for {obj}: {instance}")
if type(obj) == Channel:
if obj.source != instance.source and obj.source == 'linked_channel' and instance.source != 'researcher' and (instance.source is None or instance.source[:4] != 'snow'):
if (
obj.source != instance.source
and obj.source == "linked_channel"
and instance.source != "researcher"
and (instance.source is None or instance.source[:4] != "snow")
):
logger.info(f"Updating source to linked channel")
instance.source = obj.source
instance.notes = obj.notes
instance.category = obj.category
instance.country = obj.country
instance.influencer = obj.influencer
session.flush()
session.commit()
if (instance.platform_id is None or instance.platform_id == ''):
if instance.platform_id is None or instance.platform_id == "":
instance.platform_id = obj.platform_id
session.flush()
session.commit()
@@ -293,22 +342,35 @@ class ETLController:
handled = False
if transformer.can_handle(result):
logger.trace(f"{transformer} is handling result {result.id} ({result.date})")
logger.trace(
f"{transformer} is handling result {result.id} ({result.date})"
)
handled = True
transformer.transform(result, lambda obj: self.insert_or_select(obj, session, hydrate), session, lambda obj: self.insert_post(obj, session, hydrate, flush=False), lambda: self.flush_posts(session))
transformer.transform(
result,
lambda obj: self.insert_or_select(obj, session, hydrate),
session,
lambda obj: self.insert_post(
obj, session, hydrate, flush=False
),
lambda: self.flush_posts(session),
)
break
if handled == False:
logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})")
logger.warning(
f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})"
)
self.flush_posts(session)
session.commit()
@logger.catch(reraise=True)
def transform_all_untransformed(self, hydrate: bool = True, min_date=datetime(2010, 1, 1)):
def transform_all_untransformed(
self, hydrate: bool = True, min_date=datetime(2010, 1, 1)
):
"""Transform all ScraperResult objects in the database that do not have an
equivalent Post object stored.
@@ -331,7 +393,8 @@ class ETLController:
logger.info(f"Fetching first untransformed post batch of {BATCH_SIZE}")
batch = (session.query(ScraperResult)
batch = (
session.query(ScraperResult)
.join(Post, isouter=True)
.where(ScraperResult.date > min_date)
.where(Post.raw_id == None)
@@ -344,22 +407,24 @@ class ETLController:
self.transform_results(batch, hydrate=hydrate)
logger.info(f"Fetching untransformed posts batch of {BATCH_SIZE}, offset {max(batch, key=lambda v: v.date).date}")
batch = (session.query(ScraperResult)
.join(Post, isouter=True)
.where(ScraperResult.date > min_date)
.where(Post.raw_id == None)
.where(ScraperResult.id != batch[-1].id)
.where(ScraperResult.date >= batch[-1].date)
.order_by(ScraperResult.date.asc())
.limit(BATCH_SIZE)
).all()
logger.info(
f"Fetching untransformed posts batch of {BATCH_SIZE}, offset {max(batch, key=lambda v: v.date).date}"
)
batch = (
session.query(ScraperResult)
.join(Post, isouter=True)
.where(ScraperResult.date > min_date)
.where(Post.raw_id == None)
.where(ScraperResult.id != batch[-1].id)
.where(ScraperResult.date >= batch[-1].date)
.order_by(ScraperResult.date.asc())
.limit(BATCH_SIZE)
).all()
@logger.catch(reraise=True)
def transform_info(self, results: List[ChannelInfo]):
"""Transform raw RawChannelInfo objects into ChannelInfo objects.
"""Transform raw RawChannelInfo objects into ChannelInfo objects.
Parameters
----------
@@ -380,17 +445,25 @@ class ETLController:
for transformer in self.transformers:
if transformer.can_handle(result):
logger.trace(f"{transformer} is handling raw info result {result.id} ({result.date_archived})")
logger.trace(
f"{transformer} is handling raw info result {result.id} ({result.date_archived})"
)
handled = True
transformer.transform_info(result, lambda obj: self.insert_or_select(obj, session, False), session, channel=data.Channel)
transformer.transform_info(
result,
lambda obj: self.insert_or_select(obj, session, False),
session,
channel=data.Channel,
)
session.commit()
break
if handled == False:
logger.warning(f"No Transformer could handle raw channel info ID {result.id} with platform {result.platform} ({result.date_archived})")
logger.warning(
f"No Transformer could handle raw channel info ID {result.id} with platform {result.platform} ({result.date_archived})"
)
@logger.catch(reraise=True)
def transform_all_untransformed_info(self):
@@ -407,21 +480,26 @@ class ETLController:
offset = 0
batch = []
query = (session.query(RawChannelInfo, Channel)
query = (
session.query(RawChannelInfo, Channel)
.select_from(RawChannelInfo)
.join(ChannelInfo, isouter=True)
.join(Channel, RawChannelInfo.channel==Channel.id)
.join(Channel, RawChannelInfo.channel == Channel.id)
.where(ChannelInfo.id == None)
.order_by(RawChannelInfo.date_archived.asc())
)
while len(batch) > 0 or offset == 0:
logger.info(f"Fetching untransformed info batch of {BATCH_SIZE}, offset {offset}")
logger.info(
f"Fetching untransformed info batch of {BATCH_SIZE}, offset {offset}"
)
batch = query.slice(offset, offset + BATCH_SIZE).all()
offset += BATCH_SIZE
logger.info(f"Found {len(batch)} info items to ETL ({offset} already processed)")
logger.info(
f"Found {len(batch)} info items to ETL ({offset} already processed)"
)
self.transform_info(batch)
@@ -450,16 +528,24 @@ class ETLController:
handled = False
if transformer.can_handle(result):
logger.trace(f"{transformer} is handling result {result.id} ({result.date})")
logger.trace(
f"{transformer} is handling result {result.id} ({result.date})"
)
handled = True
transformer.transform_media(result, total_result.Post, lambda obj: self.insert_or_select(obj, session, hydrate))
transformer.transform_media(
result,
total_result.Post,
lambda obj: self.insert_or_select(obj, session, hydrate),
)
session.commit()
break
if handled == False:
logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})")
logger.warning(
f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})"
)
@logger.catch(reraise=True)
def transform_all_untransformed_media(self, hydrate=True):
@@ -482,10 +568,15 @@ class ETLController:
logger.info(f"Fetching first untransformed post media batch of {BATCH_SIZE}")
batch = (session.query(ScraperResult, Post)
batch = (
session.query(ScraperResult, Post)
.join(Post)
.join(Media, isouter=True)
.filter((ScraperResult.media_archived != None) & (cast(ScraperResult.archived_urls, String) != '{}') & (Media.id == None))
.filter(
(ScraperResult.media_archived != None)
& (cast(ScraperResult.archived_urls, String) != "{}")
& (Media.id == None)
)
.order_by(ScraperResult.date.desc())
.limit(BATCH_SIZE)
).all()
@@ -495,13 +586,23 @@ class ETLController:
self.transform_media(batch, hydrate=hydrate)
logger.info(f"Fetching untransformed post media batch of {BATCH_SIZE}, offset {min(batch, key=lambda v: v.ScraperResult.date).ScraperResult.date}")
logger.info(
f"Fetching untransformed post media batch of {BATCH_SIZE}, offset {min(batch, key=lambda v: v.ScraperResult.date).ScraperResult.date}"
)
batch = (session.query(ScraperResult, Post)
batch = (
session.query(ScraperResult, Post)
.join(Post)
.join(Media, isouter=True)
.where(ScraperResult.date <= min(batch, key=lambda v: v.ScraperResult.date).ScraperResult.date)
.filter((ScraperResult.media_archived != None) & (cast(ScraperResult.archived_urls, String) != '{}') & (Media.id == None))
.where(
ScraperResult.date
<= min(batch, key=lambda v: v.ScraperResult.date).ScraperResult.date
)
.filter(
(ScraperResult.media_archived != None)
& (cast(ScraperResult.archived_urls, String) != "{}")
& (Media.id == None)
)
.order_by(ScraperResult.date.desc())
.limit(BATCH_SIZE)
).all()

View File

@@ -4,10 +4,20 @@ from typing import Generator, Union, Callable
from datetime import datetime, timezone
from dateutil.relativedelta import relativedelta
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup
from cisticola.transformer.base import Transformer
from cisticola.base import (
RawChannelInfo,
ScraperResult,
Post,
Image,
Video,
Media,
Channel,
ChannelInfo,
)
from cisticola.transformer.base import Transformer
from cisticola.base import RawChannelInfo, ScraperResult, Post, Image, Video, Media, Channel, ChannelInfo
class BitchuteTransformer(Transformer):
"""A Bitchute specific ScraperResult, with a method ETL/transforming"""
@@ -15,61 +25,86 @@ class BitchuteTransformer(Transformer):
__version__ = "BitchuteTransformer 0.0.2"
def can_handle(self, data: ScraperResult) -> bool:
scraper = data.scraper.split(' ')
scraper = data.scraper.split(" ")
if scraper[0] == "BitchuteScraper":
return True
return False
return False
def transform_media(self, data: ScraperResult, transformed: Post, insert: Callable) -> Generator[Media, None, None]:
def transform_media(
self, data: ScraperResult, transformed: Post, insert: Callable
) -> Generator[Media, None, None]:
raw = json.loads(data.raw_data)
orig = raw['video_url']
orig = raw["video_url"]
new = data.archived_urls[orig]
m = Video(url=new, post=transformed.id, raw_id=data.id, original_url=orig, date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), transformer=self.__version__, scraper=data.scraper, platform=data.platform)
m = Video(
url=new,
post=transformed.id,
raw_id=data.id,
original_url=orig,
date=data.date,
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc),
transformer=self.__version__,
scraper=data.scraper,
platform=data.platform,
)
insert(m)
def transform_info(self, data: RawChannelInfo, insert: Callable, session, channel=None) -> Generator[Union[Post, Channel, Media], None, None]:
def transform_info(
self, data: RawChannelInfo, insert: Callable, session, channel=None
) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
transformed = ChannelInfo(
raw_channel_info_id=data.id,
channel=data.channel,
platform_id=raw['owner_url'].strip('/').split('/')[-1],
platform_id=raw["owner_url"].strip("/").split("/")[-1],
platform=data.platform,
scraper=data.scraper,
transformer=self.__version__,
screenname=raw['owner_name'],
name=raw['owner_name'],
description=raw['description'],
description_url='', # does not exist for Bitchute
description_location='', # does not exist for Bitchute
followers=raw['subscribers'],
following=-1, # does not exist for Bitchute
verified=False, # does not exist for Bitchute
date_created=parse_created(raw['created'], data.date_archived),
screenname=raw["owner_name"],
name=raw["owner_name"],
description=raw["description"],
description_url="", # does not exist for Bitchute
description_location="", # does not exist for Bitchute
followers=raw["subscribers"],
following=-1, # does not exist for Bitchute
verified=False, # does not exist for Bitchute
date_created=parse_created(raw["created"], data.date_archived),
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc)
date_transformed=datetime.now(timezone.utc),
)
transformed = insert(transformed)
def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]:
def transform(
self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts
) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
if raw['category'] == 'comment':
if raw['parent_id'] is None:
reply_to_id = raw['thread_id']
if raw["category"] == "comment":
if raw["parent_id"] is None:
reply_to_id = raw["thread_id"]
else:
reply_to_id = raw['parent_id']
reply_to_id = raw["parent_id"]
flush_posts()
post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first()
post = (
session.query(Post)
.filter_by(channel=data.channel, platform_id=reply_to_id)
.first()
)
if post is None:
if raw['parent_id'] is not None:
# this block is for comments whose parent_ids correspond to deleted comments
post = session.query(Post).filter_by(channel=data.channel, platform_id=raw['thread_id']).first()
if raw["parent_id"] is not None:
# this block is for comments whose parent_ids correspond to deleted comments
post = (
session.query(Post)
.filter_by(channel=data.channel, platform_id=raw["thread_id"])
.first()
)
if post is None:
reply_to = -1
else:
@@ -78,18 +113,18 @@ class BitchuteTransformer(Transformer):
reply_to = -1
else:
reply_to = post.id
content = raw['body'].strip()
content = raw["body"].strip()
else:
reply_to = -1
soup = BeautifulSoup(raw['body'], features = 'html.parser')
soup.find('div', {'class': 'teaser'}).decompose()
soup.find('span', {'class': 'more'}).decompose()
soup.find('span', {'class': 'less hidden'}).decompose()
soup = BeautifulSoup(raw["body"], features="html.parser")
soup.find("div", {"class": "teaser"}).decompose()
soup.find("span", {"class": "more"}).decompose()
soup.find("span", {"class": "less hidden"}).decompose()
content = soup.text.strip()
transformed = Post(
raw_id=data.id,
platform_id=raw['id'],
platform_id=raw["id"],
scraper=data.scraper,
transformer=self.__version__,
platform=data.platform,
@@ -97,41 +132,52 @@ class BitchuteTransformer(Transformer):
date=data.date,
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc),
url=raw['url'] if raw['url'] else None,
url=raw["url"] if raw["url"] else None,
content=content,
author_id=raw['author_id'],
author_username=raw['author'],
author_id=raw["author_id"],
author_username=raw["author"],
reply_to=reply_to,
hashtags = list(filter(None, [h.strip('#') for h in raw['hashtags'].split(',')])),
likes = raw['likes'],
views = int(raw['views']) if raw.get('views') else None,
video_title = raw['subject'],
video_duration = _parse_duration_str(raw['length']))
hashtags=list(
filter(None, [h.strip("#") for h in raw["hashtags"].split(",")])
),
likes=raw["likes"],
views=int(raw["views"]) if raw.get("views") else None,
video_title=raw["subject"],
video_duration=_parse_duration_str(raw["length"]),
)
# insert_post
transformed = insert_post(transformed)
def parse_created(created: str, date_archived: datetime) -> datetime:
"""Convert a created string (e.g. ``"1 year, 10 months ago"``) to a datetime
"""Convert a created string (e.g. ``"1 year, 10 months ago"``) to a datetime
object relative to the specified ``date_archived``.
"""
try:
# handle case where `created` string has already been parsed into a datetime
return datetime.fromisoformat(created)
except ValueError:
period_list = ['year', 'month', 'week', 'day']
period_list = ["year", "month", "week", "day"]
periods = [period.strip() for period in created.split('ago')[0].strip().split(',')]
_kwargs = {period : int(number) for period, number in dict(reversed(p.split(' ')) for p in periods).items()}
kwargs = {(k + 's' if k in period_list else k) : v for k, v in _kwargs.items()}
periods = [
period.strip() for period in created.split("ago")[0].strip().split(",")
]
_kwargs = {
period: int(number)
for period, number in dict(reversed(p.split(" ")) for p in periods).items()
}
kwargs = {(k + "s" if k in period_list else k): v for k, v in _kwargs.items()}
return date_archived - relativedelta(**kwargs)
def _parse_duration_str(duration_str: str) -> int:
"""Convert duration string (e.g. '2:27:04') to the number of seconds (e.g. 8824).
"""
"""Convert duration string (e.g. '2:27:04') to the number of seconds (e.g. 8824)."""
if not duration_str:
return None
else:
duration_list = duration_str.split(':')
return sum([int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(duration_list))])
duration_list = duration_str.split(":")
return sum(
[int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(duration_list))]
)

View File

@@ -7,8 +7,18 @@ from sqlalchemy import func
from gogettr import PublicClient
from gogettr.api import GettrApiError
from cisticola.transformer.base import Transformer
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
from cisticola.transformer.base import Transformer
from cisticola.base import (
RawChannelInfo,
ChannelInfo,
ScraperResult,
Post,
Image,
Video,
Media,
Channel,
)
class GettrTransformer(Transformer):
"""A Gettr specific ScraperResult, with a method ETL/transforming"""
@@ -16,86 +26,104 @@ class GettrTransformer(Transformer):
__version__ = "GettrTransformer 0.0.1"
def can_handle(self, data: ScraperResult) -> bool:
scraper = data.scraper.split(' ')
scraper = data.scraper.split(" ")
if scraper[0] == "GettrScraper":
return True
return False
return False
def transform_info(self, data: RawChannelInfo, insert: Callable, session, channel=None) -> Generator[Union[Post, Channel, Media], None, None]:
def transform_info(
self, data: RawChannelInfo, insert: Callable, session, channel=None
) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
transformed = ChannelInfo(
raw_channel_info_id=data.id,
channel=data.channel,
platform_id=raw['_id'],
platform_id=raw["_id"],
platform=data.platform,
scraper=data.scraper,
transformer=self.__version__,
screenname=raw['username'],
name=raw['nickname'],
description=raw.get('dsc'),
description_url=raw.get('website'),
description_location=raw.get('location'),
followers=int(raw['flg']),
following=int(raw['flw']),
verified=True if raw.get('infl') else False,
date_created=datetime.fromtimestamp(int(raw['cdate'])*0.001),
screenname=raw["username"],
name=raw["nickname"],
description=raw.get("dsc"),
description_url=raw.get("website"),
description_location=raw.get("location"),
followers=int(raw["flg"]),
following=int(raw["flw"]),
verified=True if raw.get("infl") else False,
date_created=datetime.fromtimestamp(int(raw["cdate"]) * 0.001),
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc)
date_transformed=datetime.now(timezone.utc),
)
transformed = insert(transformed)
def _get_channel_id(self, username: str, category: str, insert: Callable, session):
channel = session.query(Channel).where((func.lower(Channel.screenname)==func.lower(username)) & (Channel.platform == 'Gettr')).first()
channel = (
session.query(Channel)
.where(
(func.lower(Channel.screenname) == func.lower(username))
& (Channel.platform == "Gettr")
)
.first()
)
if channel is None:
try:
client = PublicClient()
profile = client.user_info(username.lower())
screenname = profile.get('_id')
screenname = profile.get("_id")
channel = Channel(
name=profile.get('nickname'),
name=profile.get("nickname"),
platform_id=screenname,
platform='Gettr',
platform="Gettr",
url="https://gettr.com/user/" + screenname,
screenname=screenname,
category=category,
source=self.__version__,
)
)
except GettrApiError:
channel = Channel(
name = None,
platform_id = None,
platform = 'Gettr',
url = None,
name=None,
platform_id=None,
platform="Gettr",
url=None,
screenname=username,
category=category,
source=self.__version__,
notes='GettrApiError'
)
notes="GettrApiError",
)
channel = insert(channel)
return channel.id
def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]:
def transform(
self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts
) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
if raw["activity"]["action"] == "shares_pst":
forwarded_from = self._get_channel_id(
username = str(raw["activity"]["uid"]), category = 'forwarded', insert = insert, session = session)
username=str(raw["activity"]["uid"]),
category="forwarded",
insert=insert,
session=session,
)
else:
forwarded_from = None
mentions = []
for mentioned_user in raw.get("utgs", []):
mentioned_id = self._get_channel_id(
username = mentioned_user, category = 'mentioned', insert = insert, session = session)
username=mentioned_user,
category="mentioned",
insert=insert,
session=session,
)
mentions.append(mentioned_id)
transformed = Post(
raw_id=data.id,
platform_id=raw["_id"],
@@ -111,17 +139,17 @@ class GettrTransformer(Transformer):
author_id=raw["receiver_id"],
author_username=raw["uid"],
hashtags=raw.get("htgs", []),
outlinks = list(filter(None, [raw.get("prevsrc")])),
forwarded_from = forwarded_from,
mentions = mentions,
likes = raw.get('lkbpst'),
forwards = raw.get("shbpst"),
views = raw.get('vfpst')
)
outlinks=list(filter(None, [raw.get("prevsrc")])),
forwarded_from=forwarded_from,
mentions=mentions,
likes=raw.get("lkbpst"),
forwards=raw.get("shbpst"),
views=raw.get("vfpst"),
)
# insert_post
insert_post(transformed)
# media = self.process_media(raw, transformed.id, data)
# for m in media:
# insert(m)
# insert(m)

View File

@@ -5,8 +5,18 @@ import dateutil.parser
from datetime import datetime, timezone
from sqlalchemy import func, JSON, String, cast, text
from cisticola.transformer.base import Transformer
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
from cisticola.transformer.base import Transformer
from cisticola.base import (
RawChannelInfo,
ChannelInfo,
ScraperResult,
Post,
Image,
Video,
Media,
Channel,
)
class RumbleTransformer(Transformer):
"""A Rumble specific ScraperResult, with a method ETL/transforming"""
@@ -14,25 +24,36 @@ class RumbleTransformer(Transformer):
__version__ = "RumbleTransformer 0.0.1"
def can_handle(self, data: ScraperResult) -> bool:
scraper = data.scraper.split(' ')
scraper = data.scraper.split(" ")
if scraper[0] == "RumbleScraper":
return True
return False
return False
def transform_info(self, data: RawChannelInfo, insert: Callable, session, channel=None) -> Generator[Union[Post, Channel, Media], None, None]:
def transform_info(
self, data: RawChannelInfo, insert: Callable, session, channel=None
) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
if 'id' not in raw:
if "id" not in raw:
# The first version of the Rumble ChannelInfo scraper didn't return
# the platform_id, so this is a workaround.
channel = session.query(RawChannelInfo).filter(text("raw_channel_info.raw_data::jsonb ->> 'name'=:name"), RawChannelInfo.platform == 'Rumble').params(name=raw['name']).order_by(RawChannelInfo.date_archived.desc()).first()
channel = (
session.query(RawChannelInfo)
.filter(
text("raw_channel_info.raw_data::jsonb ->> 'name'=:name"),
RawChannelInfo.platform == "Rumble",
)
.params(name=raw["name"])
.order_by(RawChannelInfo.date_archived.desc())
.first()
)
if channel is None:
platform_id = None
else:
platform_id = json.loads(channel.raw_data)['id']
platform_id = json.loads(channel.raw_data)["id"]
else:
platform_id = raw['id']
platform_id = raw["id"]
transformed = ChannelInfo(
raw_channel_info_id=data.id,
@@ -42,63 +63,67 @@ class RumbleTransformer(Transformer):
scraper=data.scraper,
transformer=self.__version__,
screenname=platform_id,
name=raw['name'],
description='', # does not exist for Rumble
description_url='', # does not exist for Rumble
description_location='', # does not exist for Rumble
followers=_process_number(raw['subscribers']),
following=-1, # does not exist for Rumble
verified=raw['verified'],
date_created=None, # does not exist for Rumble
name=raw["name"],
description="", # does not exist for Rumble
description_url="", # does not exist for Rumble
description_location="", # does not exist for Rumble
followers=_process_number(raw["subscribers"]),
following=-1, # does not exist for Rumble
verified=raw["verified"],
date_created=None, # does not exist for Rumble
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc)
date_transformed=datetime.now(timezone.utc),
)
transformed = insert(transformed)
def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]:
def transform(
self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts
) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
transformed = Post(
raw_id=data.id,
platform_id=raw['media_url'].strip('/').split('/')[-1],
platform_id=raw["media_url"].strip("/").split("/")[-1],
scraper=data.scraper,
transformer=self.__version__,
platform=data.platform,
channel=data.channel,
date=dateutil.parser.parse(raw['datetime']),
date=dateutil.parser.parse(raw["datetime"]),
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc),
url=raw['link'],
content=raw['content'],
author_id=raw['author_id'],
author_username=raw['author_name'],
views = _process_number(raw.get('views')),
likes = _process_number(raw.get('rumbles')),
video_title = raw['title'],
video_duration=_parse_duration_str(raw['duration']))
url=raw["link"],
content=raw["content"],
author_id=raw["author_id"],
author_username=raw["author_name"],
views=_process_number(raw.get("views")),
likes=_process_number(raw.get("rumbles")),
video_title=raw["title"],
video_duration=_parse_duration_str(raw["duration"]),
)
# insert_post
insert_post(transformed)
def _process_number(s):
def _process_number(s):
if s is None:
return None
else:
s = s.replace(' ', '').replace(',','')
if s.endswith('M'):
s = s.replace(" ", "").replace(",", "")
if s.endswith("M"):
return int(float(s[:-1]) * 1e6)
elif s.endswith('K'):
elif s.endswith("K"):
return int(float(s[:-1]) * 1000)
return int(s)
def _parse_duration_str(duration_str: str) -> int:
"""Convert duration string (e.g. '2:27:04') to the number of seconds (e.g. 8824).
"""
"""Convert duration string (e.g. '2:27:04') to the number of seconds (e.g. 8824)."""
if not duration_str:
return None
else:
duration_list = duration_str.split(':')
return sum([int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(duration_list))])
duration_list = duration_str.split(":")
return sum(
[int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(duration_list))]
)

View File

@@ -16,12 +16,22 @@ import os
from datetime import datetime, timezone
from sqlalchemy import func
from cisticola.transformer.base import Transformer
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Audio, Media, Channel
from cisticola.transformer.base import Transformer
from cisticola.base import (
RawChannelInfo,
ChannelInfo,
ScraperResult,
Post,
Image,
Video,
Audio,
Media,
Channel,
)
class TelegramTelethonTransformer(Transformer):
__version__ = 'TelegramTelethonTransformer 0.0.4'
__version__ = "TelegramTelethonTransformer 0.0.4"
# TODO cache
# cache channels for which we cannot get the name from the web interface
@@ -38,18 +48,18 @@ class TelegramTelethonTransformer(Transformer):
get_screenname_cache = {}
def can_handle(self, data: ScraperResult) -> bool:
scraper = data.scraper.split(' ')
scraper = data.scraper.split(" ")
if scraper[0] == "TelegramTelethonScraper":
return True
return False
def __init__(self, telethon_session_name = None):
return False
def __init__(self, telethon_session_name=None):
super().__init__()
api_id = os.environ['TELEGRAM_API_ID']
api_hash = os.environ['TELEGRAM_API_HASH']
phone = os.environ['TELEGRAM_PHONE']
api_id = os.environ["TELEGRAM_API_ID"]
api_hash = os.environ["TELEGRAM_API_HASH"]
phone = os.environ["TELEGRAM_PHONE"]
if telethon_session_name is None:
telethon_session_name = phone
@@ -63,11 +73,15 @@ class TelegramTelethonTransformer(Transformer):
return self.get_screenname_cache[channel_id]
else:
output = ("", "", None)
try:
data = self.client.get_entity(channel_id)
if isinstance(data, types.User):
output = (data.username, str(data.first_name or "") + " " + str(data.last_name or ""), "")
output = (
data.username,
str(data.first_name or "") + " " + str(data.last_name or ""),
"",
)
else:
output = (data.username, data.title, "")
except ChannelPrivateError:
@@ -85,7 +99,9 @@ class TelegramTelethonTransformer(Transformer):
# this doesn't work for chat channels
if orig_screenname in self.bad_channels:
logger.debug(f"Skipping screenname because it is not accessible for channel {orig_screenname}")
logger.debug(
f"Skipping screenname because it is not accessible for channel {orig_screenname}"
)
return ""
logger.info(f"Finding channel from URL {url}")
@@ -95,8 +111,8 @@ class TelegramTelethonTransformer(Transformer):
self.bad_channels[orig_screenname] = True
return ""
soup = BeautifulSoup(r.content, features = 'lxml')
post = soup.findAll("div", {"data-post" : orig_screenname + "/" + str(id)})
soup = BeautifulSoup(r.content, features="lxml")
post = soup.findAll("div", {"data-post": orig_screenname + "/" + str(id)})
name = ""
# multiple posts can be combined into one result in the web interface
@@ -106,127 +122,173 @@ class TelegramTelethonTransformer(Transformer):
if decrement > 8:
break
logger.info(f"Could not find post from {url}, looking for id {id - decrement}")
post = soup.findAll("div", {"data-post" : orig_screenname + "/" + str(id - decrement)})
logger.info(
f"Could not find post from {url}, looking for id {id - decrement}"
)
post = soup.findAll(
"div", {"data-post": orig_screenname + "/" + str(id - decrement)}
)
if len(post) == 0:
logger.warning(f"Could not find post from {url}")
else:
fwd_tag = post[0].findAll("a", {"class", "tgme_widget_message_forwarded_from_name"})
fwd_tag = post[0].findAll(
"a", {"class", "tgme_widget_message_forwarded_from_name"}
)
if len(fwd_tag) == 0:
fwd_tag = post[0].findAll("span", {"class", "tgme_widget_message_forwarded_from_name"})
fwd_tag = post[0].findAll(
"span", {"class", "tgme_widget_message_forwarded_from_name"}
)
if len(fwd_tag) >= 1:
name = fwd_tag[0].text
return name
def transform_info(self, data: RawChannelInfo, insert: Callable, session, channel=None) -> Generator[Union[Post, Channel, Media], None, None]:
def transform_info(
self, data: RawChannelInfo, insert: Callable, session, channel=None
) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
chat_raw = raw['chats'][0]
chat_raw = raw["chats"][0]
transformed = ChannelInfo(
raw_channel_info_id=data.id,
channel=data.channel,
platform_id=raw['full_chat']['id'],
platform_id=raw["full_chat"]["id"],
platform=data.platform,
scraper=data.scraper,
transformer=self.__version__,
screenname=chat_raw['username'],
name=chat_raw['title'],
description=raw['full_chat']['about'],
description_url='', # does not exist for Telegram
description_location='', # does not exist for Telegram
followers=raw['full_chat']['participants_count'],
following=-1, # does not exist for Telegram
verified=False, #does not exist for Telegram
date_created=dateutil.parser.parse(chat_raw['date']),
screenname=chat_raw["username"],
name=chat_raw["title"],
description=raw["full_chat"]["about"],
description_url="", # does not exist for Telegram
description_location="", # does not exist for Telegram
followers=raw["full_chat"]["participants_count"],
following=-1, # does not exist for Telegram
verified=False, # does not exist for Telegram
date_created=dateutil.parser.parse(chat_raw["date"]),
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc)
date_transformed=datetime.now(timezone.utc),
)
transformed = insert(transformed)
if channel.platform_id is None:
logger.info(f"Missing platform ID on {channel}, setting to {raw['full_chat']['id']}")
logger.info(
f"Missing platform ID on {channel}, setting to {raw['full_chat']['id']}"
)
new_channel = session.query(Channel).where(Channel.id == channel.id).one()
new_channel.platform_id = raw['full_chat']['id']
new_channel.platform_id = raw["full_chat"]["id"]
session.flush()
session.commit()
if len(raw['chats']) > 1:
for chat in raw['chats'][1:]:
if len(raw["chats"]) > 1:
for chat in raw["chats"][1:]:
new_chat = Channel(
name=chat["title"],
platform_id=chat["id"],
category=channel.category, # this should be the same as the "parent"
platform=channel.platform, # this should be the same as the "parent"
url=("https://t.me/s/" + chat["username"]) if "username" in chat else "",
category=channel.category, # this should be the same as the "parent"
platform=channel.platform, # this should be the same as the "parent"
url=("https://t.me/s/" + chat["username"])
if "username" in chat
else "",
screenname=chat["username"] if "username" in chat else "",
country=channel.country, # this should be the same as the "parent"
influencer=channel.influencer, # this should be the same as the "parent"
country=channel.country, # this should be the same as the "parent"
influencer=channel.influencer, # this should be the same as the "parent"
public=None,
chat=not chat["broadcast"],
notes=channel.id, # this should be the channel ID of the parent
source="linked_channel"
notes=channel.id, # this should be the channel ID of the parent
source="linked_channel",
)
insert(new_chat)
# TODO this method API is chaotic and could be cleaned up
def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]:
def transform(
self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts
) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
if raw['_'] != 'Message':
if raw["_"] != "Message":
logger.warning(f"Cannot convert type {raw['_']} to post")
return
fwd_from = None
if raw['fwd_from'] and raw['fwd_from']['from_id'] and 'channel_id' in raw['fwd_from']['from_id']:
if (
raw["fwd_from"]
and raw["fwd_from"]["from_id"]
and "channel_id" in raw["fwd_from"]["from_id"]
):
# use cache to look up channel instead of a DB request if possible
if str(raw['fwd_from']['from_id']['channel_id']) not in self.channels_cache_by_platformid:
channel = session.query(Channel).filter_by(platform_id=str(raw['fwd_from']['from_id']['channel_id']), platform = 'Telegram').first()
if (
str(raw["fwd_from"]["from_id"]["channel_id"])
not in self.channels_cache_by_platformid
):
channel = (
session.query(Channel)
.filter_by(
platform_id=str(raw["fwd_from"]["from_id"]["channel_id"]),
platform="Telegram",
)
.first()
)
if channel is None:
(screenname, name, notes) = self.get_screenname_from_id(raw['fwd_from']['from_id']['channel_id'])
(screenname, name, notes) = self.get_screenname_from_id(
raw["fwd_from"]["from_id"]["channel_id"]
)
if name == "":
logger.info("Trying fallback web interface")
orig_channel = session.query(Channel).filter_by(id=data.channel).first()
orig_channel = (
session.query(Channel).filter_by(id=data.channel).first()
)
if orig_channel.screenname is not None:
name = self.get_name_from_web_interface(orig_channel.screenname, raw['id'])
name = self.get_name_from_web_interface(
orig_channel.screenname, raw["id"]
)
channel = Channel(
name=name,
platform_id=raw['fwd_from']['from_id']['channel_id'],
platform_id=raw["fwd_from"]["from_id"]["channel_id"],
platform=data.platform,
url="https://t.me/s/" + screenname if screenname is not None else "",
url="https://t.me/s/" + screenname
if screenname is not None
else "",
screenname=screenname,
category='forwarded',
category="forwarded",
source=self.__version__,
notes=notes
)
notes=notes,
)
channel = insert(channel)
logger.info(f"Added {channel}")
self.channels_cache_by_platformid[str(raw['fwd_from']['from_id']['channel_id'])] = channel
fwd_from = self.channels_cache_by_platformid[str(raw['fwd_from']['from_id']['channel_id'])].id
self.channels_cache_by_platformid[
str(raw["fwd_from"]["from_id"]["channel_id"])
] = channel
fwd_from = self.channels_cache_by_platformid[
str(raw["fwd_from"]["from_id"]["channel_id"])
].id
reply_to = None
if raw['reply_to']:
reply_to_id = str(raw['reply_to']['reply_to_msg_id'])
if raw["reply_to"]:
reply_to_id = str(raw["reply_to"]["reply_to_msg_id"])
# use cache to find post ID instead of a DB request, if possible
if (data.channel, reply_to_id) not in self.posts_cache:
session.commit()
flush_posts() # TODO this is necessary because the post we are looking for might have been added in the same session
post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first()
flush_posts() # TODO this is necessary because the post we are looking for might have been added in the same session
post = (
session.query(Post)
.filter_by(channel=data.channel, platform_id=reply_to_id)
.first()
)
if post is None:
reply_to = -1
else:
@@ -238,31 +300,42 @@ class TelegramTelethonTransformer(Transformer):
mentions = []
for mention_entity in [entity for entity in raw['entities'] if entity['_'] == 'MessageEntityMention']:
offset = mention_entity['offset']
length = mention_entity['length']
for mention_entity in [
entity
for entity in raw["entities"]
if entity["_"] == "MessageEntityMention"
]:
offset = mention_entity["offset"]
length = mention_entity["length"]
screenname = add_surrogate(raw['message'])[offset:offset+length].strip('@').strip()
screenname = (
add_surrogate(raw["message"])[offset : offset + length]
.strip("@")
.strip()
)
# use cache rather than a DB request if possible
if screenname.lower() not in self.channels_cache_by_screenname:
channel = session.query(Channel).filter(func.lower(Channel.screenname)==func.lower(screenname)).first()
channel = (
session.query(Channel)
.filter(func.lower(Channel.screenname) == func.lower(screenname))
.first()
)
if channel is None:
channel = Channel(
name = None,
platform_id = None,
platform = 'Telegram',
name=None,
platform_id=None,
platform="Telegram",
url="https://t.me/s/" + screenname,
screenname=screenname,
category='mentioned',
category="mentioned",
source=self.__version__,
)
)
channel = insert(channel)
logger.info(f"Added {channel}")
self.channels_cache_by_screenname[screenname.lower()] = channel
channel = self.channels_cache_by_screenname[screenname.lower()]
@@ -277,28 +350,28 @@ class TelegramTelethonTransformer(Transformer):
channel = self.channels_cache_by_id[int(data.channel)]
if channel is not None and channel.url:
url = channel.url.strip('/') + f"/{raw['id']}"
url = channel.url.strip("/") + f"/{raw['id']}"
author_username = channel.screenname
else:
url = ""
author_username = ""
author_id = raw.get('peer_id', {}).get('channel_id')
if raw['from_id'] and 'user_id' in raw['from_id']:
author_id = raw['from_id']['user_id']
author_id = raw.get("peer_id", {}).get("channel_id")
if raw["from_id"] and "user_id" in raw["from_id"]:
author_id = raw["from_id"]["user_id"]
author_username = ""
(screenname, name, notes) = self.get_screenname_from_id(author_id)
if screenname:
author_username = screenname
transformed = Post(
raw_id = data.id,
platform_id = raw['id'],
scraper = data.scraper,
raw_id=data.id,
platform_id=raw["id"],
scraper=data.scraper,
transformer=self.__version__,
platform=data.platform,
channel=data.channel,
date=dateutil.parser.parse(raw['date']),
date=dateutil.parser.parse(raw["date"]),
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc),
url=url,
@@ -307,48 +380,57 @@ class TelegramTelethonTransformer(Transformer):
author_username=author_username,
forwarded_from=fwd_from,
reply_to=reply_to,
mentions = mentions,
forwards = raw.get('forwards'),
views = raw.get('views')
mentions=mentions,
forwards=raw.get("forwards"),
views=raw.get("views"),
)
# insert_post
insert_post(transformed)
def stripped(s):
"""https://stackoverflow.com/a/29933716"""
lstripped = ''.join(takewhile(str.isspace, s))
rstripped = ''.join(reversed(tuple(takewhile(str.isspace, reversed(s)))))
lstripped = "".join(takewhile(str.isspace, s))
rstripped = "".join(reversed(tuple(takewhile(str.isspace, reversed(s)))))
return lstripped + rstripped
def add_markdown_links(raw_post):
"""This function is necessary because Telethon's markdown.unparse doesn't
"""This function is necessary because Telethon's markdown.unparse doesn't
correctly handle trailing whitespace or multi-line links"""
global_offset = 0
transformed_content = add_surrogate(raw_post['message'])
links = [entity for entity in raw_post['entities'] if entity['_'] == 'MessageEntityTextUrl']
transformed_content = add_surrogate(raw_post["message"])
links = [
entity
for entity in raw_post["entities"]
if entity["_"] == "MessageEntityTextUrl"
]
for link in links:
offset = global_offset + link['offset']
length = link['length']
url = link['url']
offset = global_offset + link["offset"]
length = link["length"]
url = link["url"]
before_link = transformed_content[:offset]
inner_text = transformed_content[offset:offset+length]
# skip creation of link if inner link text is only whitespace
if inner_text.replace('\u200b', '').strip():
processed_inner_text = inner_text.strip().replace('\n', '\\\n')
link_text = f"[{processed_inner_text}]"
trailing_whitespace = stripped(transformed_content[offset:offset+length])
link_href = f"({url})"
after_link = transformed_content[offset+length:]
inner_text = transformed_content[offset : offset + length]
transformed_content = before_link + link_text + link_href + trailing_whitespace + after_link
global_offset += (4 + len(url) + inner_text.strip().count('\n'))
return del_surrogate(transformed_content)
# skip creation of link if inner link text is only whitespace
if inner_text.replace("\u200b", "").strip():
processed_inner_text = inner_text.strip().replace("\n", "\\\n")
link_text = f"[{processed_inner_text}]"
trailing_whitespace = stripped(
transformed_content[offset : offset + length]
)
link_href = f"({url})"
after_link = transformed_content[offset + length :]
transformed_content = (
before_link + link_text + link_href + trailing_whitespace + after_link
)
global_offset += 4 + len(url) + inner_text.strip().count("\n")
return del_surrogate(transformed_content)

View File

@@ -2,8 +2,8 @@ import requests
from loguru import logger
import time
def make_request(url, headers = None, max_retries = 5, break_codes = None):
def make_request(url, headers=None, max_retries=5, break_codes=None):
"""Retry request `max_retries` times, while catching arbitrary exceptions.
Parameters
@@ -13,11 +13,11 @@ def make_request(url, headers = None, max_retries = 5, break_codes = None):
headers : dict or None
Dictionary of key-value pairs for request headers
max_retries : int
Maximum number of times to retry the request
Maximum number of times to retry the request
break_codes : list or None
List of acceptable status codes that indicate that the request should
not be retried further. Useful if, for example, a `404` is expected at
some point to terminate a loop, and we don't want to retry to get the
List of acceptable status codes that indicate that the request should
not be retried further. Useful if, for example, a `404` is expected at
some point to terminate a loop, and we don't want to retry to get the
404-ed page multiple times.
Returns
@@ -33,20 +33,17 @@ def make_request(url, headers = None, max_retries = 5, break_codes = None):
try:
r = request_until_200(
url = url,
headers = headers,
max_retries = max_retries,
break_codes = break_codes)
url=url, headers=headers, max_retries=max_retries, break_codes=break_codes
)
logger.debug(f"Request for url: {url} succeeded")
except Exception as e:
logger.warning(f"Request for url: {url} raised exception: [{e}]")
return r
def request_until_200(url, headers = None, max_retries = 5, break_codes = None):
"""Retry request `max_retries` times, or until the request is successful.
"""
def request_until_200(url, headers=None, max_retries=5, break_codes=None):
"""Retry request `max_retries` times, or until the request is successful."""
if break_codes is None:
break_codes = [200]
@@ -54,17 +51,21 @@ def request_until_200(url, headers = None, max_retries = 5, break_codes = None):
break_codes = break_codes + [200]
n_retries = 0
r = requests.get(url, headers = headers)
r = requests.get(url, headers=headers)
while r.status_code not in break_codes and n_retries < 5:
logger.warning(f"Request for url: {url} returned status: {r.status_code} on attempt: {n_retries}/{max_retries}")
logger.warning(
f"Request for url: {url} returned status: {r.status_code} on attempt: {n_retries}/{max_retries}"
)
n_retries += 1
# back off subsequent requests
time.sleep(n_retries)
r = requests.get(url, headers = headers)
r = requests.get(url, headers=headers)
if r.status_code not in break_codes:
raise ValueError(f"Request for url: {url} failed with status: {r.status_code} after {max_retries} attempts")
raise ValueError(
f"Request for url: {url} failed with status: {r.status_code} after {max_retries} attempts"
)
return r
return r

View File

@@ -12,14 +12,15 @@
#
import os
import sys
sys.path.insert(0, os.path.abspath('../../'))
sys.path.insert(0, os.path.abspath("../../"))
# -- Project information -----------------------------------------------------
project = 'Cisticola'
copyright = '2022, Bellingcat'
author = 'Bellingcat'
project = "Cisticola"
copyright = "2022, Bellingcat"
author = "Bellingcat"
# -- General configuration ---------------------------------------------------
@@ -27,10 +28,10 @@ author = 'Bellingcat'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.coverage', 'sphinx.ext.napoleon']
extensions = ["sphinx.ext.autodoc", "sphinx.ext.coverage", "sphinx.ext.napoleon"]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
templates_path = ["_templates"]
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
@@ -43,7 +44,7 @@ exclude_patterns = []
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
html_theme = "sphinx_rtd_theme"
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
@@ -52,9 +53,9 @@ html_static_path = []
# -- Default flags for autodoc------------------------------------------------
autodoc_default_options = {'exclude-members': '_sa_class_manager'}
autodoc_default_options = {"exclude-members": "_sa_class_manager"}
html_favicon = '../images/favicon.ico'
html_logo = '../images/cisticola_logo.svg'
html_favicon = "../images/favicon.ico"
html_logo = "../images/cisticola_logo.svg"
html_theme_options = {'style_nav_header_background': '#292a2b'}
html_theme_options = {"style_nav_header_background": "#292a2b"}

View File

@@ -20,10 +20,12 @@ expected_headers = [
"chat",
"notes",
"normalized_url",
"to_remove"]
"to_remove",
]
def standardize_country(s):
_s = s.split('(')[0].split('?')[0]
_s = s.split("(")[0].split("?")[0]
return _s.strip()
@@ -33,8 +35,8 @@ def sync_channels(args, session):
gc = gspread.service_account(filename="service_account.json")
# Open a sheet from a spreadsheet in one go
wks = gc.open_by_url(os.environ['GSHEET']).worksheet("channels")
channels = wks.get_all_records(expected_headers = expected_headers)
wks = gc.open_by_url(os.environ["GSHEET"]).worksheet("channels")
channels = wks.get_all_records(expected_headers=expected_headers)
row = 2
for c in channels:
@@ -67,20 +69,28 @@ def sync_channels(args, session):
channel = (
session.query(Channel)
.filter_by(
platform_id=str(platform_id), platform=str(c["platform"])
)
.filter_by(platform_id=str(platform_id), platform=str(c["platform"]))
.first()
)
if not channel:
channel = session.query(Channel).filter_by(platform=str(c["platform"]), url=str(c["url"])).first()
channel = (
session.query(Channel)
.filter_by(platform=str(c["platform"]), url=str(c["url"]))
.first()
)
if not channel and c["screenname"] != '' and c["screenname"] is not None:
channel = session.query(Channel).filter_by(platform=str(c["platform"]), screenname=str(c["screenname"])).first()
if not channel and c["screenname"] != "" and c["screenname"] is not None:
channel = (
session.query(Channel)
.filter_by(
platform=str(c["platform"]), screenname=str(c["screenname"])
)
.first()
)
if not channel:
if all([k in [None, True, False, ''] for k in c.values()]):
if all([k in [None, True, False, ""] for k in c.values()]):
# end sync if completely empty row is encountered
break
@@ -109,7 +119,11 @@ def sync_channels(args, session):
if c["screenname"]:
channel.screenname = c["screenname"]
if c["country"]:
channel.country = None if c["country"] is None else list(map(standardize_country, c["country"].split('/')))
channel.country = (
None
if c["country"] is None
else list(map(standardize_country, c["country"].split("/")))
)
if c["influencer"]:
channel.influencer = c["influencer"]
if c["public"]:
@@ -129,23 +143,27 @@ def sync_channels(args, session):
# this likely means that the channel was duplicated in the Google Sheet, so add a red highlight
if was_researcher:
logger.warning(f"This channel (ID {channel.id}) is possibly a duplicate.")
wks.format(f"A{str(row)}:A{str(row)}", {
"backgroundColor": {
"red": 1.0,
"green": 0.0,
"blue": 0.0
}})
time.sleep(1)
logger.warning(
f"This channel (ID {channel.id}) is possibly a duplicate."
)
wks.format(
f"A{str(row)}:A{str(row)}",
{"backgroundColor": {"red": 1.0, "green": 0.0, "blue": 0.0}},
)
time.sleep(1)
# channel has ID
else:
cid = int(c["id"])
channel = session.query(Channel).filter_by(id=cid).first()
channel_info = session.query(ChannelInfo).filter_by(channel=cid).order_by(ChannelInfo.date_archived.desc()).first()
channel_info = (
session.query(ChannelInfo)
.filter_by(channel=cid)
.order_by(ChannelInfo.date_archived.desc())
.first()
)
logger.info(f"Updating channel {channel}")
logger.info(f"Found info {channel_info}")
@@ -155,7 +173,11 @@ def sync_channels(args, session):
channel.platform = c["platform"]
channel.url = c["url"]
channel.screenname = c["screenname"]
channel.country = None if c["country"] is None else list(map(standardize_country, c["country"].split('/')))
channel.country = (
None
if c["country"] is None
else list(map(standardize_country, c["country"].split("/")))
)
channel.influencer = c["influencer"]
channel.public = c["public"]
channel.chat = c["chat"]
@@ -167,7 +189,9 @@ def sync_channels(args, session):
wks.update_cell(row, 7, channel_info.screenname)
time.sleep(1)
if channel_info and str(channel.platform_id) != str(channel_info.platform_id):
if channel_info and str(channel.platform_id) != str(
channel_info.platform_id
):
channel.platform_id = channel_info.platform_id
wks.update_cell(row, 3, channel_info.platform_id)
time.sleep(1)

View File

@@ -1,5 +1,5 @@
import argparse
from telethon.sync import TelegramClient
from telethon.sync import TelegramClient
import os
if __name__ == "__main__":
@@ -8,9 +8,9 @@ if __name__ == "__main__":
args = parser.parse_args()
api_id = os.environ['TELEGRAM_API_ID']
api_hash = os.environ['TELEGRAM_API_HASH']
phone = os.environ['TELEGRAM_PHONE']
api_id = os.environ["TELEGRAM_API_ID"]
api_hash = os.environ["TELEGRAM_API_HASH"]
phone = os.environ["TELEGRAM_PHONE"]
telethon_session_name = args.telethon_session
if telethon_session_name is None:
@@ -19,4 +19,4 @@ if __name__ == "__main__":
client = TelegramClient(telethon_session_name, api_id, api_hash)
client.start()
client.disconnect()
client.disconnect()

View File

@@ -1,58 +1,64 @@
import pytest
from sqlalchemy.sql import text
from cisticola.base import Post, Channel, ChannelInfo, Media, ScraperResult, RawChannelInfo
from cisticola.base import (
Post,
Channel,
ChannelInfo,
Media,
ScraperResult,
RawChannelInfo,
)
from cisticola.scraper import (
TelegramTelethonScraper,
BitchuteScraper,
TelegramTelethonScraper,
BitchuteScraper,
GettrScraper,
RumbleScraper)
RumbleScraper,
)
from cisticola.transformer import (
TelegramTelethonTransformer,
BitchuteTransformer,
TelegramTelethonTransformer,
BitchuteTransformer,
GettrTransformer,
RumbleTransformer)
RumbleTransformer,
)
CONTROLLERS = {
'telegram' : {
'scraper': TelegramTelethonScraper,
'transformer': TelegramTelethonTransformer
"telegram": {
"scraper": TelegramTelethonScraper,
"transformer": TelegramTelethonTransformer,
},
'bitchute': {
'scraper': BitchuteScraper,
'transformer': BitchuteTransformer
},
'gettr': {
'scraper': GettrScraper,
'transformer': GettrTransformer
},
'rumble': {
'scraper': RumbleScraper,
'transformer': RumbleTransformer
}
"bitchute": {"scraper": BitchuteScraper, "transformer": BitchuteTransformer},
"gettr": {"scraper": GettrScraper, "transformer": GettrTransformer},
"rumble": {"scraper": RumbleScraper, "transformer": RumbleTransformer},
}
@pytest.mark.parametrize('platform', ['telegram','bitchute', 'gettr', 'rumble'])
def test_scraper_and_transformer(platform, session, controller, etl_controller, channel_kwargs):
@pytest.mark.parametrize("platform", ["telegram", "bitchute", "gettr", "rumble"])
def test_scraper_and_transformer(
platform, session, controller, etl_controller, channel_kwargs
):
controller.reset_db()
controller.remove_all_scrapers()
# necessary for comments/replies to be processed correctly
session.execute(text('INSERT INTO posts(id) VALUES (-1)'))
session.execute(text("INSERT INTO posts(id) VALUES (-1)"))
session.commit()
channels = [Channel(**channel_kwargs[platform])]
scraper = CONTROLLERS[platform]['scraper']
controller.register_scraper(scraper = scraper())
scraper = CONTROLLERS[platform]["scraper"]
controller.register_scraper(scraper=scraper())
controller.scrape_channels(channels = channels)
controller.scrape_channels(channels=channels)
controller.scrape_all_channel_info()
controller.archive_unarchived_media_batch()
raw_posts = session.query(ScraperResult).all()
raw_channel_info = session.query(RawChannelInfo).all()
archived_urls = session.query(ScraperResult.archived_urls).order_by(ScraperResult.date_archived.desc()).first()
archived_urls = (
session.query(ScraperResult.archived_urls)
.order_by(ScraperResult.date_archived.desc())
.first()
)
assert len(raw_posts) > 0
assert len(raw_channel_info) > 0
@@ -60,7 +66,7 @@ def test_scraper_and_transformer(platform, session, controller, etl_controller,
controller.remove_all_scrapers()
transformer = CONTROLLERS[platform]['transformer']
transformer = CONTROLLERS[platform]["transformer"]
etl_controller.register_transformer(transformer())
etl_controller.transform_all_untransformed()
@@ -73,4 +79,4 @@ def test_scraper_and_transformer(platform, session, controller, etl_controller,
assert len(posts) > 0
assert len(channel_info) > 0
assert len(media) > 0
assert len(media) > 0

View File

@@ -8,170 +8,182 @@ from cisticola.scraper import ScraperController
from cisticola.transformer import ETLController
BITCHUTE_CHANNEL_KWARGS = {
'name': 'bestonlinejewelrystoresusa@gmail.com (test)',
'platform_id': 'bestonlinejewelrystoresusagmailcom',
'category': 'test',
'platform': 'Bitchute',
'url': 'https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/',
'screenname': None,
'country': 'US',
'influencer': None,
'public': True,
'chat': False,
'notes': '',
'source': 'researcher'}
"name": "bestonlinejewelrystoresusa@gmail.com (test)",
"platform_id": "bestonlinejewelrystoresusagmailcom",
"category": "test",
"platform": "Bitchute",
"url": "https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/",
"screenname": None,
"country": "US",
"influencer": None,
"public": True,
"chat": False,
"notes": "",
"source": "researcher",
}
GAB_CHANNEL_KWARGS = {
'name': 'Capt. Marc Simon (test)',
'platform_id': 'marc_capt',
'category': 'test',
'platform': 'Gab',
'url': 'https://gab.com/marc_capt',
'screenname': 'marc_capt',
'country': 'CA',
'influencer': None,
'public': True,
'chat': False,
'notes': '',
'source': 'researcher'}
"name": "Capt. Marc Simon (test)",
"platform_id": "marc_capt",
"category": "test",
"platform": "Gab",
"url": "https://gab.com/marc_capt",
"screenname": "marc_capt",
"country": "CA",
"influencer": None,
"public": True,
"chat": False,
"notes": "",
"source": "researcher",
}
GAB_GROUP_KWARGS = {
'name': 'iran group (test)',
'platform_id': "10001",
'category': 'test',
'platform': 'Gab',
'url': 'https://gab.com/groups/10001',
'screenname': 'iran group',
'country': 'IR',
'influencer': None,
'public': True,
'chat': True,
'notes': '',
'source': 'researcher'}
"name": "iran group (test)",
"platform_id": "10001",
"category": "test",
"platform": "Gab",
"url": "https://gab.com/groups/10001",
"screenname": "iran group",
"country": "IR",
"influencer": None,
"public": True,
"chat": True,
"notes": "",
"source": "researcher",
}
GETTR_CHANNEL_KWARGS = {
'name': 'LizardRepublic (test)',
'platform_id': 'lizardrepublic',
'category': 'test',
'platform': 'Gettr',
'url': 'https://www.gettr.com/user/lizardrepublic',
'screenname': 'lizardrepublic',
'country': 'US',
'influencer': None,
'public': True,
'chat': False,
'notes': '',
'source': 'researcher'}
"name": "LizardRepublic (test)",
"platform_id": "lizardrepublic",
"category": "test",
"platform": "Gettr",
"url": "https://www.gettr.com/user/lizardrepublic",
"screenname": "lizardrepublic",
"country": "US",
"influencer": None,
"public": True,
"chat": False,
"notes": "",
"source": "researcher",
}
INSTAGRAM_CHANNEL_KWARGS = {
'name': 'borland.88 (test)',
'platform_id': 'borland.88',
'category': 'test',
'platform': 'Instagram',
'url': 'https://www.instagram.com/borland.88/',
'screenname': 'borland.88',
'country': 'UA',
'influencer': None,
'public': True,
'chat': False,
'notes': '',
'source': 'researcher'}
"name": "borland.88 (test)",
"platform_id": "borland.88",
"category": "test",
"platform": "Instagram",
"url": "https://www.instagram.com/borland.88/",
"screenname": "borland.88",
"country": "UA",
"influencer": None,
"public": True,
"chat": False,
"notes": "",
"source": "researcher",
}
ODYSEE_CHANNEL_KWARGS = {
'name': "Mak1n' Bacon (test)",
'platform_id': 'Mak1nBacon',
'category': 'test',
'platform': 'Odysee',
'url': 'https://odysee.com/@Mak1nBacon',
'screenname': 'Mak1nBacon',
'country': 'US',
'influencer': None,
'public': True,
'chat': False,
'notes': '',
'source': 'researcher'}
"name": "Mak1n' Bacon (test)",
"platform_id": "Mak1nBacon",
"category": "test",
"platform": "Odysee",
"url": "https://odysee.com/@Mak1nBacon",
"screenname": "Mak1nBacon",
"country": "US",
"influencer": None,
"public": True,
"chat": False,
"notes": "",
"source": "researcher",
}
RUMBLE_CHANNEL_KWARGS = {
'name': 'we are uploading videos wow products (test)',
'platform_id': 'c-916305',
'category': 'test',
'platform': 'Rumble',
'url': 'https://rumble.com/c/c-916305',
'screenname': 'we are uploading',
'country': 'CA',
'influencer': None,
'public': True,
'chat': False,
'notes': '',
'source': 'researcher'}
"name": "we are uploading videos wow products (test)",
"platform_id": "c-916305",
"category": "test",
"platform": "Rumble",
"url": "https://rumble.com/c/c-916305",
"screenname": "we are uploading",
"country": "CA",
"influencer": None,
"public": True,
"chat": False,
"notes": "",
"source": "researcher",
}
TELEGRAM_CHANNEL_KWARGS = {
'name': 'Бутылка (test)',
'platform_id': "-1001760492118",
'category': 'test',
'platform': 'Telegram',
'url': 'https://t.me/butylka1488',
'screenname': 'butylka1488',
'country': 'RU',
'influencer': None,
'public': True,
'chat': False,
'notes': '',
'source': 'researcher'}
"name": "Бутылка (test)",
"platform_id": "-1001760492118",
"category": "test",
"platform": "Telegram",
"url": "https://t.me/butylka1488",
"screenname": "butylka1488",
"country": "RU",
"influencer": None,
"public": True,
"chat": False,
"notes": "",
"source": "researcher",
}
TWITTER_CHANNEL_KWARGS = {
'name': 'L Weber (test)',
'platform_id': "1424979017749442595",
'category': 'test',
'platform': 'Twitter',
'url': 'https://twitter.com/LWeber33662141',
'screenname': 'LWeber33662141',
'country': 'US',
'influencer': None,
'public': True,
'chat': False,
'notes': '',
'source': 'researcher'}
"name": "L Weber (test)",
"platform_id": "1424979017749442595",
"category": "test",
"platform": "Twitter",
"url": "https://twitter.com/LWeber33662141",
"screenname": "LWeber33662141",
"country": "US",
"influencer": None,
"public": True,
"chat": False,
"notes": "",
"source": "researcher",
}
VKONTAKTE_CHANNEL_KWARGS = {
'name': 'Wwg1wgA (test)',
'platform_id': 'club201278078',
'category': 'test',
'platform': 'Vkontakte',
'url': 'https://vk.com/club201278078',
'screenname': 'Wwg1wgA',
'country': 'FR',
'influencer': None,
'public': True,
'chat': False,
'notes': '',
'source': 'researcher'}
"name": "Wwg1wgA (test)",
"platform_id": "club201278078",
"category": "test",
"platform": "Vkontakte",
"url": "https://vk.com/club201278078",
"screenname": "Wwg1wgA",
"country": "FR",
"influencer": None,
"public": True,
"chat": False,
"notes": "",
"source": "researcher",
}
YOUTUBE_CHANNEL_KWARGS = {
'name': 'AnEs87 (test)',
'platform_id': 'UCP6exBqGoxGLv_pM9Dxk2pA',
'category': 'test',
'platform': 'Youtube',
'url': 'https://www.youtube.com/channel/UCP6exBqGoxGLv_pM9Dxk2pA',
'screenname': 'AnEs87',
'country': 'SV',
'influencer': None,
'public': True,
'chat': False,
'notes': '',
'source': 'researcher'}
"name": "AnEs87 (test)",
"platform_id": "UCP6exBqGoxGLv_pM9Dxk2pA",
"category": "test",
"platform": "Youtube",
"url": "https://www.youtube.com/channel/UCP6exBqGoxGLv_pM9Dxk2pA",
"screenname": "AnEs87",
"country": "SV",
"influencer": None,
"public": True,
"chat": False,
"notes": "",
"source": "researcher",
}
@pytest.fixture(scope='package')
@pytest.fixture(scope="package")
def engine(tmpdir_factory):
"""Initialize a SQLite database and SQLAlchemy engine to be used for all
tests in the package"""
engine = create_engine(os.environ["TEST_DB"])
return engine
@pytest.fixture(scope='package')
@pytest.fixture(scope="package")
def session(engine):
"""Initialize a SQLAlchemy session to be used for all tests in the package"""
@@ -179,7 +191,8 @@ def session(engine):
sessionfactory.configure(bind=engine)
return sessionfactory()
@pytest.fixture(scope='package')
@pytest.fixture(scope="package")
def controller(engine):
"""Initialize ScraperController to be used for all tests in the package."""
@@ -188,7 +201,8 @@ def controller(engine):
return scraper_controller
@pytest.fixture(scope='package')
@pytest.fixture(scope="package")
def etl_controller(engine):
"""Initialize ETLController to be used for all tests in the package."""
@@ -197,21 +211,23 @@ def etl_controller(engine):
return etl_controller
@pytest.fixture(scope='package')
@pytest.fixture(scope="package")
def channel_kwargs():
"""Define keyword arguments to use for defining test channels for each
"""Define keyword arguments to use for defining test channels for each
platform to be scraped.
"""
return {
'bitchute' : BITCHUTE_CHANNEL_KWARGS,
'gab' : GAB_CHANNEL_KWARGS,
'gab_group' : GAB_GROUP_KWARGS,
'gettr' : GETTR_CHANNEL_KWARGS,
'instagram' : INSTAGRAM_CHANNEL_KWARGS,
'odysee' : ODYSEE_CHANNEL_KWARGS,
'rumble' : RUMBLE_CHANNEL_KWARGS,
'telegram' : TELEGRAM_CHANNEL_KWARGS,
'twitter' : TWITTER_CHANNEL_KWARGS,
'vkontakte' : VKONTAKTE_CHANNEL_KWARGS,
'youtube' : YOUTUBE_CHANNEL_KWARGS}
"bitchute": BITCHUTE_CHANNEL_KWARGS,
"gab": GAB_CHANNEL_KWARGS,
"gab_group": GAB_GROUP_KWARGS,
"gettr": GETTR_CHANNEL_KWARGS,
"instagram": INSTAGRAM_CHANNEL_KWARGS,
"odysee": ODYSEE_CHANNEL_KWARGS,
"rumble": RUMBLE_CHANNEL_KWARGS,
"telegram": TELEGRAM_CHANNEL_KWARGS,
"twitter": TWITTER_CHANNEL_KWARGS,
"vkontakte": VKONTAKTE_CHANNEL_KWARGS,
"youtube": YOUTUBE_CHANNEL_KWARGS,
}