Formatted with black, added a pre-commit hook, and pinned the typing_extensions package version to fix a spaCy issue

This commit is contained in:
Tristan Lee
2023-08-04 14:51:00 -05:00
parent 070ee3391d
commit fab65a5d67
25 changed files with 3043 additions and 2176 deletions

10
.github/workflows/black.yml vendored Normal file
View File

@@ -0,0 +1,10 @@
name: Lint
on: [push]
jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: psf/black@stable

6
.pre-commit-config.yaml Normal file
View File

@@ -0,0 +1,6 @@
repos:
- repo: https://github.com/psf/black
rev: 22.3.0
hooks:
- id: black
language_version: python3.9

View File

@@ -24,6 +24,8 @@ ratelimit = "*"
pytz = "*" pytz = "*"
langdetect = "*" langdetect = "*"
spacy = "==3.2.4" spacy = "==3.2.4"
# Temporary fix for https://github.com/explosion/spaCy/issues/12659
typing_extensions = "==4.4.0"
ocrd-pyexiftool = "*" ocrd-pyexiftool = "*"
filelock = "*" filelock = "*"
telethon = "*" telethon = "*"
@@ -38,6 +40,7 @@ pytest-metadata = "*"
black = "*" black = "*"
Sphinx = "*" Sphinx = "*"
sphinx-rtd-theme = "*" sphinx-rtd-theme = "*"
pre-commit = "*"
[requires] [requires]
python_version = "3.9" python_version = "3.9"

2318
Pipfile.lock generated

File diff suppressed because it is too large Load Diff

80
app.py
View File

@@ -10,7 +10,6 @@ import sys
from cisticola.base import mapper_registry from cisticola.base import mapper_registry
from cisticola.scraper import ( from cisticola.scraper import (
ScraperController, ScraperController,
# VkontakteScraper,
TelegramTelethonScraper, TelegramTelethonScraper,
GettrScraper, GettrScraper,
BitchuteScraper, BitchuteScraper,
@@ -22,11 +21,11 @@ from cisticola.transformer import (
GettrTransformer, GettrTransformer,
RumbleTransformer, RumbleTransformer,
BitchuteTransformer, BitchuteTransformer,
# VkontakteTransformer,
) )
from sync_with_gsheet import sync_channels from sync_with_gsheet import sync_channels
def get_db_session(): def get_db_session():
engine = create_engine(os.environ["DB"]) engine = create_engine(os.environ["DB"])
@@ -52,12 +51,14 @@ def get_scraper_controller(args):
TelegramTelethonScraper(telethon_session_name=telethon_session_name), TelegramTelethonScraper(telethon_session_name=telethon_session_name),
GettrScraper(), GettrScraper(),
BitchuteScraper(), BitchuteScraper(),
RumbleScraper()] RumbleScraper(),
]
controller.register_scrapers(scrapers) controller.register_scrapers(scrapers)
return controller return controller
def get_transformer_controller(args): def get_transformer_controller(args):
engine = create_engine(os.environ["DB"]) engine = create_engine(os.environ["DB"])
@@ -73,7 +74,8 @@ def get_transformer_controller(args):
TelegramTelethonTransformer(telethon_session_name=telethon_session_name), TelegramTelethonTransformer(telethon_session_name=telethon_session_name),
GettrTransformer(), GettrTransformer(),
BitchuteTransformer(), BitchuteTransformer(),
RumbleTransformer()] RumbleTransformer(),
]
controller.register_transformers(transformers) controller.register_transformers(transformers)
@@ -86,12 +88,14 @@ def scrape_channels(args):
controller = get_scraper_controller(args) controller = get_scraper_controller(args)
controller.scrape_all_channels() controller.scrape_all_channels()
def scrape_channels_old(args): def scrape_channels_old(args):
logger.info(f"Scraping old posts from channels") logger.info(f"Scraping old posts from channels")
controller = get_scraper_controller(args) controller = get_scraper_controller(args)
controller.scrape_all_channels(fetch_old=True) controller.scrape_all_channels(fetch_old=True)
def scrape_channel_info(args): def scrape_channel_info(args):
logger.info(f"Scraping channel info") logger.info(f"Scraping channel info")
@@ -109,6 +113,7 @@ def archive_media(args):
else: else:
controller.archive_unarchived_media() controller.archive_unarchived_media()
def transform(args): def transform(args):
logger.info(f"Transforming untransformed posts") logger.info(f"Transforming untransformed posts")
@@ -121,6 +126,7 @@ def transform(args):
controller.transform_all_untransformed(min_date=min_date) controller.transform_all_untransformed(min_date=min_date)
def transform_info(args): def transform_info(args):
logger.info(f"Transforming untransformed channel info") logger.info(f"Transforming untransformed channel info")
@@ -129,12 +135,14 @@ def transform_info(args):
# sync_channels(args, get_db_session()) # sync_channels(args, get_db_session())
def transform_media(args): def transform_media(args):
logger.info(f"Transforming untransformed channel media") logger.info(f"Transforming untransformed channel media")
controller = get_transformer_controller(args) controller = get_transformer_controller(args)
controller.transform_all_untransformed_media() controller.transform_all_untransformed_media()
def init_db(): def init_db():
engine = create_engine(os.environ["DB"]) engine = create_engine(os.environ["DB"])
mapper_registry.metadata.create_all(bind=engine) mapper_registry.metadata.create_all(bind=engine)
@@ -162,29 +170,77 @@ if __name__ == "__main__":
if args.command == "init-db": if args.command == "init-db":
init_db() init_db()
elif args.command == "sync-channels": elif args.command == "sync-channels":
logger.add("logs/sync-channels.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip") logger.add(
"logs/sync-channels.log",
level="DEBUG",
rotation="100 MB",
retention="2 weeks",
compression="zip",
)
sync_channels(args, get_db_session()) sync_channels(args, get_db_session())
elif args.command == "scrape-channels": elif args.command == "scrape-channels":
logger.add("logs/scrape-channels.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip") logger.add(
"logs/scrape-channels.log",
level="DEBUG",
rotation="100 MB",
retention="2 weeks",
compression="zip",
)
scrape_channels(args) scrape_channels(args)
elif args.command == "scrape-channels-old": elif args.command == "scrape-channels-old":
logger.add("logs/scrape-channels-old.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip") logger.add(
"logs/scrape-channels-old.log",
level="DEBUG",
rotation="100 MB",
retention="2 weeks",
compression="zip",
)
scrape_channels_old(args) scrape_channels_old(args)
elif args.command == "archive-media": elif args.command == "archive-media":
logger.add("logs/archive-media.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip") logger.add(
"logs/archive-media.log",
level="DEBUG",
rotation="100 MB",
retention="2 weeks",
compression="zip",
)
archive_media(args) archive_media(args)
elif args.command == "channel-info": elif args.command == "channel-info":
logger.add("logs/channel-info.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip") logger.add(
"logs/channel-info.log",
level="DEBUG",
rotation="100 MB",
retention="2 weeks",
compression="zip",
)
scrape_channel_info(args) scrape_channel_info(args)
elif args.command == "transform": elif args.command == "transform":
logger.add("logs/transform.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip") logger.add(
"logs/transform.log",
level="DEBUG",
rotation="100 MB",
retention="2 weeks",
compression="zip",
)
logger.add("logs/transform_trace.log", level="TRACE", retention="7 days") logger.add("logs/transform_trace.log", level="TRACE", retention="7 days")
transform(args) transform(args)
elif args.command == "transform-info": elif args.command == "transform-info":
logger.add("logs/transform-info.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip") logger.add(
"logs/transform-info.log",
level="DEBUG",
rotation="100 MB",
retention="2 weeks",
compression="zip",
)
transform_info(args) transform_info(args)
elif args.command == "transform-media": elif args.command == "transform-media":
logger.add("logs/transform-media.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip") logger.add(
"logs/transform-media.log",
level="DEBUG",
rotation="100 MB",
retention="2 weeks",
compression="zip",
)
transform_media(args) transform_media(args)
else: else:
logger.error(f"Unrecognized command {args.command}") logger.error(f"Unrecognized command {args.command}")

View File

@@ -6,7 +6,17 @@ import json
import io import io
from sqlalchemy.orm import registry from sqlalchemy.orm import registry
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean, Index from sqlalchemy import (
Table,
Column,
Integer,
String,
JSON,
DateTime,
ForeignKey,
Boolean,
Index,
)
from sqlalchemy.dialects.postgresql import JSONB from sqlalchemy.dialects.postgresql import JSONB
import pytesseract import pytesseract
import PIL import PIL
@@ -22,10 +32,10 @@ from .utils import make_request
# Disable decompression bomb check # Disable decompression bomb check
PIL.Image.MAX_IMAGE_PIXELS = 1024 * 1024 * 256 PIL.Image.MAX_IMAGE_PIXELS = 1024 * 1024 * 256
@dataclass @dataclass
class ScraperResult: class ScraperResult:
"""Minimally processed set of information from a scraper about one post """Minimally processed set of information from a scraper about one post"""
"""
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``. #: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
scraper: str scraper: str
@@ -54,10 +64,10 @@ class ScraperResult:
#: What date was the media archived? (None if not archived) #: What date was the media archived? (None if not archived)
media_archived: datetime media_archived: datetime
@dataclass @dataclass
class Channel: class Channel:
"""Information about a specific channel to be scraped. """Information about a specific channel to be scraped."""
"""
#: Name of channel (different from username because it can be non-unique and contain emojis), e.g. ``T🕊Редакция Президент Гордон🕊"``. #: Name of channel (different from username because it can be non-unique and contain emojis), e.g. ``T🕊Редакция Президент Гордон🕊"``.
name: str name: str
@@ -98,10 +108,10 @@ class Channel:
def hydrate(self): def hydrate(self):
pass pass
@dataclass @dataclass
class RawChannelInfo: class RawChannelInfo:
"""Minimally processed set of information from a scraper about one channel """Minimally processed set of information from a scraper about one channel"""
"""
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``. #: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
scraper: str scraper: str
@@ -118,10 +128,10 @@ class RawChannelInfo:
#: Datetime (relative to UTC) that the scraped post was archived at. #: Datetime (relative to UTC) that the scraped post was archived at.
date_archived: datetime date_archived: datetime
@dataclass @dataclass
class ChannelInfo: class ChannelInfo:
"""A processed set of information about a channel. """A processed set of information about a channel."""
"""
# Foreign key from the raw_channel_info table # Foreign key from the raw_channel_info table
raw_channel_info_id: int raw_channel_info_id: int
@@ -161,13 +171,15 @@ class ChannelInfo:
def hydrate(self): def hydrate(self):
pass pass
nlp_en = spacy.load('en_core_web_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
nlp_de = spacy.load('de_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler']) nlp_en = spacy.load("en_core_web_sm", disable=["parser", "tok2vec", "attribute_ruler"])
nlp_it = spacy.load('it_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler']) nlp_de = spacy.load("de_core_news_sm", disable=["parser", "tok2vec", "attribute_ruler"])
nlp_fr = spacy.load('fr_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler']) nlp_it = spacy.load("it_core_news_sm", disable=["parser", "tok2vec", "attribute_ruler"])
nlp_ru = spacy.load('ru_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler']) nlp_fr = spacy.load("fr_core_news_sm", disable=["parser", "tok2vec", "attribute_ruler"])
nlp_nl = spacy.load('nl_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler']) nlp_ru = spacy.load("ru_core_news_sm", disable=["parser", "tok2vec", "attribute_ruler"])
nlp_xx = spacy.load('xx_ent_wiki_sm') nlp_nl = spacy.load("nl_core_news_sm", disable=["parser", "tok2vec", "attribute_ruler"])
nlp_xx = spacy.load("xx_ent_wiki_sm")
@dataclass @dataclass
class Post: class Post:
@@ -258,7 +270,9 @@ class Post:
URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/
?(?!@)))""" URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|
za|zm|zw)\b/?(?!@)))"""
# replace is here in order to prevent catastrophic backtracking # replace is here in order to prevent catastrophic backtracking
urls = re.findall(URL_REGEX, self.content.replace("::::::::", "").replace("........", "")) urls = re.findall(
URL_REGEX, self.content.replace("::::::::", "").replace("........", "")
)
self.outlinks += urls self.outlinks += urls
self.outlinks = list(set(outlink for outlink in self.outlinks)) self.outlinks = list(set(outlink for outlink in self.outlinks))
@@ -269,10 +283,12 @@ class Post:
self.hashtags = list(set(hashtag.lower() for hashtag in self.hashtags)) self.hashtags = list(set(hashtag.lower() for hashtag in self.hashtags))
# regex patterns for finding crypto addresses # regex patterns for finding crypto addresses
BTC_REGEX = r'\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b' BTC_REGEX = r"\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b"
ETHER_REGEX = r'(0x[a-fA-F0-9]{40})' ETHER_REGEX = r"(0x[a-fA-F0-9]{40})"
self.cryptocurrency_addresses = [m[0] for m in re.findall(BTC_REGEX, self.content)] + re.findall(ETHER_REGEX, self.content) self.cryptocurrency_addresses = [
m[0] for m in re.findall(BTC_REGEX, self.content)
] + re.findall(ETHER_REGEX, self.content)
try: try:
self.detected_language = detect(self.content) self.detected_language = detect(self.content)
@@ -288,17 +304,17 @@ class Post:
def hydrate_spacy(self): def hydrate_spacy(self):
ner_only = False ner_only = False
if self.detected_language == 'en': if self.detected_language == "en":
nlp = nlp_en nlp = nlp_en
elif self.detected_language == 'de': elif self.detected_language == "de":
nlp = nlp_de nlp = nlp_de
elif self.detected_language == 'it': elif self.detected_language == "it":
nlp = nlp_it nlp = nlp_it
elif self.detected_language == 'fr': elif self.detected_language == "fr":
nlp = nlp_fr nlp = nlp_fr
elif self.detected_language == 'ru': elif self.detected_language == "ru":
nlp = nlp_ru nlp = nlp_ru
elif self.detected_language == 'nl': elif self.detected_language == "nl":
nlp = nlp_nl nlp = nlp_nl
else: else:
nlp = nlp_xx nlp = nlp_xx
@@ -307,19 +323,36 @@ class Post:
doc = nlp(self.content) doc = nlp(self.content)
if not ner_only: if not ner_only:
punctuation = ['?',':','!',',','.',';','|','(',')','--','#','=','+'] punctuation = [
tokens = [t.lemma_ for t in doc if not t.is_stop and t.lemma_ not in punctuation] "?",
self.normalized_content = ' '.join(tokens) ":",
"!",
",",
".",
";",
"|",
"(",
")",
"--",
"#",
"=",
"+",
]
tokens = [
t.lemma_ for t in doc if not t.is_stop and t.lemma_ not in punctuation
]
self.normalized_content = " ".join(tokens)
else: else:
self.normalized_content = '' self.normalized_content = ""
self.named_entities = [{'text': ent.text, 'type': ent.label_} for ent in doc.ents] self.named_entities = [
{"text": ent.text, "type": ent.label_} for ent in doc.ents
]
@dataclass @dataclass
class Media: class Media:
"""Base class for organizing information about a media file. """Base class for organizing information about a media file."""
"""
#: ID number of the media's corresponding scraped post in the ``raw_posts`` table. #: ID number of the media's corresponding scraped post in the ``raw_posts`` table.
raw_id: int raw_id: int
@@ -355,16 +388,14 @@ class Media:
exif: str = None exif: str = None
def get_blob(self): def get_blob(self):
"""Download media file as bytes blob. """Download media file as bytes blob."""
"""
blob = make_request(self.url) blob = make_request(self.url)
return blob.content return blob.content
@logger.catch @logger.catch
def hydrate(self, blob=None): def hydrate(self, blob=None):
"""Download media file as bytes blob and extract data from content. """Download media file as bytes blob and extract data from content."""
"""
if blob is None: if blob is None:
blob = self.get_blob() blob = self.get_blob()
@@ -372,8 +403,7 @@ class Media:
self.hydrate_exif(blob) self.hydrate_exif(blob)
def hydrate_exif(self, blob): def hydrate_exif(self, blob):
"""Extract Exif metadata from bytes blob. """Extract Exif metadata from bytes blob."""
"""
with tempfile.NamedTemporaryFile() as temp_file: with tempfile.NamedTemporaryFile() as temp_file:
temp_file.write(blob) temp_file.write(blob)
@@ -382,10 +412,10 @@ class Media:
exif = et.get_metadata(temp_file.name) exif = et.get_metadata(temp_file.name)
self.exif = json.dumps(exif) self.exif = json.dumps(exif)
@dataclass @dataclass
class Image(Media): class Image(Media):
"""Class for organizing information about an image file. """Class for organizing information about an image file."""
"""
#: Extracted OCR content from image #: Extracted OCR content from image
ocr: str = None ocr: str = None
@@ -403,135 +433,152 @@ class Image(Media):
self.hydrate_ocr(blob) self.hydrate_ocr(blob)
def hydrate_ocr(self, blob): def hydrate_ocr(self, blob):
"""Extract OCR (optical character recognition) data from image bytes blob. """Extract OCR (optical character recognition) data from image bytes blob."""
"""
image = PIL.Image.open(io.BytesIO(blob)) image = PIL.Image.open(io.BytesIO(blob))
self.ocr = pytesseract.image_to_string(image) self.ocr = pytesseract.image_to_string(image)
@dataclass @dataclass
class Video(Media): class Video(Media):
"""Class for organizing information about an video file. """Class for organizing information about an video file."""
"""
pass pass
@dataclass @dataclass
class Audio(Media): class Audio(Media):
"""Class for organizing information about an audio file. """Class for organizing information about an audio file."""
"""
pass pass
mapper_registry = registry() mapper_registry = registry()
raw_posts_table = Table('raw_posts', mapper_registry.metadata, raw_posts_table = Table(
Column('id', Integer, primary_key=True, "raw_posts",
autoincrement=True), mapper_registry.metadata,
Column('scraper', String), Column("id", Integer, primary_key=True, autoincrement=True),
Column('platform', String), Column("scraper", String),
Column('channel', Integer, ForeignKey('channels.id'), index=True), Column("platform", String),
Column('platform_id', String, index=True), Column("channel", Integer, ForeignKey("channels.id"), index=True),
Column('date', DateTime, index=True), Column("platform_id", String, index=True),
Column('raw_data', String), Column("date", DateTime, index=True),
Column('date_archived', DateTime, index=True), Column("raw_data", String),
Column('archived_urls', JSON), Column("date_archived", DateTime, index=True),
Column('media_archived', DateTime, index=True)) Column("archived_urls", JSON),
Column("media_archived", DateTime, index=True),
raw_channel_info_table = Table('raw_channel_info', mapper_registry.metadata,
Column('id', Integer, primary_key=True),
Column('scraper', String),
Column('platform', String),
Column('channel', Integer, ForeignKey('channels.id'), index=True),
Column('raw_data', String),
Column('date_archived', DateTime, index=True))
channel_info_table = Table('channel_info', mapper_registry.metadata,
Column('id', Integer, primary_key=True, autoincrement=True),
Column('raw_channel_info_id', Integer, ForeignKey('raw_channel_info.id'), index=True),
Column('channel', Integer, ForeignKey('channels.id'), index=True),
Column('platform_id', String),
Column('scraper', String),
Column('transformer', String),
Column('platform', String),
Column('screenname', String),
Column('name', String),
Column('description', String),
Column('description_url', String),
Column('description_location', String),
Column('followers', Integer),
Column('following', Integer),
Column('verified', Boolean),
Column('date_created', DateTime),
Column('date_archived', DateTime, index=True),
Column('date_transformed', DateTime, index=True),
) )
channel_table = Table('channels', mapper_registry.metadata, raw_channel_info_table = Table(
Column('id', Integer, primary_key=True, autoincrement=True), "raw_channel_info",
Column('name', String), mapper_registry.metadata,
Column('platform_id', String), Column("id", Integer, primary_key=True),
Column('category', String), Column("scraper", String),
Column('platform', String), Column("platform", String),
Column('url', String), Column("channel", Integer, ForeignKey("channels.id"), index=True),
Column('screenname', String), Column("raw_data", String),
Column('country', JSONB, index = True), Column("date_archived", DateTime, index=True),
Column('influencer', String),
Column('public', Boolean),
Column('chat', Boolean),
Column('notes', String),
Column('source', String)
) )
post_table = Table('posts', mapper_registry.metadata, channel_info_table = Table(
Column('id', Integer, primary_key=True, "channel_info",
autoincrement=True), mapper_registry.metadata,
Column('raw_id', Integer, ForeignKey('raw_posts.id'), index=True), Column("id", Integer, primary_key=True, autoincrement=True),
Column('platform_id', String, index=True), Column(
Column('scraper', String), "raw_channel_info_id", Integer, ForeignKey("raw_channel_info.id"), index=True
Column('transformer', String), ),
Column('platform', String), Column("channel", Integer, ForeignKey("channels.id"), index=True),
Column('channel', Integer, ForeignKey('channels.id'), index=True), Column("platform_id", String),
Column('date', DateTime, index=True), Column("scraper", String),
Column('date_archived', DateTime, index=True), Column("transformer", String),
Column('date_transformed', DateTime, index=True), Column("platform", String),
Column('url', String), Column("screenname", String),
Column('author_id', String), Column("name", String),
Column('author_username', String), Column("description", String),
Column('content', String), Column("description_url", String),
Column('forwarded_from', Integer, ForeignKey('channels.id'), index=True), Column("description_location", String),
Column('reply_to', Integer, ForeignKey('posts.id'), index=True), Column("followers", Integer),
Column('named_entities', JSON), Column("following", Integer),
Column('cryptocurrency_addresses', JSON), Column("verified", Boolean),
Column('hashtags', JSON), Column("date_created", DateTime),
Column('outlinks', JSON), Column("date_archived", DateTime, index=True),
Column('mentions', JSON), Column("date_transformed", DateTime, index=True),
Column('likes', Integer),
Column('forwards', Integer),
Column('views', Integer),
Column('video_title', String),
Column('video_duration', Integer),
Column('detected_language', String, index = True),
Column('normalized_content', String)
) )
posts_forwarded_from_channel_index = Index('posts_channel_forwarded_from_idx', post_table.c.channel, post_table.c.forwarded_from) channel_table = Table(
"channels",
mapper_registry.metadata,
Column("id", Integer, primary_key=True, autoincrement=True),
Column("name", String),
Column("platform_id", String),
Column("category", String),
Column("platform", String),
Column("url", String),
Column("screenname", String),
Column("country", JSONB, index=True),
Column("influencer", String),
Column("public", Boolean),
Column("chat", Boolean),
Column("notes", String),
Column("source", String),
)
media_table = Table('media', mapper_registry.metadata, post_table = Table(
Column('id', Integer, primary_key=True, "posts",
autoincrement=True), mapper_registry.metadata,
Column('type', String), Column("id", Integer, primary_key=True, autoincrement=True),
Column('raw_id', Integer, ForeignKey('raw_posts.id'), index=True), Column("raw_id", Integer, ForeignKey("raw_posts.id"), index=True),
Column('post', Integer, ForeignKey('posts.id'), index=True), Column("platform_id", String, index=True),
Column('url', String), Column("scraper", String),
Column('original_url', String), Column("transformer", String),
Column('exif', String), Column("platform", String),
Column('ocr', String), Column("channel", Integer, ForeignKey("channels.id"), index=True),
Column('date', DateTime, index=True), Column("date", DateTime, index=True),
Column('date_archived', DateTime, index=True), Column("date_archived", DateTime, index=True),
Column('date_transformed', DateTime, index=True), Column("date_transformed", DateTime, index=True),
Column('scraper', String), Column("url", String),
Column('transformer', String) Column("author_id", String),
Column("author_username", String),
Column("content", String),
Column("forwarded_from", Integer, ForeignKey("channels.id"), index=True),
Column("reply_to", Integer, ForeignKey("posts.id"), index=True),
Column("named_entities", JSON),
Column("cryptocurrency_addresses", JSON),
Column("hashtags", JSON),
Column("outlinks", JSON),
Column("mentions", JSON),
Column("likes", Integer),
Column("forwards", Integer),
Column("views", Integer),
Column("video_title", String),
Column("video_duration", Integer),
Column("detected_language", String, index=True),
Column("normalized_content", String),
)
posts_forwarded_from_channel_index = Index(
"posts_channel_forwarded_from_idx",
post_table.c.channel,
post_table.c.forwarded_from,
)
media_table = Table(
"media",
mapper_registry.metadata,
Column("id", Integer, primary_key=True, autoincrement=True),
Column("type", String),
Column("raw_id", Integer, ForeignKey("raw_posts.id"), index=True),
Column("post", Integer, ForeignKey("posts.id"), index=True),
Column("url", String),
Column("original_url", String),
Column("exif", String),
Column("ocr", String),
Column("date", DateTime, index=True),
Column("date_archived", DateTime, index=True),
Column("date_transformed", DateTime, index=True),
Column("scraper", String),
Column("transformer", String),
) )
mapper_registry.map_imperatively(Post, post_table) mapper_registry.map_imperatively(Post, post_table)
@@ -539,7 +586,27 @@ mapper_registry.map_imperatively(Channel, channel_table)
mapper_registry.map_imperatively(ScraperResult, raw_posts_table) mapper_registry.map_imperatively(ScraperResult, raw_posts_table)
mapper_registry.map_imperatively(RawChannelInfo, raw_channel_info_table) mapper_registry.map_imperatively(RawChannelInfo, raw_channel_info_table)
mapper_registry.map_imperatively(ChannelInfo, channel_info_table) mapper_registry.map_imperatively(ChannelInfo, channel_info_table)
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media') mapper_registry.map_imperatively(
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image') Media, media_table, polymorphic_on="type", polymorphic_identity="media"
mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video') )
mapper_registry.map_imperatively(Audio, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='audio') mapper_registry.map_imperatively(
Image,
media_table,
inherits=Media,
polymorphic_on="type",
polymorphic_identity="image",
)
mapper_registry.map_imperatively(
Video,
media_table,
inherits=Media,
polymorphic_on="type",
polymorphic_identity="video",
)
mapper_registry.map_imperatively(
Audio,
media_table,
inherits=Media,
polymorphic_on="type",
polymorphic_identity="audio",
)

View File

@@ -18,6 +18,7 @@ from sqlalchemy import nullsfirst
from cisticola.base import Channel, RawChannelInfo, ScraperResult, mapper_registry from cisticola.base import Channel, RawChannelInfo, ScraperResult, mapper_registry
from cisticola.utils import make_request from cisticola.utils import make_request
class Scraper: class Scraper:
"""Base class for defining platform-specific scrapers for scraping all posts """Base class for defining platform-specific scrapers for scraping all posts
from a given channel on that specific platform. from a given channel on that specific platform.
@@ -25,23 +26,26 @@ class Scraper:
__version__ = "Scraper 0.0.0" __version__ = "Scraper 0.0.0"
cookiestring = os.environ["YOUTUBE_COOKIESTRING"].replace(r'\n', '\n').replace(r'\t', '\t') cookiestring = (
cookiefilename = 'cookiefile.txt' os.environ["YOUTUBE_COOKIESTRING"].replace(r"\n", "\n").replace(r"\t", "\t")
)
cookiefilename = "cookiefile.txt"
def __init__(self): def __init__(self):
# Initialize client to transfer files to the storage archive # Initialize client to transfer files to the storage archive
self.s3_client = boto3.client( self.s3_client = boto3.client(
service_name='s3', service_name="s3",
region_name=os.environ['DO_SPACES_REGION'], region_name=os.environ["DO_SPACES_REGION"],
endpoint_url=f'https://{os.environ["DO_SPACES_REGION"]}.digitaloceanspaces.com', endpoint_url=f'https://{os.environ["DO_SPACES_REGION"]}.digitaloceanspaces.com',
aws_access_key_id=os.environ['DO_SPACES_KEY'], aws_access_key_id=os.environ["DO_SPACES_KEY"],
aws_secret_access_key=os.environ['DO_SPACES_SECRET']) aws_secret_access_key=os.environ["DO_SPACES_SECRET"],
)
# Define request headers (necessary to bypass scraping protection # Define request headers (necessary to bypass scraping protection
# for several platform scrapers) # for several platform scrapers)
self.headers = { self.headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'} "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"
}
def __str__(self): def __str__(self):
return self.__version__ return self.__version__
@@ -83,7 +87,7 @@ class Scraper:
the original post URL and the media's Content-Type. the original post URL and the media's Content-Type.
""" """
key = urlparse(url).path.split('/')[-1] key = urlparse(url).path.split("/")[-1]
return key return key
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
@@ -111,7 +115,7 @@ class Scraper:
r = make_request(url, headers=self.headers) r = make_request(url, headers=self.headers)
blob = r.content blob = r.content
content_type = r.headers.get('Content-Type') content_type = r.headers.get("Content-Type")
if key is None: if key is None:
key = self.url_to_key(url, content_type) key = self.url_to_key(url, content_type)
@@ -141,17 +145,16 @@ class Scraper:
Unique identifier for the media file. Unique identifier for the media file.
""" """
content_type = 'video/mp4' content_type = "video/mp4"
ext = '.' + content_type.split('/')[-1] ext = "." + content_type.split("/")[-1]
with tempfile.NamedTemporaryFile(suffix=ext) as temp_file: with tempfile.NamedTemporaryFile(suffix=ext) as temp_file:
( (
ffmpeg ffmpeg.input(url)
.input(url) .output(temp_file.name, vcodec="copy")
.output(temp_file.name, vcodec='copy') .global_args("-loglevel", "error")
.global_args('-loglevel', 'error') .run(overwrite_output=True)
.run(overwrite_output=True)) )
temp_file.seek(0) temp_file.seek(0)
blob = temp_file.read() blob = temp_file.read()
@@ -184,11 +187,11 @@ class Scraper:
Unique identifier for the media file. Unique identifier for the media file.
""" """
content_type = 'video/mp4' content_type = "video/mp4"
with tempfile.TemporaryDirectory() as temp_dir: with tempfile.TemporaryDirectory() as temp_dir:
cookiefile = Path(temp_dir) / self.cookiefilename cookiefile = Path(temp_dir) / self.cookiefilename
with open(cookiefile, 'w') as f: with open(cookiefile, "w") as f:
f.write(self.cookiestring) f.write(self.cookiestring)
ydl_opts = { ydl_opts = {
@@ -199,14 +202,16 @@ class Scraper:
"quiet": True, "quiet": True,
"verbose": False, "verbose": False,
"retries": 5, "retries": 5,
"cookiefile": cookiefile} "cookiefile": cookiefile,
}
ydl = yt_dlp.YoutubeDL(ydl_opts) ydl = yt_dlp.YoutubeDL(ydl_opts)
try: try:
meta = ydl.extract_info( meta = ydl.extract_info(
url, url,
download=True,) download=True,
)
except yt_dlp.utils.DownloadError as e: except yt_dlp.utils.DownloadError as e:
raise e raise e
else: else:
@@ -240,12 +245,16 @@ class Scraper:
URL specifying the file on the storage archive. URL specifying the file on the storage archive.
""" """
filename = self.__version__.replace(' ', '_') + '/' + key filename = self.__version__.replace(" ", "_") + "/" + key
self.s3_client.upload_fileobj(BytesIO(blob), Bucket=os.environ[ self.s3_client.upload_fileobj(
'DO_BUCKET'], Key=filename, ExtraArgs={'ACL': 'public-read', 'ContentType': content_type}) BytesIO(blob),
Bucket=os.environ["DO_BUCKET"],
Key=filename,
ExtraArgs={"ACL": "public-read", "ContentType": content_type},
)
archived_url = os.environ['DO_URL'] + '/' + filename archived_url = os.environ["DO_URL"] + "/" + filename
return archived_url return archived_url
@@ -292,7 +301,9 @@ class Scraper:
raise NotImplementedError raise NotImplementedError
@logger.catch @logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: def get_posts(
self, channel: Channel, since: ScraperResult = None
) -> Generator[ScraperResult, None, None]:
"""Scrape all posts from the specified Channel. """Scrape all posts from the specified Channel.
Parameters Parameters
@@ -342,8 +353,7 @@ class ScraperController:
self.scrapers.extend(scrapers) self.scrapers.extend(scrapers)
def remove_all_scrapers(self): def remove_all_scrapers(self):
"""Reset the ScraperController so that it doesn't control any scrapers """Reset the ScraperController so that it doesn't control any scrapers"""
"""
self.scrapers = [] self.scrapers = []
def scrape_all_channels(self, fetch_old: bool = False): def scrape_all_channels(self, fetch_old: bool = False):
@@ -362,15 +372,23 @@ class ScraperController:
session = self.session() session = self.session()
# TODO there should be a better/more generic way of selecting scrapeable channels # TODO there should be a better/more generic way of selecting scrapeable channels
channels = session.query(Channel).filter((Channel.source=='researcher')|(Channel.source=='snowball_it')|(Channel.source=='snowball_complete')|(Channel.source=='linked_channel')).all() channels = (
session.query(Channel)
.filter(
(Channel.source == "researcher")
| (Channel.source == "snowball_it")
| (Channel.source == "snowball_complete")
| (Channel.source == "linked_channel")
)
.all()
)
session.close() session.close()
return self.scrape_channels(channels, fetch_old=fetch_old) return self.scrape_channels(channels, fetch_old=fetch_old)
def scrape_all_channel_info(self): def scrape_all_channel_info(self):
"""Scrape profile information from all channels in the database. """Scrape profile information from all channels in the database."""
"""
if self.session is None: if self.session is None:
logger.error("No DB session") logger.error("No DB session")
return return
@@ -379,11 +397,28 @@ class ScraperController:
# Because of rate limiting, we may not be able to successfully scrape info for all of these channels. # Because of rate limiting, we may not be able to successfully scrape info for all of these channels.
# This will sort the channels by the least recently scraped. # This will sort the channels by the least recently scraped.
most_recently_archived = session.query(func.max(RawChannelInfo.date_archived).label("date"), RawChannelInfo.channel.label("channel")).group_by(RawChannelInfo.channel).subquery() most_recently_archived = (
channels = session.query(Channel).\ session.query(
filter((Channel.source=='researcher')|(Channel.source=='snowball_it')|(Channel.source=='snowball_complete')|(Channel.source=='linked_channel')).\ func.max(RawChannelInfo.date_archived).label("date"),
outerjoin(most_recently_archived, Channel.id == most_recently_archived.c.channel).\ RawChannelInfo.channel.label("channel"),
order_by(nullsfirst(most_recently_archived.c.date.asc())).all() )
.group_by(RawChannelInfo.channel)
.subquery()
)
channels = (
session.query(Channel)
.filter(
(Channel.source == "researcher")
| (Channel.source == "snowball_it")
| (Channel.source == "snowball_complete")
| (Channel.source == "linked_channel")
)
.outerjoin(
most_recently_archived, Channel.id == most_recently_archived.c.channel
)
.order_by(nullsfirst(most_recently_archived.c.date.asc()))
.all()
)
session.close() session.close()
return self.scrape_channel_info(channels) return self.scrape_channel_info(channels)
@@ -408,12 +443,17 @@ class ScraperController:
# If any channels are not already in the database, add them # If any channels are not already in the database, add them
for channel in channels: for channel in channels:
platform_id = None platform_id = None
if channel.platform_id not in (None, ''): if channel.platform_id not in (None, ""):
platform_id = channel.platform_id platform_id = channel.platform_id
channel_in_db = session.query(Channel).filter_by(platform_id=platform_id, platform=channel.platform, url=channel.url).first() channel_in_db = (
session.query(Channel)
.filter_by(
platform_id=platform_id, platform=channel.platform, url=channel.url
)
.first()
)
if not channel_in_db: if not channel_in_db:
logger.debug(f"{channel} does not exist in database, adding") logger.debug(f"{channel} does not exist in database, adding")
@@ -429,13 +469,17 @@ class ScraperController:
handled = True handled = True
added = 0 added = 0
if fetch_old and channel.platform == 'Telegram': if fetch_old and channel.platform == "Telegram":
# get oldest post (currently only for Telegram) # get oldest post (currently only for Telegram)
# TODO fix this so that it doesn't have an explicit check on channel.platform (should be generic) # TODO fix this so that it doesn't have an explicit check on channel.platform (should be generic)
# TODO implement until on all scrapers # TODO implement until on all scrapers
rows = session.query(ScraperResult).where( rows = (
ScraperResult.channel == channel.id).order_by( session.query(ScraperResult)
ScraperResult.date.asc(), ScraperResult.id.desc()).limit(10).all() .where(ScraperResult.channel == channel.id)
.order_by(ScraperResult.date.asc(), ScraperResult.id.desc())
.limit(10)
.all()
)
if len(rows) > 0: if len(rows) > 0:
until = rows[0] until = rows[0]
@@ -449,9 +493,13 @@ class ScraperController:
# Note: a "bug" in Postgres can cause this query to hang for a really long time # Note: a "bug" in Postgres can cause this query to hang for a really long time
# when searching for a single row, hence the limit(10).all() when we really just need # when searching for a single row, hence the limit(10).all() when we really just need
# the first row. # the first row.
rows = session.query(ScraperResult).where( rows = (
ScraperResult.channel == channel.id).order_by( session.query(ScraperResult)
ScraperResult.date.desc(), ScraperResult.id.asc()).limit(10).all() .where(ScraperResult.channel == channel.id)
.order_by(ScraperResult.date.desc(), ScraperResult.id.asc())
.limit(10)
.all()
)
if len(rows) > 0: if len(rows) > 0:
since = rows[0] since = rows[0]
@@ -466,8 +514,7 @@ class ScraperController:
added += 1 added += 1
session.commit() session.commit()
logger.info( logger.info(f"{scraper} found {added} new posts from {channel}")
f"{scraper} found {added} new posts from {channel}")
break break
if not handled: if not handled:
@@ -489,11 +536,24 @@ class ScraperController:
if session is None: if session is None:
session = self.session() session = self.session()
if chronological: if chronological:
posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).where(ScraperResult.id >= 0).order_by(ScraperResult.date.desc()).limit(5000).all() posts = (
session.query(ScraperResult)
.where(ScraperResult.media_archived == None)
.where(ScraperResult.id >= 0)
.order_by(ScraperResult.date.desc())
.limit(5000)
.all()
)
else: else:
# this query is really slow (~2.5 minutes) because of the shuffle. shuffling is so that multiple media archivers could work # this query is really slow (~2.5 minutes) because of the shuffle. shuffling is so that multiple media archivers could work
# simultaneously with low risk of collision (at least while the number of unarchived items is very large) # simultaneously with low risk of collision (at least while the number of unarchived items is very large)
posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).order_by(func.random()).limit(5000).all() posts = (
session.query(ScraperResult)
.where(ScraperResult.media_archived == None)
.order_by(func.random())
.limit(5000)
.all()
)
logger.info(f"Found {len(posts)} posts without media. Archiving now") logger.info(f"Found {len(posts)} posts without media. Archiving now")
@@ -502,13 +562,23 @@ class ScraperController:
for scraper in self.scrapers: for scraper in self.scrapers:
# compare major versions # compare major versions
if post.scraper is not None and scraper.__version__.split('.')[0] == post.scraper.split('.')[0]: if (
post.scraper is not None
and scraper.__version__.split(".")[0] == post.scraper.split(".")[0]
):
handled = True handled = True
logger.debug(f"{scraper} is archiving media for ID {post.id}") logger.debug(f"{scraper} is archiving media for ID {post.id}")
post = scraper.archive_files(post) post = scraper.archive_files(post)
if post: if post:
session.query(ScraperResult).where(ScraperResult.id == post.id).update({'archived_urls': post.archived_urls, 'media_archived': post.media_archived}) session.query(ScraperResult).where(
ScraperResult.id == post.id
).update(
{
"archived_urls": post.archived_urls,
"media_archived": post.media_archived,
}
)
session.commit() session.commit()
break break
@@ -535,7 +605,9 @@ class ScraperController:
session = self.session() session = self.session()
while True: while True:
self.archive_unarchived_media_batch(self, session=session, chronological=chronological) self.archive_unarchived_media_batch(
self, session=session, chronological=chronological
)
@logger.catch(reraise=True) @logger.catch(reraise=True)
def scrape_channel_info(self, channels: List[Channel]): def scrape_channel_info(self, channels: List[Channel]):
@@ -571,8 +643,7 @@ class ScraperController:
session.add(info) session.add(info)
session.commit() session.commit()
logger.info( logger.info(f"{scraper} found {info}")
f"{scraper} found {info}")
break break
except ChannelDoesNotExistError: except ChannelDoesNotExistError:
logger.warning(f"ChannelDoesNotExist {channel}") logger.warning(f"ChannelDoesNotExist {channel}")
@@ -599,13 +670,13 @@ class ScraperController:
self.session.configure(bind=self.engine) self.session.configure(bind=self.engine)
def reset_db(self): def reset_db(self):
"""Drop all data from the connected SQLAlchemy database. """Drop all data from the connected SQLAlchemy database."""
"""
close_all_sessions() close_all_sessions()
mapper_registry.metadata.drop_all(bind=self.engine) mapper_registry.metadata.drop_all(bind=self.engine)
self.connect_to_db(self.engine) self.connect_to_db(self.engine)
class ChannelDoesNotExistError(Exception): class ChannelDoesNotExistError(Exception):
"""The specified channel does not exist or has been deleted.""" """The specified channel does not exist or has been deleted."""

View File

@@ -14,105 +14,129 @@ from loguru import logger
from cisticola.base import Channel, ScraperResult, RawChannelInfo from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper from cisticola.scraper.base import Scraper
class BitchuteScraper(Scraper): class BitchuteScraper(Scraper):
"""An implementation of a Scraper for Bitchute, using classes from the 4cat """An implementation of a Scraper for Bitchute, using classes from the 4cat
library""" library"""
__version__ = "BitchuteScraper 0.0.1" __version__ = "BitchuteScraper 0.0.1"
def get_username_from_url(self, url): def get_username_from_url(self, url):
username = url.split('bitchute.com/channel/')[-1].strip('/') username = url.split("bitchute.com/channel/")[-1].strip("/")
return username return username
@logger.catch @logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: def get_posts(
self, channel: Channel, since: ScraperResult = None
) -> Generator[ScraperResult, None, None]:
session = requests.Session() session = requests.Session()
session.headers.update(self.headers) session.headers.update(self.headers)
request = session.get("https://www.bitchute.com/search") request = session.get("https://www.bitchute.com/search")
csrftoken = BeautifulSoup(request.text, 'html.parser').findAll( csrftoken = (
"input", {"name": "csrfmiddlewaretoken"})[0].get("value") BeautifulSoup(request.text, "html.parser")
.findAll("input", {"name": "csrfmiddlewaretoken"})[0]
.get("value")
)
time.sleep(0.25) time.sleep(0.25)
detail = 'comments' detail = "comments"
username = self.get_username_from_url(channel.url) username = self.get_username_from_url(channel.url)
scraper = get_videos_user(session, username, csrftoken, detail) scraper = get_videos_user(session, username, csrftoken, detail)
for post in scraper: for post in scraper:
if (
if since is not None and datetime.fromtimestamp(post['timestamp']) <= since.date: since is not None
and datetime.fromtimestamp(post["timestamp"]) <= since.date
):
break break
archived_urls = {} archived_urls = {}
if 'video_url' in post: if "video_url" in post:
url = post['video_url'] url = post["video_url"]
archived_urls[url] = None archived_urls[url] = None
yield ScraperResult( yield ScraperResult(
scraper=self.__version__, scraper=self.__version__,
platform="Bitchute", platform="Bitchute",
channel=channel.id, channel=channel.id,
platform_id=post['id'], platform_id=post["id"],
date=datetime.fromtimestamp(post['timestamp']), date=datetime.fromtimestamp(post["timestamp"]),
date_archived=datetime.now(timezone.utc), date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post), raw_data=json.dumps(post),
archived_urls=archived_urls, archived_urls=archived_urls,
media_archived=None) media_archived=None,
)
def can_handle(self, channel): def can_handle(self, channel):
if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None: if (
channel.platform == "Bitchute"
and self.get_username_from_url(channel.url) is not None
):
return True return True
@logger.catch @logger.catch
def get_profile(self, channel: Channel) -> RawChannelInfo: def get_profile(self, channel: Channel) -> RawChannelInfo:
base_url = channel.url base_url = channel.url
session = requests.session() session = requests.session()
response = session.get(base_url) response = session.get(base_url)
soup = BeautifulSoup(response.content, 'html.parser') soup = BeautifulSoup(response.content, "html.parser")
canonical_url = soup.find('link', {'id' : 'canonical'})['href'] canonical_url = soup.find("link", {"id": "canonical"})["href"]
csrftoken = session.cookies['csrftoken'] csrftoken = session.cookies["csrftoken"]
csrfmiddlewaretoken = soup.find('input', {'name' : 'csrfmiddlewaretoken'})['value'] csrfmiddlewaretoken = soup.find("input", {"name": "csrfmiddlewaretoken"})[
"value"
]
about_soup = soup.find('div', {'id' : 'channel-about'}) about_soup = soup.find("div", {"id": "channel-about"})
info_list = about_soup.find('div', {'class' : 'channel-about-details'}).find_all('p') info_list = about_soup.find("div", {"class": "channel-about-details"}).find_all(
description_soup = about_soup.find('div', {'id' : 'channel-description'}) "p"
)
description_soup = about_soup.find("div", {"id": "channel-description"})
headers = {'Referer': base_url} headers = {"Referer": base_url}
data = { data = {"csrftoken": csrftoken, "csrfmiddlewaretoken": csrfmiddlewaretoken}
'csrftoken': csrftoken,
'csrfmiddlewaretoken': csrfmiddlewaretoken}
response = session.post(canonical_url + 'counts/', data = data, headers = headers) response = session.post(canonical_url + "counts/", data=data, headers=headers)
counts = json.loads(response.text) counts = json.loads(response.text)
owner_soup = soup.find('p', {'class' : 'owner'}) owner_soup = soup.find("p", {"class": "owner"})
if owner_soup.text == '[email\xa0protected]': if owner_soup.text == "[email\xa0protected]":
owner_name = decode_cfemail(owner_soup.find('span', {'class': "__cf_email__"})['data-cfemail']) owner_name = decode_cfemail(
owner_soup.find("span", {"class": "__cf_email__"})["data-cfemail"]
)
else: else:
owner_name = owner_soup.text owner_name = owner_soup.text
profile = { profile = {
'description' : description_soup.text.strip(), "description": description_soup.text.strip(),
'description_links' : [a['href'] for a in description_soup.find_all('a', href = True)], "description_links": [
'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')), a["href"] for a in description_soup.find_all("a", href=True)
'videos' : int(info_list[1].text.split('videos')[0].strip()), ],
'owner_url' : soup.find('p', {'class' : 'owner'}).find('a', href = True)['href'], "created": re.sub(
'owner_name' : owner_name, r"\s", " ", info_list[0].text.split("Created")[1].strip(". ")
'image' : about_soup.find('img', {'alt' : 'Channel Image'}).get('data-src'), ),
'subscribers': counts['subscriber_count'], "videos": int(info_list[1].text.split("videos")[0].strip()),
'views': int(counts['about_view_count'].split(' ')[0])} "owner_url": soup.find("p", {"class": "owner"}).find("a", href=True)[
"href"
],
"owner_name": owner_name,
"image": about_soup.find("img", {"alt": "Channel Image"}).get("data-src"),
"subscribers": counts["subscriber_count"],
"views": int(counts["about_view_count"].split(" ")[0]),
}
return RawChannelInfo(
return RawChannelInfo(scraper=self.__version__, scraper=self.__version__,
platform=channel.platform, platform=channel.platform,
channel=channel.id, channel=channel.id,
raw_data=json.dumps(profile, default=str), raw_data=json.dumps(profile, default=str),
date_archived=datetime.now(timezone.utc)) date_archived=datetime.now(timezone.utc),
)
def strip_tags(html, convert_newlines=True): def strip_tags(html, convert_newlines=True):
r""" r"""
@@ -149,6 +173,7 @@ def strip_tags(html, convert_newlines=True):
stripper.feed(html) stripper.feed(html)
return stripper.get_data() return stripper.get_data()
def request_from_bitchute(session, method, url, headers=None, data=None): def request_from_bitchute(session, method, url, headers=None, data=None):
""" """
Request something via the BitChute API (or non-API) Request something via the BitChute API (or non-API)
@@ -176,7 +201,10 @@ def request_from_bitchute(session, method, url, headers=None, data=None):
raise NotImplemented() raise NotImplemented()
if request.status_code >= 300: if request.status_code >= 300:
raise ValueError("Response %i from BitChute for URL %s, need to retry" % (request.status_code, url)) raise ValueError(
"Response %i from BitChute for URL %s, need to retry"
% (request.status_code, url)
)
response = request.json() response = request.json()
return response return response
@@ -193,6 +221,7 @@ def request_from_bitchute(session, method, url, headers=None, data=None):
return response return response
def append_details(video, detail): def append_details(video, detail):
""" """
Append extra metadata to video data Append extra metadata to video data
@@ -214,7 +243,7 @@ def append_details(video, detail):
"comments": "", "comments": "",
"hashtags": "", "hashtags": "",
"parent_id": "", "parent_id": "",
"video_url": "" "video_url": "",
} }
try: try:
@@ -223,15 +252,23 @@ def append_details(video, detail):
video_session = requests.session() video_session = requests.session()
video_page = video_session.get(video["url"]) video_page = video_session.get(video["url"])
if "<h1 class=\"page-title\">Video Restricted</h1>" in video_page.text or \ if (
"<h1 class=\"page-title\">Video Blocked</h1>" in video_page.text or \ '<h1 class="page-title">Video Restricted</h1>' in video_page.text
"<h1 class=\"page-title\">Channel Blocked</h1>" in video_page.text or \ or '<h1 class="page-title">Video Blocked</h1>' in video_page.text
"<h1 class=\"page-title\">Channel Restricted</h1>" in video_page.text: or '<h1 class="page-title">Channel Blocked</h1>' in video_page.text
if "This video is unavailable as the contents have been deemed potentially illegal" in video_page.text: or '<h1 class="page-title">Channel Restricted</h1>' in video_page.text
):
if (
"This video is unavailable as the contents have been deemed potentially illegal"
in video_page.text
):
video["category"] = "moderated-illegal" video["category"] = "moderated-illegal"
return (video, []) return (video, [])
elif "Viewing of this video is restricted, as it has been marked as Not Safe For Life" in video_page.text: elif (
"Viewing of this video is restricted, as it has been marked as Not Safe For Life"
in video_page.text
):
video["category"] = "moderated-nsfl" video["category"] = "moderated-nsfl"
return (video, []) return (video, [])
@@ -255,39 +292,47 @@ def append_details(video, detail):
video["category"] = "moderated-other" video["category"] = "moderated-other"
return (video, []) return (video, [])
elif "<iframe class=\"rumble\"" in video_page.text: elif '<iframe class="rumble"' in video_page.text:
# some videos are actually embeds from rumble? # some videos are actually embeds from rumble?
# these are iframes, so at the moment we cannot simply extract # these are iframes, so at the moment we cannot simply extract
# their info from the page, so we skip them. In the future we # their info from the page, so we skip them. In the future we
# could add an extra request to get the relevant info, but so # could add an extra request to get the relevant info, but so
# far the only examples I've seen are actually 'video not found' # far the only examples I've seen are actually 'video not found'
video = { video = {**video, "category": "error-embed-from-rumble"}
**video,
"category": "error-embed-from-rumble"
}
return (video, []) return (video, [])
elif video_page.status_code != 200: elif video_page.status_code != 200:
video = { video = {**video, "category": "error-%i" % video_page.status_code}
**video,
"category": "error-%i" % video_page.status_code
}
return (video, []) return (video, [])
soup = BeautifulSoup(video_page.text, 'html.parser') soup = BeautifulSoup(video_page.text, "html.parser")
video_csfrtoken = soup.findAll("input", {"name": "csrfmiddlewaretoken"})[0].get("value") video_csfrtoken = soup.findAll("input", {"name": "csrfmiddlewaretoken"})[0].get(
"value"
)
video["video_url"] = soup.select_one("video#player source").get("src") video["video_url"] = soup.select_one("video#player source").get("src")
video["thumbnail_image"] = soup.select_one("video#player").get("poster") video["thumbnail_image"] = soup.select_one("video#player").get("poster")
video["subject"] = soup.select_one("h1#video-title").text video["subject"] = soup.select_one("h1#video-title").text
video["author_id"] = soup.select_one("p.owner a").get("href").split("/")[2] video["author_id"] = soup.select_one("p.owner a").get("href").split("/")[2]
video["author"] = soup.select_one("div.channel-banner p.name a").get("href").split("/")[2] video["author"] = (
video["body"] = soup.select_one("div#video-description").encode_contents().decode("utf-8").strip() soup.select_one("div.channel-banner p.name a").get("href").split("/")[2]
)
video["body"] = (
soup.select_one("div#video-description")
.encode_contents()
.decode("utf-8")
.strip()
)
# we need *two more requests* to get the comment count and like/dislike counts # we need *two more requests* to get the comment count and like/dislike counts
# this seems to be because bitchute uses a third-party comment widget # this seems to be because bitchute uses a third-party comment widget
video_session.headers = {'Referer': video["url"], 'Origin': video["url"]} video_session.headers = {"Referer": video["url"], "Origin": video["url"]}
counts = request_from_bitchute(video_session, "POST", "https://www.bitchute.com/video/%s/counts/" % video["id"], data={"csrfmiddlewaretoken": video_csfrtoken}) counts = request_from_bitchute(
video_session,
"POST",
"https://www.bitchute.com/video/%s/counts/" % video["id"],
data={"csrfmiddlewaretoken": video_csfrtoken},
)
if detail == "comments": if detail == "comments":
# if comments are also to be scraped, this is another request to make, which returns # if comments are also to be scraped, this is another request to make, which returns
@@ -308,7 +353,12 @@ def append_details(video, detail):
comment_count = 0 comment_count = 0
url = comment_script.split("'")[1] url = comment_script.split("'")[1]
comment_csrf = comment_script.split("'")[3] comment_csrf = comment_script.split("'")[3]
comments_data = request_from_bitchute(video_session, "POST", url + "/api/get_comments/", data={"cf_auth": comment_csrf, "commentCount": 0}) comments_data = request_from_bitchute(
video_session,
"POST",
url + "/api/get_comments/",
data={"cf_auth": comment_csrf, "commentCount": 0},
)
for comment in comments_data: for comment in comments_data:
comment_count += 1 comment_count += 1
@@ -318,14 +368,17 @@ def append_details(video, detail):
else: else:
thumbnail_image = "" thumbnail_image = ""
comments.append({ comments.append(
{
"id": comment["id"], "id": comment["id"],
"thread_id": video["id"], "thread_id": video["id"],
"subject": "", "subject": "",
"body": comment["content"], "body": comment["content"],
"author": comment["fullname"], "author": comment["fullname"],
"author_id": comment["creator"], "author_id": comment["creator"],
"timestamp": int(dateparser.parse(comment["created"]).timestamp()), "timestamp": int(
dateparser.parse(comment["created"]).timestamp()
),
"url": "", "url": "",
"views": "", "views": "",
"length": "", "length": "",
@@ -336,16 +389,24 @@ def append_details(video, detail):
"dislikes": "", "dislikes": "",
"channel_subscribers": "", "channel_subscribers": "",
"comments": "", "comments": "",
"parent_id": comment.get("parent", "") if "parent" in comment else video["id"], "parent_id": comment.get("parent", "")
}) if "parent" in comment
else video["id"],
}
)
else: else:
# if we don't need the full comments, we still need another request to get the *amount* # if we don't need the full comments, we still need another request to get the *amount*
# of comments, # of comments,
comment_count = request_from_bitchute(video_session, "POST", comment_count = request_from_bitchute(
video_session,
"POST",
"https://commentfreely.bitchute.com/api/get_comment_count/", "https://commentfreely.bitchute.com/api/get_comment_count/",
data={"csrfmiddlewaretoken": video_csfrtoken, data={
"cf_thread": "bc_" + video["id"]})["commentCount"] "csrfmiddlewaretoken": video_csfrtoken,
"cf_thread": "bc_" + video["id"],
},
)["commentCount"]
except RuntimeError as e: except RuntimeError as e:
# we wrap this in one big try-catch because doing it for each request separately is tedious # we wrap this in one big try-catch because doing it for each request separately is tedious
@@ -358,7 +419,10 @@ def append_details(video, detail):
# exact day it was uploaded # exact day it was uploaded
try: try:
published = dateparser.parse( published = dateparser.parse(
soup.find(class_="video-publish-date").text.split("published at")[1].strip()[:-1]) soup.find(class_="video-publish-date")
.text.split("published at")[1]
.strip()[:-1]
)
except AttributeError as e: except AttributeError as e:
# publication date not on page? # publication date not on page?
published = None published = None
@@ -373,7 +437,7 @@ def append_details(video, detail):
"comments": comment_count, "comments": comment_count,
"parent_id": "", "parent_id": "",
"hashtags": ",".join([tag.text for tag in soup.select("#video-hashtags li a")]), "hashtags": ",".join([tag.text for tag in soup.select("#video-hashtags li a")]),
"views": counts["view_count"] "views": counts["view_count"],
} }
if published: if published:
@@ -383,6 +447,7 @@ def append_details(video, detail):
time.sleep(0.25) time.sleep(0.25)
return (video, comments) return (video, comments)
def get_videos_user(session, user, csrftoken, detail): def get_videos_user(session, user, csrftoken, detail):
""" """
Scrape videos for given BitChute user Scrape videos for given BitChute user
@@ -402,23 +467,27 @@ def get_videos_user(session, user, csrftoken, detail):
url = base_url + "extend/" url = base_url + "extend/"
container = session.get(base_url) container = session.get(base_url)
container_soup = BeautifulSoup(container.text, 'html.parser') container_soup = BeautifulSoup(container.text, "html.parser")
headers = {'Referer': base_url, 'Origin': "https://www.bitchute.com/"} headers = {"Referer": base_url, "Origin": "https://www.bitchute.com/"}
while True: while True:
post_data = {
"csrfmiddlewaretoken": csrftoken,
"name": "",
"offset": str(offset),
}
post_data = {"csrfmiddlewaretoken": csrftoken, "name": "", "offset": str(offset)} response = request_from_bitchute(
session, "POST", url, headers=headers, data=post_data
)
response = request_from_bitchute(session, "POST", url, headers=headers, data=post_data) soup = BeautifulSoup(response["html"], "html.parser")
soup = BeautifulSoup(response["html"], 'html.parser')
videos = soup.select(".channel-videos-container") videos = soup.select(".channel-videos-container")
comments = [] comments = []
if len(videos) == 0 or num_items >= max_items: if len(videos) == 0 or num_items >= max_items:
break break
for video_element in videos: for video_element in videos:
if num_items >= max_items: if num_items >= max_items:
break break
@@ -432,16 +501,26 @@ def get_videos_user(session, user, csrftoken, detail):
"id": link["href"].split("/")[-2], "id": link["href"].split("/")[-2],
"thread_id": link["href"].split("/")[-2], "thread_id": link["href"].split("/")[-2],
"subject": link.text, "subject": link.text,
"body": strip_tags(video_element.select_one(".channel-videos-text").text), "body": strip_tags(
video_element.select_one(".channel-videos-text").text
),
"author": container_soup.select_one(".details .name a").text, "author": container_soup.select_one(".details .name a").text,
"author_id": container_soup.select_one(".details .name a")["href"].split("/")[2], "author_id": container_soup.select_one(".details .name a")[
"href"
].split("/")[2],
"timestamp": int( "timestamp": int(
dateparser.parse( dateparser.parse(
video_element.select_one(".channel-videos-details.text-right.hidden-xs").text).timestamp()), video_element.select_one(
".channel-videos-details.text-right.hidden-xs"
).text
).timestamp()
),
"url": "https://www.bitchute.com" + link["href"], "url": "https://www.bitchute.com" + link["href"],
"views": video_element.select_one(".video-views").text.strip(), "views": video_element.select_one(".video-views").text.strip(),
"length": video_element.select_one(".video-duration").text.strip(), "length": video_element.select_one(".video-duration").text.strip(),
"thumbnail_image": video_element.select_one(".channel-videos-image img")["src"], "thumbnail_image": video_element.select_one(
".channel-videos-image img"
)["src"],
} }
if detail != "basic": if detail != "basic":
@@ -456,10 +535,9 @@ def get_videos_user(session, user, csrftoken, detail):
# before the video, which is weird # before the video, which is weird
yield comment yield comment
def decode_cfemail(cfemail):
"""https://stackoverflow.com/questions/36911296/scraping-of-protected-email def decode_cfemail(cfemail):
""" """https://stackoverflow.com/questions/36911296/scraping-of-protected-email"""
email = "" email = ""
k = int(cfemail[:2], 16) k = int(cfemail[:2], 16)

View File

@@ -9,8 +9,10 @@ from gogettr import PublicClient
from cisticola.base import Channel, ScraperResult, RawChannelInfo from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper from cisticola.scraper.base import Scraper
class GettrScraper(Scraper): class GettrScraper(Scraper):
"""An implementation of a Scraper for Gettr, using gogettr library""" """An implementation of a Scraper for Gettr, using gogettr library"""
__version__ = "GettrScraper 0.0.1" __version__ = "GettrScraper 0.0.1"
def get_username_from_url(self, url): def get_username_from_url(self, url):
@@ -21,48 +23,57 @@ class GettrScraper(Scraper):
return username return username
@logger.catch
def get_posts(
    self, channel: Channel, since: ScraperResult = None
) -> Generator[ScraperResult, None, None]:
    """Yield a ScraperResult for each post of a Gettr user.

    Posts are read from ``PublicClient().user_activity(...)`` in the order
    that iterator produces them. Iteration stops at the first post whose
    date is not later than ``since.date`` (when ``since`` is given).

    Args:
        channel: Channel whose ``url`` identifies the Gettr user and whose
            ``id`` is recorded on every result.
        since: Optional previously archived result used as the cutoff.

    Yields:
        One ScraperResult per post, with ``archived_urls`` mapping every
        media URL found in the post to ``None`` and ``media_archived`` unset.
    """
    media_base = "https://media.gettr.com/"

    client = PublicClient()
    username = self.get_username_from_url(channel.url).lower()

    for post in client.user_activity(username=username, type="posts"):
        # Convert the timestamp exactly once so the cutoff check and the
        # stored date can never disagree. ``cdate`` is divided by 1000
        # before conversion -- presumably epoch milliseconds; TODO confirm.
        # NOTE(review): this yields a naive local-time datetime, unlike the
        # UTC-normalised comparison in the other scrapers -- confirm intent.
        post_date = datetime.fromtimestamp(post["cdate"] / 1000.0)

        if since is not None and post_date <= since.date:
            break

        archived_urls = {}

        if "imgs" in post:
            for img in post["imgs"]:
                archived_urls[media_base + img] = None

        # "main" and "ovid" each hold a single media path when present.
        for field in ("main", "ovid"):
            if field in post:
                archived_urls[media_base + post[field]] = None

        yield ScraperResult(
            scraper=self.__version__,
            platform="Gettr",
            channel=channel.id,
            platform_id=post["_id"],
            date=post_date,
            date_archived=datetime.now(timezone.utc),
            raw_data=json.dumps(post),
            archived_urls=archived_urls,
            media_archived=None,
        )
def can_handle(self, channel):
    """Report whether this scraper can process ``channel``.

    Returns:
        ``True`` when the channel's platform is ``"Gettr"`` and
        ``get_username_from_url`` yields a non-``None`` username for its URL,
        otherwise ``False`` (an explicit boolean rather than an implicit
        ``None`` on the negative path).
    """
    return (
        channel.platform == "Gettr"
        and self.get_username_from_url(channel.url) is not None
    )
def url_to_key(self, url: str, content_type: str) -> str:
    """Build the archive key for a media URL.

    The key is the second-to-last segment of the URL path followed by an
    extension taken from the subtype of ``content_type``.

    Args:
        url: Media URL whose path supplies the key stem.
        content_type: Media type such as ``"image/jpeg"``; any parameters
            after ``;`` (e.g. ``"; charset=utf-8"``) are ignored.

    Returns:
        The key, e.g. ``"abc123.jpeg"``.
    """
    # Strip media-type parameters so they cannot leak into the extension.
    subtype = content_type.split(";")[0].strip().split("/")[-1]
    stem = urlparse(url).path.split("/")[-2]
    return stem + "." + subtype
@logger.catch @logger.catch
@@ -71,8 +82,10 @@ class GettrScraper(Scraper):
username = self.get_username_from_url(channel.url) username = self.get_username_from_url(channel.url)
profile = client.user_info(username) profile = client.user_info(username)
return RawChannelInfo(scraper=self.__version__, return RawChannelInfo(
scraper=self.__version__,
platform=channel.platform, platform=channel.platform,
channel=channel.id, channel=channel.id,
raw_data=json.dumps(profile), raw_data=json.dumps(profile),
date_archived=datetime.now(timezone.utc)) date_archived=datetime.now(timezone.utc),
)

View File

@@ -10,25 +10,32 @@ import os
from cisticola.base import Channel, ScraperResult, RawChannelInfo from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper import Scraper, make_request from cisticola.scraper import Scraper, make_request
BASE_URL = 'https://rumble.com' BASE_URL = "https://rumble.com"
class RumbleScraper(Scraper): class RumbleScraper(Scraper):
"""An implementation of a Scraper for Rumble, using custom functions""" """An implementation of a Scraper for Rumble, using custom functions"""
__version__ = "RumbleScraper 0.0.2" __version__ = "RumbleScraper 0.0.2"
cookiestring = os.environ["YOUTUBE_COOKIESTRING"].replace(r'\n', '\n').replace(r'\t', '\t') cookiestring = (
cookiefilename = 'cookiefile.txt' os.environ["YOUTUBE_COOKIESTRING"].replace(r"\n", "\n").replace(r"\t", "\t")
)
cookiefilename = "cookiefile.txt"
@logger.catch @logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]: def get_posts(
self, channel: Channel, since: ScraperResult = None
) -> Generator[ScraperResult, None, None]:
scraper = get_channel_videos(channel.url) scraper = get_channel_videos(channel.url)
for post in scraper: for post in scraper:
if since is not None and post['datetime'].replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): if since is not None and post["datetime"].replace(
tzinfo=timezone.utc
) <= since.date.replace(tzinfo=timezone.utc):
break break
url = post['media_url'] url = post["media_url"]
archived_urls = {url: None} archived_urls = {url: None}
@@ -36,16 +43,17 @@ class RumbleScraper(Scraper):
scraper=self.__version__, scraper=self.__version__,
platform="Rumble", platform="Rumble",
channel=channel.id, channel=channel.id,
platform_id=post['media_url'].split('/')[-2], platform_id=post["media_url"].split("/")[-2],
date=post['datetime'].replace(tzinfo=timezone.utc), date=post["datetime"].replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc), date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post, default=str), raw_data=json.dumps(post, default=str),
archived_urls=archived_urls, archived_urls=archived_urls,
media_archived=None) media_archived=None,
)
def url_to_key(self, url: str, content_type: str) -> str:
    """Build the archive key for a media URL.

    The key is the second-to-last segment of the URL path followed by an
    extension taken from the subtype of ``content_type``.

    Args:
        url: Media URL whose path supplies the key stem.
        content_type: Media type such as ``"video/mp4"``; any parameters
            after ``;`` (e.g. ``"; codecs=..."``) are ignored.

    Returns:
        The key, e.g. ``"v1abcd.mp4"``.
    """
    # Strip media-type parameters so they cannot leak into the extension.
    subtype = content_type.split(";")[0].strip().split("/")[-1]
    stem = urlparse(url).path.split("/")[-2]
    return stem + "." + subtype
@logger.catch @logger.catch
@@ -65,74 +73,77 @@ class RumbleScraper(Scraper):
@logger.catch
def get_profile(self, channel: Channel) -> RawChannelInfo:
    """Fetch the Rumble channel page and wrap its profile as RawChannelInfo.

    The profile dictionary returned by ``get_channel_profile`` for
    ``channel.url`` is serialised to JSON and stored as ``raw_data``.
    """
    serialized_profile = json.dumps(get_channel_profile(url=channel.url))
    archived_at = datetime.now(timezone.utc)

    return RawChannelInfo(
        scraper=self.__version__,
        platform=channel.platform,
        channel=channel.id,
        raw_data=serialized_profile,
        date_archived=archived_at,
    )
def get_media_url(url):
    """Return the ``embedUrl`` of the first JSON-LD entry on the page at ``url``.

    The page is requested with ``make_request`` and parsed with the
    ``html.parser`` backend; the first ``application/ld+json`` script tag is
    decoded as JSON and indexed as a list.
    """
    response = make_request(url=url)
    page = BeautifulSoup(response.content, features="html.parser")

    ld_json_tag = page.find("script", {"type": "application/ld+json"})
    entries = json.loads(ld_json_tag.text)

    return entries[0]["embedUrl"]
def process_video(video):
    """Collect metadata for one entry of a Rumble channel listing.

    Values available in the listing element (title, thumbnail, counters,
    duration, timestamp, author) are read from ``video``; the description
    and the embed URL are read from the video's own page, which is fetched
    once.

    Args:
        video: Parsed listing element for a single video.

    Returns:
        A dict with the keys ``title``, ``thumbnail``, ``link``, ``views``,
        ``rumbles``, ``content``, ``duration``, ``datetime``, ``author_id``,
        ``author_name`` and ``media_url``.
    """
    # Optional listing fields: fall back to a default when the tag is absent.
    rumble_span = video.find("span", {"class": "video-item--rumbles"})
    rumbles = "0" if rumble_span is None else rumble_span["data-value"]

    view_span = video.find("span", {"class": "video-item--views"})
    views = None if view_span is None else view_span.get("data-value")

    author_a = video.find("a", {"rel": "author"})
    if author_a is None:
        author_id = None
        author_name = None
    else:
        author_id = author_a["href"].split("/")[-1]
        author_name = author_a.text

    video_link = BASE_URL + video.find("a", href=True)["href"]

    r = make_request(url=video_link)
    soup = BeautifulSoup(r.content, features="html.parser")
    content_div = soup.find("div", {"class": "container content media-description"})

    info = {
        "title": video.find("h3").text,
        "thumbnail": video.find("img")["src"],
        "link": video_link,
        "views": views,
        "rumbles": rumbles,
        "content": "" if content_div is None else content_div.get_text("\n"),
        "duration": video.find("span", {"class": "video-item--duration"})["data-value"],
        "datetime": datetime.fromisoformat(video.find("time")["datetime"]),
        "author_id": author_id,
        "author_name": author_name,
    }

    # The embed URL lives in the JSON-LD block of the page that was just
    # downloaded, so read it from the existing soup instead of requesting
    # and parsing the same URL a second time.
    ld_json_tag = soup.find("script", {"type": "application/ld+json"})
    info["media_url"] = json.loads(ld_json_tag.text)[0]["embedUrl"]

    return info
def get_channel_videos(url): def get_channel_videos(url):
page = 1 page = 1
channel_url = f'{url}?page=' channel_url = f"{url}?page="
while True: while True:
url = channel_url + str(page) url = channel_url + str(page)
@@ -141,37 +152,38 @@ def get_channel_videos(url):
if r.status_code == 404: if r.status_code == 404:
break break
soup = BeautifulSoup(r.content, features = 'html.parser') soup = BeautifulSoup(r.content, features="html.parser")
video_list = soup.find_all('li', {'class' : 'video-listing-entry'}) video_list = soup.find_all("li", {"class": "video-listing-entry"})
for video in video_list: for video in video_list:
yield process_video(video) yield process_video(video)
page += 1 page += 1
def get_channel_profile(url):
    """Scrape the profile header of a Rumble channel page.

    Args:
        url: Address of the channel page.

    Returns:
        A dict with the keys ``name``, ``id``, ``verified``, ``thumbnail``,
        ``cover`` and ``subscribers``. ``id``, ``thumbnail`` and ``cover``
        are ``None`` when the corresponding tag is not on the page.
    """
    response = make_request(url=f"{url}")
    soup = BeautifulSoup(response.content, features="lxml")

    heading = soup.find("h1")
    # The verified badge is an <svg> nested inside the heading.
    is_verified = heading.find("svg", {"class": "listing-header--verified"}) is not None

    thumb_img = soup.find("img", {"class": "listing-header--thumb"})
    cover_img = soup.find("img", {"class": "listing-header--backsplash-img"})

    author_link = soup.find("a", {"rel": "author"})
    author_id = None if author_link is None else author_link["href"].split("/")[-1]

    subscriber_span = soup.find("span", {"class": "subscribe-button-count"})

    return {
        "name": heading.text,
        "id": author_id,
        "verified": is_verified,
        "thumbnail": thumb_img.get("src") if thumb_img else None,
        "cover": cover_img.get("src") if cover_img else None,
        "subscribers": subscriber_span.text,
    }

View File

@@ -14,19 +14,21 @@ from telethon.tl import types
from cisticola.base import Channel, ScraperResult, RawChannelInfo from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper from cisticola.scraper.base import Scraper
MEDIA_TYPES = ['photo', 'video', 'document', 'webpage'] MEDIA_TYPES = ["photo", "video", "document", "webpage"]
class TelegramTelethonScraper(Scraper): class TelegramTelethonScraper(Scraper):
"""An implementation of a Scraper for Telegram, using Telethon library""" """An implementation of a Scraper for Telegram, using Telethon library"""
__version__ = "TelegramTelethonScraper 0.0.4" __version__ = "TelegramTelethonScraper 0.0.4"
client = None client = None
def __init__(self, telethon_session_name=None): def __init__(self, telethon_session_name=None):
super().__init__() super().__init__()
api_id = os.environ['TELEGRAM_API_ID'] api_id = os.environ["TELEGRAM_API_ID"]
api_hash = os.environ['TELEGRAM_API_HASH'] api_hash = os.environ["TELEGRAM_API_HASH"]
phone = os.environ['TELEGRAM_PHONE'] phone = os.environ["TELEGRAM_PHONE"]
if telethon_session_name is None: if telethon_session_name is None:
telethon_session_name = phone telethon_session_name = phone
@@ -40,9 +42,9 @@ class TelegramTelethonScraper(Scraper):
self.client.disconnect() self.client.disconnect()
def get_username_from_url(url):
    """Extract the channel identifier from a ``https://t.me/`` URL.

    Everything after ``https://t.me/`` is returned, with a single leading
    ``s/`` (the prefix used by ``https://t.me/s/<name>`` URLs) removed.

    Args:
        url: Telegram URL containing ``https://t.me/``.

    Returns:
        The remainder of the URL after the host and optional ``s/`` prefix.

    Raises:
        IndexError: if ``url`` does not contain ``https://t.me/``.
    """
    prefix = "s/"
    username = url.split("https://t.me/")[1]
    if username.startswith(prefix):
        # Strip only the leading prefix; splitting on "s/" would also cut at
        # any later occurrence (e.g. "s/worldnews/42" -> "worldnew").
        username = username[len(prefix):]
    return username
def get_channel_identifier(channel: Channel): def get_channel_identifier(channel: Channel):
@@ -63,14 +65,18 @@ class TelegramTelethonScraper(Scraper):
return result return result
if len(list(result.archived_urls.keys())) != 1: if len(list(result.archived_urls.keys())) != 1:
logger.warning(f"Expected 1 key in archived_urls, found {result.archived_keys}") logger.warning(
f"Expected 1 key in archived_urls, found {result.archived_keys}"
)
else: else:
key = list(result.archived_urls.keys())[0] key = list(result.archived_urls.keys())[0]
if result.archived_urls[key] is None: if result.archived_urls[key] is None:
raw = json.loads(result.raw_data) raw = json.loads(result.raw_data)
message = self.client.get_messages(raw['peer_id']['channel_id'], ids=[raw['id']]) message = self.client.get_messages(
raw["peer_id"]["channel_id"], ids=[raw["id"]]
)
blob = None blob = None
output_file_with_ext = None output_file_with_ext = None
@@ -81,12 +87,16 @@ class TelegramTelethonScraper(Scraper):
if blob is not None: if blob is not None:
# TODO specify Content-Type # TODO specify Content-Type
archived_url = self.archive_blob(blob = blob, content_type = '', key = output_file_with_ext) archived_url = self.archive_blob(
blob=blob, content_type="", key=output_file_with_ext
)
result.archived_urls[key] = archived_url result.archived_urls[key] = archived_url
result.media_archived = datetime.now(timezone.utc) result.media_archived = datetime.now(timezone.utc)
else: else:
if output_file_with_ext == 'largefile': if output_file_with_ext == "largefile":
logger.info("Because this was a large file, not clearing media data") logger.info(
"Because this was a large file, not clearing media data"
)
return result return result
logger.warning("Downloaded blob was None") logger.warning("Downloaded blob was None")
@@ -102,14 +112,18 @@ class TelegramTelethonScraper(Scraper):
if type(post.media) == types.MessageMediaDocument: if type(post.media) == types.MessageMediaDocument:
if post.media.document.size / (1024 * 1024) > 50: if post.media.document.size / (1024 * 1024) > 50:
logger.info(f"Skipping archive of large {type(post.media)} with size {post.media.document.size/(1024*1024)} MB") logger.info(
f"Skipping archive of large {type(post.media)} with size {post.media.document.size/(1024*1024)} MB"
)
return (None, "largefile") return (None, "largefile")
logger.debug(f"Archiving {type(post.media)} with size {post.media.document.size/(1024*1024)} MB") logger.debug(
f"Archiving {type(post.media)} with size {post.media.document.size/(1024*1024)} MB"
)
else: else:
logger.debug(f"Archiving {type(post.media)}") logger.debug(f"Archiving {type(post.media)}")
key = f'{post.peer_id.channel_id}_{post.id}' key = f"{post.peer_id.channel_id}_{post.id}"
with tempfile.TemporaryDirectory() as temp_dir: with tempfile.TemporaryDirectory() as temp_dir:
output_file = Path(temp_dir, key) output_file = Path(temp_dir, key)
@@ -123,7 +137,7 @@ class TelegramTelethonScraper(Scraper):
output_file_with_ext = os.listdir(temp_dir)[0] output_file_with_ext = os.listdir(temp_dir)[0]
filename = Path(temp_dir, output_file_with_ext) filename = Path(temp_dir, output_file_with_ext)
with open(filename, 'rb') as f: with open(filename, "rb") as f:
blob = f.read() blob = f.read()
return (blob, output_file_with_ext) return (blob, output_file_with_ext)
@@ -132,22 +146,35 @@ class TelegramTelethonScraper(Scraper):
return True return True
# @logger.catch # @logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, until: ScraperResult = None) -> Generator[ScraperResult, None, None]: def get_posts(
self, channel: Channel, since: ScraperResult = None, until: ScraperResult = None
) -> Generator[ScraperResult, None, None]:
username = TelegramTelethonScraper.get_channel_identifier(channel) username = TelegramTelethonScraper.get_channel_identifier(channel)
if until is not None: if until is not None:
logger.info(f"Only getting old posts, up to ID {until.platform_id.split('/')[-1]}") logger.info(
iterator = self.client.iter_messages(username, max_id=int(until.platform_id.split('/')[-1]), wait_time=0, limit=None) f"Only getting old posts, up to ID {until.platform_id.split('/')[-1]}"
)
iterator = self.client.iter_messages(
username,
max_id=int(until.platform_id.split("/")[-1]),
wait_time=0,
limit=None,
)
else: else:
iterator = self.client.iter_messages(username) iterator = self.client.iter_messages(username)
post = None post = None
for post in iterator: for post in iterator:
post_url = f'{channel.url}/{post.id}' post_url = f"{channel.url}/{post.id}"
logger.trace(f"Archiving post {post_url} from {post.date}") logger.trace(f"Archiving post {post_url} from {post.date}")
if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc): if since is not None and post.date.replace(
logger.info(f'Timestamp of post {post} is earlier than the previous archived timestamp {post.date.replace(tzinfo=timezone.utc)}') tzinfo=timezone.utc
) <= since.date.replace(tzinfo=timezone.utc):
logger.info(
f"Timestamp of post {post} is earlier than the previous archived timestamp {post.date.replace(tzinfo=timezone.utc)}"
)
break break
archived_urls = {} archived_urls = {}
@@ -166,10 +193,18 @@ class TelegramTelethonScraper(Scraper):
date_archived=datetime.now(timezone.utc), date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post.to_dict(), default=str), raw_data=json.dumps(post.to_dict(), default=str),
archived_urls=archived_urls, archived_urls=archived_urls,
media_archived=media_archived) media_archived=media_archived,
)
if (post is not None and post.id > 1 and since is None) or (post is not None and since is not None and post.date.replace(tzinfo=timezone.utc) > since.date.replace(tzinfo=timezone.utc)): if (post is not None and post.id > 1 and since is None) or (
logger.info(f"Last post ID is {post.id} / {post.date}, since is {since.date if since is not None else None}, until is {until.platform_id if until is not None else None}, starting again") post is not None
and since is not None
and post.date.replace(tzinfo=timezone.utc)
> since.date.replace(tzinfo=timezone.utc)
):
logger.info(
f"Last post ID is {post.id} / {post.date}, since is {since.date if since is not None else None}, until is {until.platform_id if until is not None else None}, starting again"
)
new_until = ScraperResult( new_until = ScraperResult(
scraper=self.__version__, scraper=self.__version__,
platform="Telegram", platform="Telegram",
@@ -179,19 +214,21 @@ class TelegramTelethonScraper(Scraper):
date_archived=datetime.now(timezone.utc), date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post.to_dict(), default=str), raw_data=json.dumps(post.to_dict(), default=str),
archived_urls=archived_urls, archived_urls=archived_urls,
media_archived=media_archived) media_archived=media_archived,
)
for p in self.get_posts(channel, since=since, until=new_until): for p in self.get_posts(channel, since=since, until=new_until):
yield p yield p
@logger.catch
def get_profile(self, channel: Channel) -> RawChannelInfo:
    """Request the full channel record from Telegram and wrap it as RawChannelInfo.

    The response of ``GetFullChannelRequest`` is converted with ``to_dict()``
    and serialised to JSON; values JSON cannot encode are passed through
    ``str``.
    """
    identifier = TelegramTelethonScraper.get_channel_identifier(channel)
    request = GetFullChannelRequest(channel=identifier)
    profile = self.client(request).to_dict()

    return RawChannelInfo(
        scraper=self.__version__,
        platform=channel.platform,
        channel=channel.id,
        raw_data=json.dumps(profile, default=str),
        date_archived=datetime.now(timezone.utc),
    )

View File

@@ -7,7 +7,18 @@ from sqlalchemy.sql.expression import func
from collections import defaultdict from collections import defaultdict
from datetime import datetime, timezone from datetime import datetime, timezone
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Media, Channel, mapper_registry, Image, Video, Audio from cisticola.base import (
RawChannelInfo,
ChannelInfo,
ScraperResult,
Post,
Media,
Channel,
mapper_registry,
Image,
Video,
Audio,
)
class Transformer: class Transformer:
@@ -35,7 +46,9 @@ class Transformer:
pass pass
def transform(data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]: def transform(
data: ScraperResult, insert: Callable
) -> Generator[Union[Post, Channel, Media], None, None]:
"""Transform a ScraperResult into objects with additional parameters for analysis. This function can """Transform a ScraperResult into objects with additional parameters for analysis. This function can
yield multiple objects, as it will find references to quoted/replied posts, media objects, and Channel yield multiple objects, as it will find references to quoted/replied posts, media objects, and Channel
objects and provide all of these to be inserted into the database. objects and provide all of these to be inserted into the database.
@@ -67,16 +80,27 @@ class Transformer:
for k in data.archived_urls: for k in data.archived_urls:
if data.archived_urls[k]: if data.archived_urls[k]:
archived_url = data.archived_urls[k] archived_url = data.archived_urls[k]
filename = archived_url.split('/')[-1] filename = archived_url.split("/")[-1]
ext = None if '.' not in filename else filename.split('.')[-1].lower() ext = None if "." not in filename else filename.split(".")[-1].lower()
media_kwargs = dict(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k, date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), transformer=self.__version__, scraper=data.scraper, platform=data.platform) media_kwargs = dict(
url=archived_url,
post=transformed.id,
raw_id=data.id,
original_url=k,
date=data.date,
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc),
transformer=self.__version__,
scraper=data.scraper,
platform=data.platform,
)
if ext in ('mp4', 'mov', 'avi', 'mkv'): if ext in ("mp4", "mov", "avi", "mkv"):
media_class = Video media_class = Video
elif ext in ('oga', 'mp3', "wav", 'aif', 'aiff', 'aac'): elif ext in ("oga", "mp3", "wav", "aif", "aiff", "aac"):
media_class = Audio media_class = Audio
elif ext in ('jpg', 'jpeg', 'png', 'gif', 'bmp', 'heic', 'tiff'): elif ext in ("jpg", "jpeg", "png", "gif", "bmp", "heic", "tiff"):
media_class = Image media_class = Image
else: else:
logger.warning(f"Unknown file extension {ext}") logger.warning(f"Unknown file extension {ext}")
@@ -202,11 +226,31 @@ class ETLController:
# This is using some adhoc unique constraints that might be worth formalizing at some point # This is using some adhoc unique constraints that might be worth formalizing at some point
if type(obj) == Channel: if type(obj) == Channel:
instance = session.query(Channel).filter( instance = (
(((Channel.url==obj.url)&(Channel.url!='')&(Channel.url is not None)&(Channel.url!='https://t.me/s/'))| session.query(Channel)
((Channel.platform_id==str(obj.platform_id))&(Channel.platform_id!='')&(Channel.platform_id is not None))| .filter(
((Channel.screenname==obj.screenname)&(Channel.screenname!='')&(Channel.screenname is not None)))& (
(Channel.platform==obj.platform)).first() (
(Channel.url == obj.url)
& (Channel.url != "")
& (Channel.url is not None)
& (Channel.url != "https://t.me/s/")
)
| (
(Channel.platform_id == str(obj.platform_id))
& (Channel.platform_id != "")
& (Channel.platform_id is not None)
)
| (
(Channel.screenname == obj.screenname)
& (Channel.screenname != "")
& (Channel.screenname is not None)
)
)
& (Channel.platform == obj.platform)
)
.first()
)
elif type(obj) == Post: elif type(obj) == Post:
return self.insert_post(obj, session, hydrate) return self.insert_post(obj, session, hydrate)
@@ -240,7 +284,12 @@ class ETLController:
logger.info(f"Found matching DB entry for {obj}: {instance}") logger.info(f"Found matching DB entry for {obj}: {instance}")
if type(obj) == Channel: if type(obj) == Channel:
if obj.source != instance.source and obj.source == 'linked_channel' and instance.source != 'researcher' and (instance.source is None or instance.source[:4] != 'snow'): if (
obj.source != instance.source
and obj.source == "linked_channel"
and instance.source != "researcher"
and (instance.source is None or instance.source[:4] != "snow")
):
logger.info(f"Updating source to linked channel") logger.info(f"Updating source to linked channel")
instance.source = obj.source instance.source = obj.source
instance.notes = obj.notes instance.notes = obj.notes
@@ -251,7 +300,7 @@ class ETLController:
session.flush() session.flush()
session.commit() session.commit()
if (instance.platform_id is None or instance.platform_id == ''): if instance.platform_id is None or instance.platform_id == "":
instance.platform_id = obj.platform_id instance.platform_id = obj.platform_id
session.flush() session.flush()
session.commit() session.commit()
@@ -293,22 +342,35 @@ class ETLController:
handled = False handled = False
if transformer.can_handle(result): if transformer.can_handle(result):
logger.trace(f"{transformer} is handling result {result.id} ({result.date})") logger.trace(
f"{transformer} is handling result {result.id} ({result.date})"
)
handled = True handled = True
transformer.transform(result, lambda obj: self.insert_or_select(obj, session, hydrate), session, lambda obj: self.insert_post(obj, session, hydrate, flush=False), lambda: self.flush_posts(session)) transformer.transform(
result,
lambda obj: self.insert_or_select(obj, session, hydrate),
session,
lambda obj: self.insert_post(
obj, session, hydrate, flush=False
),
lambda: self.flush_posts(session),
)
break break
if handled == False: if handled == False:
logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})") logger.warning(
f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})"
)
self.flush_posts(session) self.flush_posts(session)
session.commit() session.commit()
@logger.catch(reraise=True) @logger.catch(reraise=True)
def transform_all_untransformed(self, hydrate: bool = True, min_date=datetime(2010, 1, 1)): def transform_all_untransformed(
self, hydrate: bool = True, min_date=datetime(2010, 1, 1)
):
"""Transform all ScraperResult objects in the database that do not have an """Transform all ScraperResult objects in the database that do not have an
equivalent Post object stored. equivalent Post object stored.
@@ -331,7 +393,8 @@ class ETLController:
logger.info(f"Fetching first untransformed post batch of {BATCH_SIZE}") logger.info(f"Fetching first untransformed post batch of {BATCH_SIZE}")
batch = (session.query(ScraperResult) batch = (
session.query(ScraperResult)
.join(Post, isouter=True) .join(Post, isouter=True)
.where(ScraperResult.date > min_date) .where(ScraperResult.date > min_date)
.where(Post.raw_id == None) .where(Post.raw_id == None)
@@ -344,9 +407,12 @@ class ETLController:
self.transform_results(batch, hydrate=hydrate) self.transform_results(batch, hydrate=hydrate)
logger.info(f"Fetching untransformed posts batch of {BATCH_SIZE}, offset {max(batch, key=lambda v: v.date).date}") logger.info(
f"Fetching untransformed posts batch of {BATCH_SIZE}, offset {max(batch, key=lambda v: v.date).date}"
)
batch = (session.query(ScraperResult) batch = (
session.query(ScraperResult)
.join(Post, isouter=True) .join(Post, isouter=True)
.where(ScraperResult.date > min_date) .where(ScraperResult.date > min_date)
.where(Post.raw_id == None) .where(Post.raw_id == None)
@@ -356,7 +422,6 @@ class ETLController:
.limit(BATCH_SIZE) .limit(BATCH_SIZE)
).all() ).all()
@logger.catch(reraise=True) @logger.catch(reraise=True)
def transform_info(self, results: List[ChannelInfo]): def transform_info(self, results: List[ChannelInfo]):
"""Transform raw RawChannelInfo objects into ChannelInfo objects. """Transform raw RawChannelInfo objects into ChannelInfo objects.
@@ -380,17 +445,25 @@ class ETLController:
for transformer in self.transformers: for transformer in self.transformers:
if transformer.can_handle(result): if transformer.can_handle(result):
logger.trace(f"{transformer} is handling raw info result {result.id} ({result.date_archived})") logger.trace(
f"{transformer} is handling raw info result {result.id} ({result.date_archived})"
)
handled = True handled = True
transformer.transform_info(result, lambda obj: self.insert_or_select(obj, session, False), session, channel=data.Channel) transformer.transform_info(
result,
lambda obj: self.insert_or_select(obj, session, False),
session,
channel=data.Channel,
)
session.commit() session.commit()
break break
if handled == False: if handled == False:
logger.warning(f"No Transformer could handle raw channel info ID {result.id} with platform {result.platform} ({result.date_archived})") logger.warning(
f"No Transformer could handle raw channel info ID {result.id} with platform {result.platform} ({result.date_archived})"
)
@logger.catch(reraise=True) @logger.catch(reraise=True)
def transform_all_untransformed_info(self): def transform_all_untransformed_info(self):
@@ -407,7 +480,8 @@ class ETLController:
offset = 0 offset = 0
batch = [] batch = []
query = (session.query(RawChannelInfo, Channel) query = (
session.query(RawChannelInfo, Channel)
.select_from(RawChannelInfo) .select_from(RawChannelInfo)
.join(ChannelInfo, isouter=True) .join(ChannelInfo, isouter=True)
.join(Channel, RawChannelInfo.channel == Channel.id) .join(Channel, RawChannelInfo.channel == Channel.id)
@@ -416,12 +490,16 @@ class ETLController:
) )
while len(batch) > 0 or offset == 0: while len(batch) > 0 or offset == 0:
logger.info(f"Fetching untransformed info batch of {BATCH_SIZE}, offset {offset}") logger.info(
f"Fetching untransformed info batch of {BATCH_SIZE}, offset {offset}"
)
batch = query.slice(offset, offset + BATCH_SIZE).all() batch = query.slice(offset, offset + BATCH_SIZE).all()
offset += BATCH_SIZE offset += BATCH_SIZE
logger.info(f"Found {len(batch)} info items to ETL ({offset} already processed)") logger.info(
f"Found {len(batch)} info items to ETL ({offset} already processed)"
)
self.transform_info(batch) self.transform_info(batch)
@@ -450,16 +528,24 @@ class ETLController:
handled = False handled = False
if transformer.can_handle(result): if transformer.can_handle(result):
logger.trace(f"{transformer} is handling result {result.id} ({result.date})") logger.trace(
f"{transformer} is handling result {result.id} ({result.date})"
)
handled = True handled = True
transformer.transform_media(result, total_result.Post, lambda obj: self.insert_or_select(obj, session, hydrate)) transformer.transform_media(
result,
total_result.Post,
lambda obj: self.insert_or_select(obj, session, hydrate),
)
session.commit() session.commit()
break break
if handled == False: if handled == False:
logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})") logger.warning(
f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})"
)
@logger.catch(reraise=True) @logger.catch(reraise=True)
def transform_all_untransformed_media(self, hydrate=True): def transform_all_untransformed_media(self, hydrate=True):
@@ -482,10 +568,15 @@ class ETLController:
logger.info(f"Fetching first untransformed post media batch of {BATCH_SIZE}") logger.info(f"Fetching first untransformed post media batch of {BATCH_SIZE}")
batch = (session.query(ScraperResult, Post) batch = (
session.query(ScraperResult, Post)
.join(Post) .join(Post)
.join(Media, isouter=True) .join(Media, isouter=True)
.filter((ScraperResult.media_archived != None) & (cast(ScraperResult.archived_urls, String) != '{}') & (Media.id == None)) .filter(
(ScraperResult.media_archived != None)
& (cast(ScraperResult.archived_urls, String) != "{}")
& (Media.id == None)
)
.order_by(ScraperResult.date.desc()) .order_by(ScraperResult.date.desc())
.limit(BATCH_SIZE) .limit(BATCH_SIZE)
).all() ).all()
@@ -495,13 +586,23 @@ class ETLController:
self.transform_media(batch, hydrate=hydrate) self.transform_media(batch, hydrate=hydrate)
logger.info(f"Fetching untransformed post media batch of {BATCH_SIZE}, offset {min(batch, key=lambda v: v.ScraperResult.date).ScraperResult.date}") logger.info(
f"Fetching untransformed post media batch of {BATCH_SIZE}, offset {min(batch, key=lambda v: v.ScraperResult.date).ScraperResult.date}"
)
batch = (session.query(ScraperResult, Post) batch = (
session.query(ScraperResult, Post)
.join(Post) .join(Post)
.join(Media, isouter=True) .join(Media, isouter=True)
.where(ScraperResult.date <= min(batch, key=lambda v: v.ScraperResult.date).ScraperResult.date) .where(
.filter((ScraperResult.media_archived != None) & (cast(ScraperResult.archived_urls, String) != '{}') & (Media.id == None)) ScraperResult.date
<= min(batch, key=lambda v: v.ScraperResult.date).ScraperResult.date
)
.filter(
(ScraperResult.media_archived != None)
& (cast(ScraperResult.archived_urls, String) != "{}")
& (Media.id == None)
)
.order_by(ScraperResult.date.desc()) .order_by(ScraperResult.date.desc())
.limit(BATCH_SIZE) .limit(BATCH_SIZE)
).all() ).all()

View File

@@ -7,7 +7,17 @@ from dateutil.relativedelta import relativedelta
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from cisticola.transformer.base import Transformer from cisticola.transformer.base import Transformer
from cisticola.base import RawChannelInfo, ScraperResult, Post, Image, Video, Media, Channel, ChannelInfo from cisticola.base import (
RawChannelInfo,
ScraperResult,
Post,
Image,
Video,
Media,
Channel,
ChannelInfo,
)
class BitchuteTransformer(Transformer): class BitchuteTransformer(Transformer):
"""A Bitchute specific ScraperResult, with a method ETL/transforming""" """A Bitchute specific ScraperResult, with a method ETL/transforming"""
@@ -15,61 +25,86 @@ class BitchuteTransformer(Transformer):
__version__ = "BitchuteTransformer 0.0.2" __version__ = "BitchuteTransformer 0.0.2"
def can_handle(self, data: ScraperResult) -> bool:
    """Report whether ``data`` was produced by the Bitchute scraper.

    ``data.scraper`` is split on single spaces and only the first token is
    compared, so any suffix after the scraper name is ignored.

    Returns:
        True when the first token is exactly ``"BitchuteScraper"``,
        otherwise False.
    """
    return data.scraper.split(" ")[0] == "BitchuteScraper"
def transform_media(
    self, data: ScraperResult, transformed: Post, insert: Callable
) -> Generator[Media, None, None]:
    """Create the ``Video`` record for an archived Bitchute video.

    The original video URL is read from ``raw_data["video_url"]`` and its
    archived location is looked up in ``data.archived_urls``; a missing key
    in either mapping raises ``KeyError``.

    Args:
        data: Scraper result whose ``raw_data`` is a JSON document.
        transformed: The already-stored post; only its ``id`` is used.
        insert: Callback that receives the new ``Video`` object.

    Note:
        Despite the ``Generator`` annotation, nothing is yielded; the
        ``Video`` is handed to ``insert`` and the method returns ``None``.
    """
    raw = json.loads(data.raw_data)

    original_url = raw["video_url"]
    archived_url = data.archived_urls[original_url]

    video = Video(
        url=archived_url,
        post=transformed.id,
        raw_id=data.id,
        original_url=original_url,
        date=data.date,
        date_archived=data.date_archived,
        date_transformed=datetime.now(timezone.utc),
        transformer=self.__version__,
        scraper=data.scraper,
        platform=data.platform,
    )

    insert(video)
def transform_info(
    self, data: RawChannelInfo, insert: Callable, session, channel=None
) -> Generator[Union[Post, Channel, Media], None, None]:
    """Build a ``ChannelInfo`` from raw Bitchute channel data and insert it.

    Args:
        data: Raw channel info whose ``raw_data`` is a JSON document.
        insert: Callback that receives the new ``ChannelInfo`` object.
        session: Not used by this implementation.
        channel: Not used by this implementation.

    Note:
        Despite the ``Generator`` annotation, nothing is yielded; the object
        is handed to ``insert`` and the method returns ``None``.
    """
    raw = json.loads(data.raw_data)

    transformed = ChannelInfo(
        raw_channel_info_id=data.id,
        channel=data.channel,
        # last path segment of the owner URL
        platform_id=raw["owner_url"].strip("/").split("/")[-1],
        platform=data.platform,
        scraper=data.scraper,
        transformer=self.__version__,
        screenname=raw["owner_name"],
        name=raw["owner_name"],
        description=raw["description"],
        description_url="",  # does not exist for Bitchute
        description_location="",  # does not exist for Bitchute
        followers=raw["subscribers"],
        following=-1,  # does not exist for Bitchute
        verified=False,  # does not exist for Bitchute
        date_created=parse_created(raw["created"], data.date_archived),
        date_archived=data.date_archived,
        date_transformed=datetime.now(timezone.utc),
    )

    # the value returned by insert() was never used, so it is not rebound
    insert(transformed)
def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]: def transform(
self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts
) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data) raw = json.loads(data.raw_data)
if raw['category'] == 'comment': if raw["category"] == "comment":
if raw['parent_id'] is None: if raw["parent_id"] is None:
reply_to_id = raw['thread_id'] reply_to_id = raw["thread_id"]
else: else:
reply_to_id = raw['parent_id'] reply_to_id = raw["parent_id"]
flush_posts() flush_posts()
post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first() post = (
session.query(Post)
.filter_by(channel=data.channel, platform_id=reply_to_id)
.first()
)
if post is None: if post is None:
if raw['parent_id'] is not None: if raw["parent_id"] is not None:
# this block is for comments whose parent_ids correspond to deleted comments # this block is for comments whose parent_ids correspond to deleted comments
post = session.query(Post).filter_by(channel=data.channel, platform_id=raw['thread_id']).first() post = (
session.query(Post)
.filter_by(channel=data.channel, platform_id=raw["thread_id"])
.first()
)
if post is None: if post is None:
reply_to = -1 reply_to = -1
else: else:
@@ -78,18 +113,18 @@ class BitchuteTransformer(Transformer):
reply_to = -1 reply_to = -1
else: else:
reply_to = post.id reply_to = post.id
content = raw['body'].strip() content = raw["body"].strip()
else: else:
reply_to = -1 reply_to = -1
soup = BeautifulSoup(raw['body'], features = 'html.parser') soup = BeautifulSoup(raw["body"], features="html.parser")
soup.find('div', {'class': 'teaser'}).decompose() soup.find("div", {"class": "teaser"}).decompose()
soup.find('span', {'class': 'more'}).decompose() soup.find("span", {"class": "more"}).decompose()
soup.find('span', {'class': 'less hidden'}).decompose() soup.find("span", {"class": "less hidden"}).decompose()
content = soup.text.strip() content = soup.text.strip()
transformed = Post( transformed = Post(
raw_id=data.id, raw_id=data.id,
platform_id=raw['id'], platform_id=raw["id"],
scraper=data.scraper, scraper=data.scraper,
transformer=self.__version__, transformer=self.__version__,
platform=data.platform, platform=data.platform,
@@ -97,20 +132,24 @@ class BitchuteTransformer(Transformer):
date=data.date, date=data.date,
date_archived=data.date_archived, date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc), date_transformed=datetime.now(timezone.utc),
url=raw['url'] if raw['url'] else None, url=raw["url"] if raw["url"] else None,
content=content, content=content,
author_id=raw['author_id'], author_id=raw["author_id"],
author_username=raw['author'], author_username=raw["author"],
reply_to=reply_to, reply_to=reply_to,
hashtags = list(filter(None, [h.strip('#') for h in raw['hashtags'].split(',')])), hashtags=list(
likes = raw['likes'], filter(None, [h.strip("#") for h in raw["hashtags"].split(",")])
views = int(raw['views']) if raw.get('views') else None, ),
video_title = raw['subject'], likes=raw["likes"],
video_duration = _parse_duration_str(raw['length'])) views=int(raw["views"]) if raw.get("views") else None,
video_title=raw["subject"],
video_duration=_parse_duration_str(raw["length"]),
)
# insert_post # insert_post
transformed = insert_post(transformed) transformed = insert_post(transformed)
def parse_created(created: str, date_archived: datetime) -> datetime: def parse_created(created: str, date_archived: datetime) -> datetime:
"""Convert a created string (e.g. ``"1 year, 10 months ago"``) to a datetime """Convert a created string (e.g. ``"1 year, 10 months ago"``) to a datetime
object relative to the specified ``date_archived``. object relative to the specified ``date_archived``.
@@ -119,19 +158,26 @@ def parse_created(created: str, date_archived: datetime) -> datetime:
# handle case where `created` string has already been parsed into a datetime # handle case where `created` string has already been parsed into a datetime
return datetime.fromisoformat(created) return datetime.fromisoformat(created)
except ValueError: except ValueError:
period_list = ['year', 'month', 'week', 'day'] period_list = ["year", "month", "week", "day"]
periods = [period.strip() for period in created.split('ago')[0].strip().split(',')] periods = [
_kwargs = {period : int(number) for period, number in dict(reversed(p.split(' ')) for p in periods).items()} period.strip() for period in created.split("ago")[0].strip().split(",")
kwargs = {(k + 's' if k in period_list else k) : v for k, v in _kwargs.items()} ]
_kwargs = {
period: int(number)
for period, number in dict(reversed(p.split(" ")) for p in periods).items()
}
kwargs = {(k + "s" if k in period_list else k): v for k, v in _kwargs.items()}
return date_archived - relativedelta(**kwargs) return date_archived - relativedelta(**kwargs)
def _parse_duration_str(duration_str: str) -> int: def _parse_duration_str(duration_str: str) -> int:
"""Convert duration string (e.g. '2:27:04') to the number of seconds (e.g. 8824). """Convert duration string (e.g. '2:27:04') to the number of seconds (e.g. 8824)."""
"""
if not duration_str: if not duration_str:
return None return None
else: else:
duration_list = duration_str.split(':') duration_list = duration_str.split(":")
return sum([int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(duration_list))]) return sum(
[int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(duration_list))]
)

View File

@@ -8,7 +8,17 @@ from gogettr import PublicClient
from gogettr.api import GettrApiError from gogettr.api import GettrApiError
from cisticola.transformer.base import Transformer from cisticola.transformer.base import Transformer
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel from cisticola.base import (
RawChannelInfo,
ChannelInfo,
ScraperResult,
Post,
Image,
Video,
Media,
Channel,
)
class GettrTransformer(Transformer): class GettrTransformer(Transformer):
"""A Gettr specific ScraperResult, with a method ETL/transforming""" """A Gettr specific ScraperResult, with a method ETL/transforming"""
@@ -16,50 +26,58 @@ class GettrTransformer(Transformer):
__version__ = "GettrTransformer 0.0.1" __version__ = "GettrTransformer 0.0.1"
def can_handle(self, data: ScraperResult) -> bool:
    """Report whether ``data`` was produced by the Gettr scraper.

    ``data.scraper`` is split on single spaces and only the first token is
    compared, so any suffix after the scraper name is ignored.

    Returns:
        True when the first token is exactly ``"GettrScraper"``,
        otherwise False.
    """
    return data.scraper.split(" ")[0] == "GettrScraper"
def transform_info(
    self, data: RawChannelInfo, insert: Callable, session, channel=None
) -> Generator[Union[Post, Channel, Media], None, None]:
    """Build a ``ChannelInfo`` from raw Gettr profile data and insert it.

    Args:
        data: Raw channel info whose ``raw_data`` is a JSON document.
        insert: Callback that receives the new ``ChannelInfo`` object.
        session: Not used by this implementation.
        channel: Not used by this implementation.

    Note:
        Despite the ``Generator`` annotation, nothing is yielded; the object
        is handed to ``insert`` and the method returns ``None``.
    """
    raw = json.loads(data.raw_data)

    transformed = ChannelInfo(
        raw_channel_info_id=data.id,
        channel=data.channel,
        platform_id=raw["_id"],
        platform=data.platform,
        scraper=data.scraper,
        transformer=self.__version__,
        screenname=raw["username"],
        name=raw["nickname"],
        description=raw.get("dsc"),
        description_url=raw.get("website"),
        description_location=raw.get("location"),
        followers=int(raw["flg"]),
        following=int(raw["flw"]),
        verified=bool(raw.get("infl")),
        # "cdate" is scaled by 1/1000 before conversion (presumably epoch
        # milliseconds -- confirm against the Gettr API). Passing tz makes the
        # result timezone-aware UTC like date_transformed, instead of a naive
        # value in the host's local time.
        date_created=datetime.fromtimestamp(
            int(raw["cdate"]) / 1000, tz=timezone.utc
        ),
        date_archived=data.date_archived,
        date_transformed=datetime.now(timezone.utc),
    )

    # the value returned by insert() was never used, so it is not rebound
    insert(transformed)
def _get_channel_id(self, username: str, category: str, insert: Callable, session): def _get_channel_id(self, username: str, category: str, insert: Callable, session):
channel = (
channel = session.query(Channel).where((func.lower(Channel.screenname)==func.lower(username)) & (Channel.platform == 'Gettr')).first() session.query(Channel)
.where(
(func.lower(Channel.screenname) == func.lower(username))
& (Channel.platform == "Gettr")
)
.first()
)
if channel is None: if channel is None:
try: try:
client = PublicClient() client = PublicClient()
profile = client.user_info(username.lower()) profile = client.user_info(username.lower())
screenname = profile.get('_id') screenname = profile.get("_id")
channel = Channel( channel = Channel(
name=profile.get('nickname'), name=profile.get("nickname"),
platform_id=screenname, platform_id=screenname,
platform='Gettr', platform="Gettr",
url="https://gettr.com/user/" + screenname, url="https://gettr.com/user/" + screenname,
screenname=screenname, screenname=screenname,
category=category, category=category,
@@ -69,31 +87,41 @@ class GettrTransformer(Transformer):
channel = Channel( channel = Channel(
name=None, name=None,
platform_id=None, platform_id=None,
platform = 'Gettr', platform="Gettr",
url=None, url=None,
screenname=username, screenname=username,
category=category, category=category,
source=self.__version__, source=self.__version__,
notes='GettrApiError' notes="GettrApiError",
) )
channel = insert(channel) channel = insert(channel)
return channel.id return channel.id
def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]: def transform(
self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts
) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data) raw = json.loads(data.raw_data)
if raw["activity"]["action"] == "shares_pst": if raw["activity"]["action"] == "shares_pst":
forwarded_from = self._get_channel_id( forwarded_from = self._get_channel_id(
username = str(raw["activity"]["uid"]), category = 'forwarded', insert = insert, session = session) username=str(raw["activity"]["uid"]),
category="forwarded",
insert=insert,
session=session,
)
else: else:
forwarded_from = None forwarded_from = None
mentions = [] mentions = []
for mentioned_user in raw.get("utgs", []): for mentioned_user in raw.get("utgs", []):
mentioned_id = self._get_channel_id( mentioned_id = self._get_channel_id(
username = mentioned_user, category = 'mentioned', insert = insert, session = session) username=mentioned_user,
category="mentioned",
insert=insert,
session=session,
)
mentions.append(mentioned_id) mentions.append(mentioned_id)
transformed = Post( transformed = Post(
@@ -114,9 +142,9 @@ class GettrTransformer(Transformer):
outlinks=list(filter(None, [raw.get("prevsrc")])), outlinks=list(filter(None, [raw.get("prevsrc")])),
forwarded_from=forwarded_from, forwarded_from=forwarded_from,
mentions=mentions, mentions=mentions,
likes = raw.get('lkbpst'), likes=raw.get("lkbpst"),
forwards=raw.get("shbpst"), forwards=raw.get("shbpst"),
views = raw.get('vfpst') views=raw.get("vfpst"),
) )
# insert_post # insert_post

View File

@@ -6,7 +6,17 @@ from datetime import datetime, timezone
from sqlalchemy import func, JSON, String, cast, text from sqlalchemy import func, JSON, String, cast, text
from cisticola.transformer.base import Transformer from cisticola.transformer.base import Transformer
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel from cisticola.base import (
RawChannelInfo,
ChannelInfo,
ScraperResult,
Post,
Image,
Video,
Media,
Channel,
)
class RumbleTransformer(Transformer): class RumbleTransformer(Transformer):
"""A Rumble specific ScraperResult, with a method ETL/transforming""" """A Rumble specific ScraperResult, with a method ETL/transforming"""
@@ -14,25 +24,36 @@ class RumbleTransformer(Transformer):
__version__ = "RumbleTransformer 0.0.1" __version__ = "RumbleTransformer 0.0.1"
def can_handle(self, data: ScraperResult) -> bool:
    """Report whether ``data`` was produced by the Rumble scraper.

    ``data.scraper`` is split on single spaces and only the first token is
    compared, so any suffix after the scraper name is ignored.

    Returns:
        True when the first token is exactly ``"RumbleScraper"``,
        otherwise False.
    """
    return data.scraper.split(" ")[0] == "RumbleScraper"
def transform_info(self, data: RawChannelInfo, insert: Callable, session, channel=None) -> Generator[Union[Post, Channel, Media], None, None]: def transform_info(
self, data: RawChannelInfo, insert: Callable, session, channel=None
) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data) raw = json.loads(data.raw_data)
if 'id' not in raw: if "id" not in raw:
# The first version of the Rumble ChannelInfo scraper didn't return # The first version of the Rumble ChannelInfo scraper didn't return
# the platform_id, so this is a workaround. # the platform_id, so this is a workaround.
channel = session.query(RawChannelInfo).filter(text("raw_channel_info.raw_data::jsonb ->> 'name'=:name"), RawChannelInfo.platform == 'Rumble').params(name=raw['name']).order_by(RawChannelInfo.date_archived.desc()).first() channel = (
session.query(RawChannelInfo)
.filter(
text("raw_channel_info.raw_data::jsonb ->> 'name'=:name"),
RawChannelInfo.platform == "Rumble",
)
.params(name=raw["name"])
.order_by(RawChannelInfo.date_archived.desc())
.first()
)
if channel is None: if channel is None:
platform_id = None platform_id = None
else: else:
platform_id = json.loads(channel.raw_data)['id'] platform_id = json.loads(channel.raw_data)["id"]
else: else:
platform_id = raw['id'] platform_id = raw["id"]
transformed = ChannelInfo( transformed = ChannelInfo(
raw_channel_info_id=data.id, raw_channel_info_id=data.id,
@@ -42,63 +63,67 @@ class RumbleTransformer(Transformer):
scraper=data.scraper, scraper=data.scraper,
transformer=self.__version__, transformer=self.__version__,
screenname=platform_id, screenname=platform_id,
name=raw['name'], name=raw["name"],
description='', # does not exist for Rumble description="", # does not exist for Rumble
description_url='', # does not exist for Rumble description_url="", # does not exist for Rumble
description_location='', # does not exist for Rumble description_location="", # does not exist for Rumble
followers=_process_number(raw['subscribers']), followers=_process_number(raw["subscribers"]),
following=-1, # does not exist for Rumble following=-1, # does not exist for Rumble
verified=raw['verified'], verified=raw["verified"],
date_created=None, # does not exist for Rumble date_created=None, # does not exist for Rumble
date_archived=data.date_archived, date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc) date_transformed=datetime.now(timezone.utc),
) )
transformed = insert(transformed) transformed = insert(transformed)
def transform(
    self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts
) -> Generator[Union[Post, Channel, Media], None, None]:
    """Build a ``Post`` from a Rumble scraper result and queue it for insertion.

    Args:
        data: Scraper result whose ``raw_data`` is a JSON document.
        insert: Not used by this implementation.
        session: Not used by this implementation.
        insert_post: Callback that receives the new ``Post`` object.
        flush_posts: Not used by this implementation.

    Note:
        Despite the ``Generator`` annotation, nothing is yielded; the post is
        handed to ``insert_post`` and the method returns ``None``.
    """
    raw = json.loads(data.raw_data)

    transformed = Post(
        raw_id=data.id,
        # last path segment of the media URL
        platform_id=raw["media_url"].strip("/").split("/")[-1],
        scraper=data.scraper,
        transformer=self.__version__,
        platform=data.platform,
        channel=data.channel,
        date=dateutil.parser.parse(raw["datetime"]),
        date_archived=data.date_archived,
        date_transformed=datetime.now(timezone.utc),
        url=raw["link"],
        content=raw["content"],
        author_id=raw["author_id"],
        author_username=raw["author_name"],
        views=_process_number(raw.get("views")),
        likes=_process_number(raw.get("rumbles")),
        video_title=raw["title"],
        video_duration=_parse_duration_str(raw["duration"]),
    )

    insert_post(transformed)
def _process_number(s):
def _process_number(s):
if s is None: if s is None:
return None return None
else: else:
s = s.replace(' ', '').replace(',','') s = s.replace(" ", "").replace(",", "")
if s.endswith('M'): if s.endswith("M"):
return int(float(s[:-1]) * 1e6) return int(float(s[:-1]) * 1e6)
elif s.endswith('K'): elif s.endswith("K"):
return int(float(s[:-1]) * 1000) return int(float(s[:-1]) * 1000)
return int(s) return int(s)
def _parse_duration_str(duration_str: str) -> int: def _parse_duration_str(duration_str: str) -> int:
"""Convert duration string (e.g. '2:27:04') to the number of seconds (e.g. 8824). """Convert duration string (e.g. '2:27:04') to the number of seconds (e.g. 8824)."""
"""
if not duration_str: if not duration_str:
return None return None
else: else:
duration_list = duration_str.split(':') duration_list = duration_str.split(":")
return sum([int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(duration_list))]) return sum(
[int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(duration_list))]
)

View File

@@ -17,11 +17,21 @@ from datetime import datetime, timezone
from sqlalchemy import func from sqlalchemy import func
from cisticola.transformer.base import Transformer from cisticola.transformer.base import Transformer
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Audio, Media, Channel from cisticola.base import (
RawChannelInfo,
ChannelInfo,
ScraperResult,
Post,
Image,
Video,
Audio,
Media,
Channel,
)
class TelegramTelethonTransformer(Transformer): class TelegramTelethonTransformer(Transformer):
__version__ = 'TelegramTelethonTransformer 0.0.4' __version__ = "TelegramTelethonTransformer 0.0.4"
# TODO cache # TODO cache
# cache channels for which we cannot get the name from the web interface # cache channels for which we cannot get the name from the web interface
@@ -38,7 +48,7 @@ class TelegramTelethonTransformer(Transformer):
get_screenname_cache = {} get_screenname_cache = {}
def can_handle(self, data: ScraperResult) -> bool: def can_handle(self, data: ScraperResult) -> bool:
scraper = data.scraper.split(' ') scraper = data.scraper.split(" ")
if scraper[0] == "TelegramTelethonScraper": if scraper[0] == "TelegramTelethonScraper":
return True return True
@@ -47,9 +57,9 @@ class TelegramTelethonTransformer(Transformer):
def __init__(self, telethon_session_name=None): def __init__(self, telethon_session_name=None):
super().__init__() super().__init__()
api_id = os.environ['TELEGRAM_API_ID'] api_id = os.environ["TELEGRAM_API_ID"]
api_hash = os.environ['TELEGRAM_API_HASH'] api_hash = os.environ["TELEGRAM_API_HASH"]
phone = os.environ['TELEGRAM_PHONE'] phone = os.environ["TELEGRAM_PHONE"]
if telethon_session_name is None: if telethon_session_name is None:
telethon_session_name = phone telethon_session_name = phone
@@ -67,7 +77,11 @@ class TelegramTelethonTransformer(Transformer):
try: try:
data = self.client.get_entity(channel_id) data = self.client.get_entity(channel_id)
if isinstance(data, types.User): if isinstance(data, types.User):
output = (data.username, str(data.first_name or "") + " " + str(data.last_name or ""), "") output = (
data.username,
str(data.first_name or "") + " " + str(data.last_name or ""),
"",
)
else: else:
output = (data.username, data.title, "") output = (data.username, data.title, "")
except ChannelPrivateError: except ChannelPrivateError:
@@ -85,7 +99,9 @@ class TelegramTelethonTransformer(Transformer):
# this doesn't work for chat channels # this doesn't work for chat channels
if orig_screenname in self.bad_channels: if orig_screenname in self.bad_channels:
logger.debug(f"Skipping screenname because it is not accessible for channel {orig_screenname}") logger.debug(
f"Skipping screenname because it is not accessible for channel {orig_screenname}"
)
return "" return ""
logger.info(f"Finding channel from URL {url}") logger.info(f"Finding channel from URL {url}")
@@ -95,7 +111,7 @@ class TelegramTelethonTransformer(Transformer):
self.bad_channels[orig_screenname] = True self.bad_channels[orig_screenname] = True
return "" return ""
soup = BeautifulSoup(r.content, features = 'lxml') soup = BeautifulSoup(r.content, features="lxml")
post = soup.findAll("div", {"data-post": orig_screenname + "/" + str(id)}) post = soup.findAll("div", {"data-post": orig_screenname + "/" + str(id)})
name = "" name = ""
@@ -106,127 +122,173 @@ class TelegramTelethonTransformer(Transformer):
if decrement > 8: if decrement > 8:
break break
logger.info(f"Could not find post from {url}, looking for id {id - decrement}") logger.info(
post = soup.findAll("div", {"data-post" : orig_screenname + "/" + str(id - decrement)}) f"Could not find post from {url}, looking for id {id - decrement}"
)
post = soup.findAll(
"div", {"data-post": orig_screenname + "/" + str(id - decrement)}
)
if len(post) == 0: if len(post) == 0:
logger.warning(f"Could not find post from {url}") logger.warning(f"Could not find post from {url}")
else: else:
fwd_tag = post[0].findAll("a", {"class", "tgme_widget_message_forwarded_from_name"}) fwd_tag = post[0].findAll(
"a", {"class", "tgme_widget_message_forwarded_from_name"}
)
if len(fwd_tag) == 0: if len(fwd_tag) == 0:
fwd_tag = post[0].findAll("span", {"class", "tgme_widget_message_forwarded_from_name"}) fwd_tag = post[0].findAll(
"span", {"class", "tgme_widget_message_forwarded_from_name"}
)
if len(fwd_tag) >= 1: if len(fwd_tag) >= 1:
name = fwd_tag[0].text name = fwd_tag[0].text
return name return name
def transform_info(
    self, data: RawChannelInfo, insert: Callable, session, channel=None
) -> Generator[Union[Post, Channel, Media], None, None]:
    """Transform one raw Telegram channel-info record into a ``ChannelInfo``.

    The raw JSON is expected to hold a ``full_chat`` object and a ``chats``
    list; the first entry of ``chats`` describes the scraped channel itself
    and any further entries are stored as new "linked" channels that inherit
    category, platform, country and influencer from ``channel``.

    Parameters
    ----------
    data:
        Raw channel info whose ``raw_data`` attribute is a JSON string.
    insert:
        Callable that persists an object and returns the persisted instance.
    session:
        Database session used to backfill the parent channel's platform ID.
    channel:
        The parent ``Channel``. When it is ``None`` only the ``ChannelInfo``
        is inserted; the platform-ID backfill and the linked-chat creation
        both need a parent and are skipped.

    Notes
    -----
    Despite the return annotation (kept for interface compatibility), this
    method contains no ``yield``; all results are delivered through
    ``insert`` and it returns ``None``.
    """
    raw = json.loads(data.raw_data)
    full_chat = raw["full_chat"]
    chat_raw = raw["chats"][0]

    transformed = ChannelInfo(
        raw_channel_info_id=data.id,
        channel=data.channel,
        platform_id=full_chat["id"],
        platform=data.platform,
        scraper=data.scraper,
        transformer=self.__version__,
        screenname=chat_raw["username"],
        name=chat_raw["title"],
        description=full_chat["about"],
        description_url="",  # does not exist for Telegram
        description_location="",  # does not exist for Telegram
        followers=full_chat["participants_count"],
        following=-1,  # does not exist for Telegram
        verified=False,  # does not exist for Telegram
        date_created=dateutil.parser.parse(chat_raw["date"]),
        date_archived=data.date_archived,
        date_transformed=datetime.now(timezone.utc),
    )
    transformed = insert(transformed)

    if channel is None:
        # The signature allows channel=None, but everything below reads
        # attributes of the parent channel, so stop here instead of raising
        # AttributeError after the ChannelInfo has already been inserted.
        logger.warning(
            f"No channel supplied for raw channel info {data.id}, skipping platform ID and linked chats"
        )
        return

    if channel.platform_id is None:
        logger.info(
            f"Missing platform ID on {channel}, setting to {full_chat['id']}"
        )
        # Re-query so the update is applied to an instance owned by `session`.
        new_channel = session.query(Channel).where(Channel.id == channel.id).one()
        new_channel.platform_id = full_chat["id"]
        session.flush()
        session.commit()

    # Every chat after the first one is treated as linked to the parent.
    for chat in raw["chats"][1:]:
        # The key may be absent or present with a None value; normalise both
        # to "" so the URL concatenation can never hit None.
        username = chat.get("username") or ""
        new_chat = Channel(
            name=chat["title"],
            platform_id=chat["id"],
            category=channel.category,  # this should be the same as the "parent"
            platform=channel.platform,  # this should be the same as the "parent"
            url=f"https://t.me/s/{username}" if username else "",
            screenname=username,
            country=channel.country,  # this should be the same as the "parent"
            influencer=channel.influencer,  # this should be the same as the "parent"
            public=None,
            chat=not chat["broadcast"],
            notes=channel.id,  # this should be the channel ID of the parent
            source="linked_channel",
        )
        insert(new_chat)
# TODO this method API is chaotic and could be cleaned up # TODO this method API is chaotic and could be cleaned up
def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]: def transform(
self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts
) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data) raw = json.loads(data.raw_data)
if raw['_'] != 'Message': if raw["_"] != "Message":
logger.warning(f"Cannot convert type {raw['_']} to post") logger.warning(f"Cannot convert type {raw['_']} to post")
return return
fwd_from = None fwd_from = None
if raw['fwd_from'] and raw['fwd_from']['from_id'] and 'channel_id' in raw['fwd_from']['from_id']: if (
raw["fwd_from"]
and raw["fwd_from"]["from_id"]
and "channel_id" in raw["fwd_from"]["from_id"]
):
# use cache to look up channel instead of a DB request if possible # use cache to look up channel instead of a DB request if possible
if str(raw['fwd_from']['from_id']['channel_id']) not in self.channels_cache_by_platformid: if (
channel = session.query(Channel).filter_by(platform_id=str(raw['fwd_from']['from_id']['channel_id']), platform = 'Telegram').first() str(raw["fwd_from"]["from_id"]["channel_id"])
not in self.channels_cache_by_platformid
):
channel = (
session.query(Channel)
.filter_by(
platform_id=str(raw["fwd_from"]["from_id"]["channel_id"]),
platform="Telegram",
)
.first()
)
if channel is None: if channel is None:
(screenname, name, notes) = self.get_screenname_from_id(raw['fwd_from']['from_id']['channel_id']) (screenname, name, notes) = self.get_screenname_from_id(
raw["fwd_from"]["from_id"]["channel_id"]
)
if name == "": if name == "":
logger.info("Trying fallback web interface") logger.info("Trying fallback web interface")
orig_channel = session.query(Channel).filter_by(id=data.channel).first() orig_channel = (
session.query(Channel).filter_by(id=data.channel).first()
)
if orig_channel.screenname is not None: if orig_channel.screenname is not None:
name = self.get_name_from_web_interface(orig_channel.screenname, raw['id']) name = self.get_name_from_web_interface(
orig_channel.screenname, raw["id"]
)
channel = Channel( channel = Channel(
name=name, name=name,
platform_id=raw['fwd_from']['from_id']['channel_id'], platform_id=raw["fwd_from"]["from_id"]["channel_id"],
platform=data.platform, platform=data.platform,
url="https://t.me/s/" + screenname if screenname is not None else "", url="https://t.me/s/" + screenname
if screenname is not None
else "",
screenname=screenname, screenname=screenname,
category='forwarded', category="forwarded",
source=self.__version__, source=self.__version__,
notes=notes notes=notes,
) )
channel = insert(channel) channel = insert(channel)
logger.info(f"Added {channel}") logger.info(f"Added {channel}")
self.channels_cache_by_platformid[str(raw['fwd_from']['from_id']['channel_id'])] = channel self.channels_cache_by_platformid[
str(raw["fwd_from"]["from_id"]["channel_id"])
] = channel
fwd_from = self.channels_cache_by_platformid[str(raw['fwd_from']['from_id']['channel_id'])].id fwd_from = self.channels_cache_by_platformid[
str(raw["fwd_from"]["from_id"]["channel_id"])
].id
reply_to = None reply_to = None
if raw['reply_to']: if raw["reply_to"]:
reply_to_id = str(raw['reply_to']['reply_to_msg_id']) reply_to_id = str(raw["reply_to"]["reply_to_msg_id"])
# use cache to find post ID instead of a DB request, if possible # use cache to find post ID instead of a DB request, if possible
if (data.channel, reply_to_id) not in self.posts_cache: if (data.channel, reply_to_id) not in self.posts_cache:
session.commit() session.commit()
flush_posts() # TODO this is necessary because the post we are looking for might have been added in the same session flush_posts() # TODO this is necessary because the post we are looking for might have been added in the same session
post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first() post = (
session.query(Post)
.filter_by(channel=data.channel, platform_id=reply_to_id)
.first()
)
if post is None: if post is None:
reply_to = -1 reply_to = -1
else: else:
@@ -238,25 +300,36 @@ class TelegramTelethonTransformer(Transformer):
mentions = [] mentions = []
for mention_entity in [entity for entity in raw['entities'] if entity['_'] == 'MessageEntityMention']: for mention_entity in [
offset = mention_entity['offset'] entity
length = mention_entity['length'] for entity in raw["entities"]
if entity["_"] == "MessageEntityMention"
]:
offset = mention_entity["offset"]
length = mention_entity["length"]
screenname = add_surrogate(raw['message'])[offset:offset+length].strip('@').strip() screenname = (
add_surrogate(raw["message"])[offset : offset + length]
.strip("@")
.strip()
)
# use cache rather than a DB request if possible # use cache rather than a DB request if possible
if screenname.lower() not in self.channels_cache_by_screenname: if screenname.lower() not in self.channels_cache_by_screenname:
channel = session.query(Channel).filter(func.lower(Channel.screenname)==func.lower(screenname)).first() channel = (
session.query(Channel)
.filter(func.lower(Channel.screenname) == func.lower(screenname))
.first()
)
if channel is None: if channel is None:
channel = Channel( channel = Channel(
name=None, name=None,
platform_id=None, platform_id=None,
platform = 'Telegram', platform="Telegram",
url="https://t.me/s/" + screenname, url="https://t.me/s/" + screenname,
screenname=screenname, screenname=screenname,
category='mentioned', category="mentioned",
source=self.__version__, source=self.__version__,
) )
@@ -277,15 +350,15 @@ class TelegramTelethonTransformer(Transformer):
channel = self.channels_cache_by_id[int(data.channel)] channel = self.channels_cache_by_id[int(data.channel)]
if channel is not None and channel.url: if channel is not None and channel.url:
url = channel.url.strip('/') + f"/{raw['id']}" url = channel.url.strip("/") + f"/{raw['id']}"
author_username = channel.screenname author_username = channel.screenname
else: else:
url = "" url = ""
author_username = "" author_username = ""
author_id = raw.get('peer_id', {}).get('channel_id') author_id = raw.get("peer_id", {}).get("channel_id")
if raw['from_id'] and 'user_id' in raw['from_id']: if raw["from_id"] and "user_id" in raw["from_id"]:
author_id = raw['from_id']['user_id'] author_id = raw["from_id"]["user_id"]
author_username = "" author_username = ""
(screenname, name, notes) = self.get_screenname_from_id(author_id) (screenname, name, notes) = self.get_screenname_from_id(author_id)
if screenname: if screenname:
@@ -293,12 +366,12 @@ class TelegramTelethonTransformer(Transformer):
transformed = Post( transformed = Post(
raw_id=data.id, raw_id=data.id,
platform_id = raw['id'], platform_id=raw["id"],
scraper=data.scraper, scraper=data.scraper,
transformer=self.__version__, transformer=self.__version__,
platform=data.platform, platform=data.platform,
channel=data.channel, channel=data.channel,
date=dateutil.parser.parse(raw['date']), date=dateutil.parser.parse(raw["date"]),
date_archived=data.date_archived, date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc), date_transformed=datetime.now(timezone.utc),
url=url, url=url,
@@ -308,47 +381,56 @@ class TelegramTelethonTransformer(Transformer):
forwarded_from=fwd_from, forwarded_from=fwd_from,
reply_to=reply_to, reply_to=reply_to,
mentions=mentions, mentions=mentions,
forwards = raw.get('forwards'), forwards=raw.get("forwards"),
views = raw.get('views') views=raw.get("views"),
) )
# insert_post # insert_post
insert_post(transformed) insert_post(transformed)
def stripped(s):
    """Return the whitespace that ``s.strip()`` would remove from ``s``.

    The result is the leading whitespace run followed by the trailing
    whitespace run. A string made up only of whitespace is returned once,
    unchanged, rather than being counted as both its own prefix and suffix.

    Adapted from https://stackoverflow.com/a/29933716
    """
    leading = "".join(takewhile(str.isspace, s))
    if len(leading) == len(s):
        # Empty or all-whitespace: the leading run already covers everything.
        return leading
    trailing = "".join(reversed(tuple(takewhile(str.isspace, reversed(s)))))
    return leading + trailing
def add_markdown_links(raw_post):
    """Rewrite ``MessageEntityTextUrl`` entities of a post as markdown links.

    This function is necessary because Telethon's markdown.unparse doesn't
    correctly handle trailing whitespace or multi-line links.

    ``raw_post`` must provide ``"message"`` (the text) and ``"entities"``
    (dicts carrying ``"_"``, ``"offset"``, ``"length"`` and, for text URLs,
    ``"url"``). Each linked span becomes ``[text](url)``; whitespace at either
    edge of the span is moved behind the link, and newlines inside the link
    text are escaped with a backslash. Spans that contain nothing but
    whitespace / zero-width spaces are left untouched.
    """
    # NOTE(review): the text is processed in surrogate form, presumably so
    # that entity offsets (UTF-16 based in Telegram) index correctly - confirm.
    content = add_surrogate(raw_post["message"])

    # The running shift below is only valid if links are handled in ascending
    # offset order, so sort explicitly instead of relying on the input order.
    links = sorted(
        (
            entity
            for entity in raw_post["entities"]
            if entity["_"] == "MessageEntityTextUrl"
        ),
        key=lambda entity: entity["offset"],
    )

    shift = 0  # characters inserted so far by earlier links
    for link in links:
        start = shift + link["offset"]
        end = start + link["length"]
        url = link["url"]
        inner_text = content[start:end]

        # skip creation of link if inner link text is only whitespace
        if not inner_text.replace("\u200b", "").strip():
            continue

        label = inner_text.strip()
        escaped_label = label.replace("\n", "\\\n")
        content = (
            content[:start]
            + f"[{escaped_label}]"
            + f"({url})"
            + stripped(inner_text)
            + content[end:]
        )
        # "[", "]", "(" and ")" add 4 characters, the URL adds len(url), and
        # every escaped newline adds one backslash.
        shift += 4 + len(url) + label.count("\n")

    return del_surrogate(content)

View File

@@ -2,8 +2,8 @@ import requests
from loguru import logger from loguru import logger
import time import time
def make_request(url, headers = None, max_retries = 5, break_codes = None):
def make_request(url, headers=None, max_retries=5, break_codes=None):
"""Retry request `max_retries` times, while catching arbitrary exceptions. """Retry request `max_retries` times, while catching arbitrary exceptions.
Parameters Parameters
@@ -33,20 +33,17 @@ def make_request(url, headers = None, max_retries = 5, break_codes = None):
try: try:
r = request_until_200( r = request_until_200(
url = url, url=url, headers=headers, max_retries=max_retries, break_codes=break_codes
headers = headers, )
max_retries = max_retries,
break_codes = break_codes)
logger.debug(f"Request for url: {url} succeeded") logger.debug(f"Request for url: {url} succeeded")
except Exception as e: except Exception as e:
logger.warning(f"Request for url: {url} raised exception: [{e}]") logger.warning(f"Request for url: {url} raised exception: [{e}]")
return r return r
def request_until_200(url, headers = None, max_retries = 5, break_codes = None):
"""Retry request `max_retries` times, or until the request is successful. def request_until_200(url, headers=None, max_retries=5, break_codes=None):
""" """Retry request `max_retries` times, or until the request is successful."""
if break_codes is None: if break_codes is None:
break_codes = [200] break_codes = [200]
@@ -57,7 +54,9 @@ def request_until_200(url, headers = None, max_retries = 5, break_codes = None):
r = requests.get(url, headers=headers) r = requests.get(url, headers=headers)
while r.status_code not in break_codes and n_retries < 5: while r.status_code not in break_codes and n_retries < 5:
logger.warning(f"Request for url: {url} returned status: {r.status_code} on attempt: {n_retries}/{max_retries}") logger.warning(
f"Request for url: {url} returned status: {r.status_code} on attempt: {n_retries}/{max_retries}"
)
n_retries += 1 n_retries += 1
# back off subsequent requests # back off subsequent requests
@@ -65,6 +64,8 @@ def request_until_200(url, headers = None, max_retries = 5, break_codes = None):
r = requests.get(url, headers=headers) r = requests.get(url, headers=headers)
if r.status_code not in break_codes: if r.status_code not in break_codes:
raise ValueError(f"Request for url: {url} failed with status: {r.status_code} after {max_retries} attempts") raise ValueError(
f"Request for url: {url} failed with status: {r.status_code} after {max_retries} attempts"
)
return r return r

View File

@@ -12,14 +12,15 @@
# #
import os import os
import sys import sys
sys.path.insert(0, os.path.abspath('../../'))
sys.path.insert(0, os.path.abspath("../../"))
# -- Project information ----------------------------------------------------- # -- Project information -----------------------------------------------------
project = 'Cisticola' project = "Cisticola"
copyright = '2022, Bellingcat' copyright = "2022, Bellingcat"
author = 'Bellingcat' author = "Bellingcat"
# -- General configuration --------------------------------------------------- # -- General configuration ---------------------------------------------------
@@ -27,10 +28,10 @@ author = 'Bellingcat'
# Add any Sphinx extension module names here, as strings. They can be # Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones. # ones.
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.coverage', 'sphinx.ext.napoleon'] extensions = ["sphinx.ext.autodoc", "sphinx.ext.coverage", "sphinx.ext.napoleon"]
# Add any paths that contain templates here, relative to this directory. # Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates'] templates_path = ["_templates"]
# List of patterns, relative to source directory, that match files and # List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files. # directories to ignore when looking for source files.
@@ -43,7 +44,7 @@ exclude_patterns = []
# The theme to use for HTML and HTML Help pages. See the documentation for # The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes. # a list of builtin themes.
# #
html_theme = 'sphinx_rtd_theme' html_theme = "sphinx_rtd_theme"
# Add any paths that contain custom static files (such as style sheets) here, # Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files, # relative to this directory. They are copied after the builtin static files,
@@ -52,9 +53,9 @@ html_static_path = []
# -- Default flags for autodoc------------------------------------------------ # -- Default flags for autodoc------------------------------------------------
autodoc_default_options = {'exclude-members': '_sa_class_manager'} autodoc_default_options = {"exclude-members": "_sa_class_manager"}
html_favicon = '../images/favicon.ico' html_favicon = "../images/favicon.ico"
html_logo = '../images/cisticola_logo.svg' html_logo = "../images/cisticola_logo.svg"
html_theme_options = {'style_nav_header_background': '#292a2b'} html_theme_options = {"style_nav_header_background": "#292a2b"}

View File

@@ -20,10 +20,12 @@ expected_headers = [
"chat", "chat",
"notes", "notes",
"normalized_url", "normalized_url",
"to_remove"] "to_remove",
]
def standardize_country(s: str) -> str:
    """Return the country label in ``s`` without annotations.

    Everything from the first ``(`` onward is dropped, then everything from
    the first ``?`` onward, and surrounding whitespace is removed, e.g.
    ``"US (unsure)"`` -> ``"US"`` and ``"Canada?"`` -> ``"Canada"``.
    """
    label = s.partition("(")[0]
    label = label.partition("?")[0]
    return label.strip()
@@ -33,7 +35,7 @@ def sync_channels(args, session):
gc = gspread.service_account(filename="service_account.json") gc = gspread.service_account(filename="service_account.json")
# Open a sheet from a spreadsheet in one go # Open a sheet from a spreadsheet in one go
wks = gc.open_by_url(os.environ['GSHEET']).worksheet("channels") wks = gc.open_by_url(os.environ["GSHEET"]).worksheet("channels")
channels = wks.get_all_records(expected_headers=expected_headers) channels = wks.get_all_records(expected_headers=expected_headers)
row = 2 row = 2
@@ -65,22 +67,30 @@ def sync_channels(args, session):
if c["platform_id"] != "": if c["platform_id"] != "":
platform_id = c["platform_id"] platform_id = c["platform_id"]
channel = (
session.query(Channel)
.filter_by(platform_id=str(platform_id), platform=str(c["platform"]))
.first()
)
if not channel:
channel = (
session.query(Channel)
.filter_by(platform=str(c["platform"]), url=str(c["url"]))
.first()
)
if not channel and c["screenname"] != "" and c["screenname"] is not None:
channel = ( channel = (
session.query(Channel) session.query(Channel)
.filter_by( .filter_by(
platform_id=str(platform_id), platform=str(c["platform"]) platform=str(c["platform"]), screenname=str(c["screenname"])
) )
.first() .first()
) )
if not channel: if not channel:
channel = session.query(Channel).filter_by(platform=str(c["platform"]), url=str(c["url"])).first() if all([k in [None, True, False, ""] for k in c.values()]):
if not channel and c["screenname"] != '' and c["screenname"] is not None:
channel = session.query(Channel).filter_by(platform=str(c["platform"]), screenname=str(c["screenname"])).first()
if not channel:
if all([k in [None, True, False, ''] for k in c.values()]):
# end sync if completely empty row is encountered # end sync if completely empty row is encountered
break break
@@ -109,7 +119,11 @@ def sync_channels(args, session):
if c["screenname"]: if c["screenname"]:
channel.screenname = c["screenname"] channel.screenname = c["screenname"]
if c["country"]: if c["country"]:
channel.country = None if c["country"] is None else list(map(standardize_country, c["country"].split('/'))) channel.country = (
None
if c["country"] is None
else list(map(standardize_country, c["country"].split("/")))
)
if c["influencer"]: if c["influencer"]:
channel.influencer = c["influencer"] channel.influencer = c["influencer"]
if c["public"]: if c["public"]:
@@ -129,23 +143,27 @@ def sync_channels(args, session):
# this likely means that the channel was duplicated in the Google Sheet, so add a red highlight # this likely means that the channel was duplicated in the Google Sheet, so add a red highlight
if was_researcher: if was_researcher:
logger.warning(f"This channel (ID {channel.id}) is possibly a duplicate.") logger.warning(
f"This channel (ID {channel.id}) is possibly a duplicate."
)
wks.format(f"A{str(row)}:A{str(row)}", { wks.format(
"backgroundColor": { f"A{str(row)}:A{str(row)}",
"red": 1.0, {"backgroundColor": {"red": 1.0, "green": 0.0, "blue": 0.0}},
"green": 0.0, )
"blue": 0.0
}})
time.sleep(1) time.sleep(1)
# channel has ID # channel has ID
else: else:
cid = int(c["id"]) cid = int(c["id"])
channel = session.query(Channel).filter_by(id=cid).first() channel = session.query(Channel).filter_by(id=cid).first()
channel_info = session.query(ChannelInfo).filter_by(channel=cid).order_by(ChannelInfo.date_archived.desc()).first() channel_info = (
session.query(ChannelInfo)
.filter_by(channel=cid)
.order_by(ChannelInfo.date_archived.desc())
.first()
)
logger.info(f"Updating channel {channel}") logger.info(f"Updating channel {channel}")
logger.info(f"Found info {channel_info}") logger.info(f"Found info {channel_info}")
@@ -155,7 +173,11 @@ def sync_channels(args, session):
channel.platform = c["platform"] channel.platform = c["platform"]
channel.url = c["url"] channel.url = c["url"]
channel.screenname = c["screenname"] channel.screenname = c["screenname"]
channel.country = None if c["country"] is None else list(map(standardize_country, c["country"].split('/'))) channel.country = (
None
if c["country"] is None
else list(map(standardize_country, c["country"].split("/")))
)
channel.influencer = c["influencer"] channel.influencer = c["influencer"]
channel.public = c["public"] channel.public = c["public"]
channel.chat = c["chat"] channel.chat = c["chat"]
@@ -167,7 +189,9 @@ def sync_channels(args, session):
wks.update_cell(row, 7, channel_info.screenname) wks.update_cell(row, 7, channel_info.screenname)
time.sleep(1) time.sleep(1)
if channel_info and str(channel.platform_id) != str(channel_info.platform_id): if channel_info and str(channel.platform_id) != str(
channel_info.platform_id
):
channel.platform_id = channel_info.platform_id channel.platform_id = channel_info.platform_id
wks.update_cell(row, 3, channel_info.platform_id) wks.update_cell(row, 3, channel_info.platform_id)
time.sleep(1) time.sleep(1)

View File

@@ -8,9 +8,9 @@ if __name__ == "__main__":
args = parser.parse_args() args = parser.parse_args()
api_id = os.environ['TELEGRAM_API_ID'] api_id = os.environ["TELEGRAM_API_ID"]
api_hash = os.environ['TELEGRAM_API_HASH'] api_hash = os.environ["TELEGRAM_API_HASH"]
phone = os.environ['TELEGRAM_PHONE'] phone = os.environ["TELEGRAM_PHONE"]
telethon_session_name = args.telethon_session telethon_session_name = args.telethon_session
if telethon_session_name is None: if telethon_session_name is None:

View File

@@ -1,49 +1,51 @@
import pytest import pytest
from sqlalchemy.sql import text from sqlalchemy.sql import text
from cisticola.base import Post, Channel, ChannelInfo, Media, ScraperResult, RawChannelInfo from cisticola.base import (
Post,
Channel,
ChannelInfo,
Media,
ScraperResult,
RawChannelInfo,
)
from cisticola.scraper import ( from cisticola.scraper import (
TelegramTelethonScraper, TelegramTelethonScraper,
BitchuteScraper, BitchuteScraper,
GettrScraper, GettrScraper,
RumbleScraper) RumbleScraper,
)
from cisticola.transformer import ( from cisticola.transformer import (
TelegramTelethonTransformer, TelegramTelethonTransformer,
BitchuteTransformer, BitchuteTransformer,
GettrTransformer, GettrTransformer,
RumbleTransformer) RumbleTransformer,
)
# Scraper/transformer class pair for each platform, keyed by the platform
# name used in the parametrized test IDs.
CONTROLLERS = {
    "telegram": {
        "scraper": TelegramTelethonScraper,
        "transformer": TelegramTelethonTransformer,
    },
    "bitchute": {
        "scraper": BitchuteScraper,
        "transformer": BitchuteTransformer,
    },
    "gettr": {
        "scraper": GettrScraper,
        "transformer": GettrTransformer,
    },
    "rumble": {
        "scraper": RumbleScraper,
        "transformer": RumbleTransformer,
    },
}
@pytest.mark.parametrize('platform', ['telegram','bitchute', 'gettr', 'rumble']) @pytest.mark.parametrize("platform", ["telegram", "bitchute", "gettr", "rumble"])
def test_scraper_and_transformer(platform, session, controller, etl_controller, channel_kwargs): def test_scraper_and_transformer(
platform, session, controller, etl_controller, channel_kwargs
):
controller.reset_db() controller.reset_db()
controller.remove_all_scrapers() controller.remove_all_scrapers()
# necessary for comments/replies to be processed correctly # necessary for comments/replies to be processed correctly
session.execute(text('INSERT INTO posts(id) VALUES (-1)')) session.execute(text("INSERT INTO posts(id) VALUES (-1)"))
session.commit() session.commit()
channels = [Channel(**channel_kwargs[platform])] channels = [Channel(**channel_kwargs[platform])]
scraper = CONTROLLERS[platform]['scraper'] scraper = CONTROLLERS[platform]["scraper"]
controller.register_scraper(scraper=scraper()) controller.register_scraper(scraper=scraper())
controller.scrape_channels(channels=channels) controller.scrape_channels(channels=channels)
@@ -52,7 +54,11 @@ def test_scraper_and_transformer(platform, session, controller, etl_controller,
raw_posts = session.query(ScraperResult).all() raw_posts = session.query(ScraperResult).all()
raw_channel_info = session.query(RawChannelInfo).all() raw_channel_info = session.query(RawChannelInfo).all()
archived_urls = session.query(ScraperResult.archived_urls).order_by(ScraperResult.date_archived.desc()).first() archived_urls = (
session.query(ScraperResult.archived_urls)
.order_by(ScraperResult.date_archived.desc())
.first()
)
assert len(raw_posts) > 0 assert len(raw_posts) > 0
assert len(raw_channel_info) > 0 assert len(raw_channel_info) > 0
@@ -60,7 +66,7 @@ def test_scraper_and_transformer(platform, session, controller, etl_controller,
controller.remove_all_scrapers() controller.remove_all_scrapers()
transformer = CONTROLLERS[platform]['transformer'] transformer = CONTROLLERS[platform]["transformer"]
etl_controller.register_transformer(transformer()) etl_controller.register_transformer(transformer())
etl_controller.transform_all_untransformed() etl_controller.transform_all_untransformed()

View File

@@ -8,161 +8,172 @@ from cisticola.scraper import ScraperController
from cisticola.transformer import ETLController from cisticola.transformer import ETLController
BITCHUTE_CHANNEL_KWARGS = { BITCHUTE_CHANNEL_KWARGS = {
'name': 'bestonlinejewelrystoresusa@gmail.com (test)', "name": "bestonlinejewelrystoresusa@gmail.com (test)",
'platform_id': 'bestonlinejewelrystoresusagmailcom', "platform_id": "bestonlinejewelrystoresusagmailcom",
'category': 'test', "category": "test",
'platform': 'Bitchute', "platform": "Bitchute",
'url': 'https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/', "url": "https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/",
'screenname': None, "screenname": None,
'country': 'US', "country": "US",
'influencer': None, "influencer": None,
'public': True, "public": True,
'chat': False, "chat": False,
'notes': '', "notes": "",
'source': 'researcher'} "source": "researcher",
}
GAB_CHANNEL_KWARGS = { GAB_CHANNEL_KWARGS = {
'name': 'Capt. Marc Simon (test)', "name": "Capt. Marc Simon (test)",
'platform_id': 'marc_capt', "platform_id": "marc_capt",
'category': 'test', "category": "test",
'platform': 'Gab', "platform": "Gab",
'url': 'https://gab.com/marc_capt', "url": "https://gab.com/marc_capt",
'screenname': 'marc_capt', "screenname": "marc_capt",
'country': 'CA', "country": "CA",
'influencer': None, "influencer": None,
'public': True, "public": True,
'chat': False, "chat": False,
'notes': '', "notes": "",
'source': 'researcher'} "source": "researcher",
}
GAB_GROUP_KWARGS = { GAB_GROUP_KWARGS = {
'name': 'iran group (test)', "name": "iran group (test)",
'platform_id': "10001", "platform_id": "10001",
'category': 'test', "category": "test",
'platform': 'Gab', "platform": "Gab",
'url': 'https://gab.com/groups/10001', "url": "https://gab.com/groups/10001",
'screenname': 'iran group', "screenname": "iran group",
'country': 'IR', "country": "IR",
'influencer': None, "influencer": None,
'public': True, "public": True,
'chat': True, "chat": True,
'notes': '', "notes": "",
'source': 'researcher'} "source": "researcher",
}
GETTR_CHANNEL_KWARGS = { GETTR_CHANNEL_KWARGS = {
'name': 'LizardRepublic (test)', "name": "LizardRepublic (test)",
'platform_id': 'lizardrepublic', "platform_id": "lizardrepublic",
'category': 'test', "category": "test",
'platform': 'Gettr', "platform": "Gettr",
'url': 'https://www.gettr.com/user/lizardrepublic', "url": "https://www.gettr.com/user/lizardrepublic",
'screenname': 'lizardrepublic', "screenname": "lizardrepublic",
'country': 'US', "country": "US",
'influencer': None, "influencer": None,
'public': True, "public": True,
'chat': False, "chat": False,
'notes': '', "notes": "",
'source': 'researcher'} "source": "researcher",
}
INSTAGRAM_CHANNEL_KWARGS = { INSTAGRAM_CHANNEL_KWARGS = {
'name': 'borland.88 (test)', "name": "borland.88 (test)",
'platform_id': 'borland.88', "platform_id": "borland.88",
'category': 'test', "category": "test",
'platform': 'Instagram', "platform": "Instagram",
'url': 'https://www.instagram.com/borland.88/', "url": "https://www.instagram.com/borland.88/",
'screenname': 'borland.88', "screenname": "borland.88",
'country': 'UA', "country": "UA",
'influencer': None, "influencer": None,
'public': True, "public": True,
'chat': False, "chat": False,
'notes': '', "notes": "",
'source': 'researcher'} "source": "researcher",
}
ODYSEE_CHANNEL_KWARGS = { ODYSEE_CHANNEL_KWARGS = {
'name': "Mak1n' Bacon (test)", "name": "Mak1n' Bacon (test)",
'platform_id': 'Mak1nBacon', "platform_id": "Mak1nBacon",
'category': 'test', "category": "test",
'platform': 'Odysee', "platform": "Odysee",
'url': 'https://odysee.com/@Mak1nBacon', "url": "https://odysee.com/@Mak1nBacon",
'screenname': 'Mak1nBacon', "screenname": "Mak1nBacon",
'country': 'US', "country": "US",
'influencer': None, "influencer": None,
'public': True, "public": True,
'chat': False, "chat": False,
'notes': '', "notes": "",
'source': 'researcher'} "source": "researcher",
}
RUMBLE_CHANNEL_KWARGS = { RUMBLE_CHANNEL_KWARGS = {
'name': 'we are uploading videos wow products (test)', "name": "we are uploading videos wow products (test)",
'platform_id': 'c-916305', "platform_id": "c-916305",
'category': 'test', "category": "test",
'platform': 'Rumble', "platform": "Rumble",
'url': 'https://rumble.com/c/c-916305', "url": "https://rumble.com/c/c-916305",
'screenname': 'we are uploading', "screenname": "we are uploading",
'country': 'CA', "country": "CA",
'influencer': None, "influencer": None,
'public': True, "public": True,
'chat': False, "chat": False,
'notes': '', "notes": "",
'source': 'researcher'} "source": "researcher",
}
TELEGRAM_CHANNEL_KWARGS = { TELEGRAM_CHANNEL_KWARGS = {
'name': 'Бутылка (test)', "name": "Бутылка (test)",
'platform_id': "-1001760492118", "platform_id": "-1001760492118",
'category': 'test', "category": "test",
'platform': 'Telegram', "platform": "Telegram",
'url': 'https://t.me/butylka1488', "url": "https://t.me/butylka1488",
'screenname': 'butylka1488', "screenname": "butylka1488",
'country': 'RU', "country": "RU",
'influencer': None, "influencer": None,
'public': True, "public": True,
'chat': False, "chat": False,
'notes': '', "notes": "",
'source': 'researcher'} "source": "researcher",
}
TWITTER_CHANNEL_KWARGS = { TWITTER_CHANNEL_KWARGS = {
'name': 'L Weber (test)', "name": "L Weber (test)",
'platform_id': "1424979017749442595", "platform_id": "1424979017749442595",
'category': 'test', "category": "test",
'platform': 'Twitter', "platform": "Twitter",
'url': 'https://twitter.com/LWeber33662141', "url": "https://twitter.com/LWeber33662141",
'screenname': 'LWeber33662141', "screenname": "LWeber33662141",
'country': 'US', "country": "US",
'influencer': None, "influencer": None,
'public': True, "public": True,
'chat': False, "chat": False,
'notes': '', "notes": "",
'source': 'researcher'} "source": "researcher",
}
VKONTAKTE_CHANNEL_KWARGS = { VKONTAKTE_CHANNEL_KWARGS = {
'name': 'Wwg1wgA (test)', "name": "Wwg1wgA (test)",
'platform_id': 'club201278078', "platform_id": "club201278078",
'category': 'test', "category": "test",
'platform': 'Vkontakte', "platform": "Vkontakte",
'url': 'https://vk.com/club201278078', "url": "https://vk.com/club201278078",
'screenname': 'Wwg1wgA', "screenname": "Wwg1wgA",
'country': 'FR', "country": "FR",
'influencer': None, "influencer": None,
'public': True, "public": True,
'chat': False, "chat": False,
'notes': '', "notes": "",
'source': 'researcher'} "source": "researcher",
}
YOUTUBE_CHANNEL_KWARGS = { YOUTUBE_CHANNEL_KWARGS = {
'name': 'AnEs87 (test)', "name": "AnEs87 (test)",
'platform_id': 'UCP6exBqGoxGLv_pM9Dxk2pA', "platform_id": "UCP6exBqGoxGLv_pM9Dxk2pA",
'category': 'test', "category": "test",
'platform': 'Youtube', "platform": "Youtube",
'url': 'https://www.youtube.com/channel/UCP6exBqGoxGLv_pM9Dxk2pA', "url": "https://www.youtube.com/channel/UCP6exBqGoxGLv_pM9Dxk2pA",
'screenname': 'AnEs87', "screenname": "AnEs87",
'country': 'SV', "country": "SV",
'influencer': None, "influencer": None,
'public': True, "public": True,
'chat': False, "chat": False,
'notes': '', "notes": "",
'source': 'researcher'} "source": "researcher",
}
@pytest.fixture(scope='package') @pytest.fixture(scope="package")
def engine(tmpdir_factory): def engine(tmpdir_factory):
"""Initialize a SQLite database and SQLAlchemy engine to be used for all """Initialize a SQLite database and SQLAlchemy engine to be used for all
tests in the package""" tests in the package"""
@@ -171,7 +182,8 @@ def engine(tmpdir_factory):
return engine return engine
@pytest.fixture(scope='package')
@pytest.fixture(scope="package")
def session(engine): def session(engine):
"""Initialize a SQLAlchemy session to be used for all tests in the package""" """Initialize a SQLAlchemy session to be used for all tests in the package"""
@@ -179,7 +191,8 @@ def session(engine):
sessionfactory.configure(bind=engine) sessionfactory.configure(bind=engine)
return sessionfactory() return sessionfactory()
@pytest.fixture(scope='package')
@pytest.fixture(scope="package")
def controller(engine): def controller(engine):
"""Initialize ScraperController to be used for all tests in the package.""" """Initialize ScraperController to be used for all tests in the package."""
@@ -188,7 +201,8 @@ def controller(engine):
return scraper_controller return scraper_controller
@pytest.fixture(scope='package')
@pytest.fixture(scope="package")
def etl_controller(engine): def etl_controller(engine):
"""Initialize ETLController to be used for all tests in the package.""" """Initialize ETLController to be used for all tests in the package."""
@@ -197,21 +211,23 @@ def etl_controller(engine):
return etl_controller return etl_controller
@pytest.fixture(scope='package')
@pytest.fixture(scope="package")
def channel_kwargs(): def channel_kwargs():
"""Define keyword arguments to use for defining test channels for each """Define keyword arguments to use for defining test channels for each
platform to be scraped. platform to be scraped.
""" """
return { return {
'bitchute' : BITCHUTE_CHANNEL_KWARGS, "bitchute": BITCHUTE_CHANNEL_KWARGS,
'gab' : GAB_CHANNEL_KWARGS, "gab": GAB_CHANNEL_KWARGS,
'gab_group' : GAB_GROUP_KWARGS, "gab_group": GAB_GROUP_KWARGS,
'gettr' : GETTR_CHANNEL_KWARGS, "gettr": GETTR_CHANNEL_KWARGS,
'instagram' : INSTAGRAM_CHANNEL_KWARGS, "instagram": INSTAGRAM_CHANNEL_KWARGS,
'odysee' : ODYSEE_CHANNEL_KWARGS, "odysee": ODYSEE_CHANNEL_KWARGS,
'rumble' : RUMBLE_CHANNEL_KWARGS, "rumble": RUMBLE_CHANNEL_KWARGS,
'telegram' : TELEGRAM_CHANNEL_KWARGS, "telegram": TELEGRAM_CHANNEL_KWARGS,
'twitter' : TWITTER_CHANNEL_KWARGS, "twitter": TWITTER_CHANNEL_KWARGS,
'vkontakte' : VKONTAKTE_CHANNEL_KWARGS, "vkontakte": VKONTAKTE_CHANNEL_KWARGS,
'youtube' : YOUTUBE_CHANNEL_KWARGS} "youtube": YOUTUBE_CHANNEL_KWARGS,
}