mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
formatted with black, added pre-commit hook, pegged typing_extensions package version to fix spaCy issue
This commit is contained in:
10
.github/workflows/black.yml
vendored
Normal file
10
.github/workflows/black.yml
vendored
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
name: Lint
|
||||||
|
|
||||||
|
on: [push]
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
lint:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v3
|
||||||
|
- uses: psf/black@stable
|
||||||
6
.pre-commit-config.yaml
Normal file
6
.pre-commit-config.yaml
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
repos:
|
||||||
|
- repo: https://github.com/psf/black
|
||||||
|
rev: 22.3.0
|
||||||
|
hooks:
|
||||||
|
- id: black
|
||||||
|
language_version: python3.9
|
||||||
3
Pipfile
3
Pipfile
@@ -24,6 +24,8 @@ ratelimit = "*"
|
|||||||
pytz = "*"
|
pytz = "*"
|
||||||
langdetect = "*"
|
langdetect = "*"
|
||||||
spacy = "==3.2.4"
|
spacy = "==3.2.4"
|
||||||
|
# Temporary fix for https://github.com/explosion/spaCy/issues/12659
|
||||||
|
typing_extensions = "==4.4.0"
|
||||||
ocrd-pyexiftool = "*"
|
ocrd-pyexiftool = "*"
|
||||||
filelock = "*"
|
filelock = "*"
|
||||||
telethon = "*"
|
telethon = "*"
|
||||||
@@ -38,6 +40,7 @@ pytest-metadata = "*"
|
|||||||
black = "*"
|
black = "*"
|
||||||
Sphinx = "*"
|
Sphinx = "*"
|
||||||
sphinx-rtd-theme = "*"
|
sphinx-rtd-theme = "*"
|
||||||
|
pre-commit = "*"
|
||||||
|
|
||||||
[requires]
|
[requires]
|
||||||
python_version = "3.9"
|
python_version = "3.9"
|
||||||
|
|||||||
2318
Pipfile.lock
generated
2318
Pipfile.lock
generated
File diff suppressed because it is too large
Load Diff
80
app.py
80
app.py
@@ -10,7 +10,6 @@ import sys
|
|||||||
from cisticola.base import mapper_registry
|
from cisticola.base import mapper_registry
|
||||||
from cisticola.scraper import (
|
from cisticola.scraper import (
|
||||||
ScraperController,
|
ScraperController,
|
||||||
# VkontakteScraper,
|
|
||||||
TelegramTelethonScraper,
|
TelegramTelethonScraper,
|
||||||
GettrScraper,
|
GettrScraper,
|
||||||
BitchuteScraper,
|
BitchuteScraper,
|
||||||
@@ -22,11 +21,11 @@ from cisticola.transformer import (
|
|||||||
GettrTransformer,
|
GettrTransformer,
|
||||||
RumbleTransformer,
|
RumbleTransformer,
|
||||||
BitchuteTransformer,
|
BitchuteTransformer,
|
||||||
# VkontakteTransformer,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
from sync_with_gsheet import sync_channels
|
from sync_with_gsheet import sync_channels
|
||||||
|
|
||||||
|
|
||||||
def get_db_session():
|
def get_db_session():
|
||||||
engine = create_engine(os.environ["DB"])
|
engine = create_engine(os.environ["DB"])
|
||||||
|
|
||||||
@@ -52,12 +51,14 @@ def get_scraper_controller(args):
|
|||||||
TelegramTelethonScraper(telethon_session_name=telethon_session_name),
|
TelegramTelethonScraper(telethon_session_name=telethon_session_name),
|
||||||
GettrScraper(),
|
GettrScraper(),
|
||||||
BitchuteScraper(),
|
BitchuteScraper(),
|
||||||
RumbleScraper()]
|
RumbleScraper(),
|
||||||
|
]
|
||||||
|
|
||||||
controller.register_scrapers(scrapers)
|
controller.register_scrapers(scrapers)
|
||||||
|
|
||||||
return controller
|
return controller
|
||||||
|
|
||||||
|
|
||||||
def get_transformer_controller(args):
|
def get_transformer_controller(args):
|
||||||
engine = create_engine(os.environ["DB"])
|
engine = create_engine(os.environ["DB"])
|
||||||
|
|
||||||
@@ -73,7 +74,8 @@ def get_transformer_controller(args):
|
|||||||
TelegramTelethonTransformer(telethon_session_name=telethon_session_name),
|
TelegramTelethonTransformer(telethon_session_name=telethon_session_name),
|
||||||
GettrTransformer(),
|
GettrTransformer(),
|
||||||
BitchuteTransformer(),
|
BitchuteTransformer(),
|
||||||
RumbleTransformer()]
|
RumbleTransformer(),
|
||||||
|
]
|
||||||
|
|
||||||
controller.register_transformers(transformers)
|
controller.register_transformers(transformers)
|
||||||
|
|
||||||
@@ -86,12 +88,14 @@ def scrape_channels(args):
|
|||||||
controller = get_scraper_controller(args)
|
controller = get_scraper_controller(args)
|
||||||
controller.scrape_all_channels()
|
controller.scrape_all_channels()
|
||||||
|
|
||||||
|
|
||||||
def scrape_channels_old(args):
|
def scrape_channels_old(args):
|
||||||
logger.info(f"Scraping old posts from channels")
|
logger.info(f"Scraping old posts from channels")
|
||||||
|
|
||||||
controller = get_scraper_controller(args)
|
controller = get_scraper_controller(args)
|
||||||
controller.scrape_all_channels(fetch_old=True)
|
controller.scrape_all_channels(fetch_old=True)
|
||||||
|
|
||||||
|
|
||||||
def scrape_channel_info(args):
|
def scrape_channel_info(args):
|
||||||
logger.info(f"Scraping channel info")
|
logger.info(f"Scraping channel info")
|
||||||
|
|
||||||
@@ -109,6 +113,7 @@ def archive_media(args):
|
|||||||
else:
|
else:
|
||||||
controller.archive_unarchived_media()
|
controller.archive_unarchived_media()
|
||||||
|
|
||||||
|
|
||||||
def transform(args):
|
def transform(args):
|
||||||
logger.info(f"Transforming untransformed posts")
|
logger.info(f"Transforming untransformed posts")
|
||||||
|
|
||||||
@@ -121,6 +126,7 @@ def transform(args):
|
|||||||
|
|
||||||
controller.transform_all_untransformed(min_date=min_date)
|
controller.transform_all_untransformed(min_date=min_date)
|
||||||
|
|
||||||
|
|
||||||
def transform_info(args):
|
def transform_info(args):
|
||||||
logger.info(f"Transforming untransformed channel info")
|
logger.info(f"Transforming untransformed channel info")
|
||||||
|
|
||||||
@@ -129,12 +135,14 @@ def transform_info(args):
|
|||||||
|
|
||||||
# sync_channels(args, get_db_session())
|
# sync_channels(args, get_db_session())
|
||||||
|
|
||||||
|
|
||||||
def transform_media(args):
|
def transform_media(args):
|
||||||
logger.info(f"Transforming untransformed channel media")
|
logger.info(f"Transforming untransformed channel media")
|
||||||
|
|
||||||
controller = get_transformer_controller(args)
|
controller = get_transformer_controller(args)
|
||||||
controller.transform_all_untransformed_media()
|
controller.transform_all_untransformed_media()
|
||||||
|
|
||||||
|
|
||||||
def init_db():
|
def init_db():
|
||||||
engine = create_engine(os.environ["DB"])
|
engine = create_engine(os.environ["DB"])
|
||||||
mapper_registry.metadata.create_all(bind=engine)
|
mapper_registry.metadata.create_all(bind=engine)
|
||||||
@@ -162,29 +170,77 @@ if __name__ == "__main__":
|
|||||||
if args.command == "init-db":
|
if args.command == "init-db":
|
||||||
init_db()
|
init_db()
|
||||||
elif args.command == "sync-channels":
|
elif args.command == "sync-channels":
|
||||||
logger.add("logs/sync-channels.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip")
|
logger.add(
|
||||||
|
"logs/sync-channels.log",
|
||||||
|
level="DEBUG",
|
||||||
|
rotation="100 MB",
|
||||||
|
retention="2 weeks",
|
||||||
|
compression="zip",
|
||||||
|
)
|
||||||
sync_channels(args, get_db_session())
|
sync_channels(args, get_db_session())
|
||||||
elif args.command == "scrape-channels":
|
elif args.command == "scrape-channels":
|
||||||
logger.add("logs/scrape-channels.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip")
|
logger.add(
|
||||||
|
"logs/scrape-channels.log",
|
||||||
|
level="DEBUG",
|
||||||
|
rotation="100 MB",
|
||||||
|
retention="2 weeks",
|
||||||
|
compression="zip",
|
||||||
|
)
|
||||||
scrape_channels(args)
|
scrape_channels(args)
|
||||||
elif args.command == "scrape-channels-old":
|
elif args.command == "scrape-channels-old":
|
||||||
logger.add("logs/scrape-channels-old.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip")
|
logger.add(
|
||||||
|
"logs/scrape-channels-old.log",
|
||||||
|
level="DEBUG",
|
||||||
|
rotation="100 MB",
|
||||||
|
retention="2 weeks",
|
||||||
|
compression="zip",
|
||||||
|
)
|
||||||
scrape_channels_old(args)
|
scrape_channels_old(args)
|
||||||
elif args.command == "archive-media":
|
elif args.command == "archive-media":
|
||||||
logger.add("logs/archive-media.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip")
|
logger.add(
|
||||||
|
"logs/archive-media.log",
|
||||||
|
level="DEBUG",
|
||||||
|
rotation="100 MB",
|
||||||
|
retention="2 weeks",
|
||||||
|
compression="zip",
|
||||||
|
)
|
||||||
archive_media(args)
|
archive_media(args)
|
||||||
elif args.command == "channel-info":
|
elif args.command == "channel-info":
|
||||||
logger.add("logs/channel-info.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip")
|
logger.add(
|
||||||
|
"logs/channel-info.log",
|
||||||
|
level="DEBUG",
|
||||||
|
rotation="100 MB",
|
||||||
|
retention="2 weeks",
|
||||||
|
compression="zip",
|
||||||
|
)
|
||||||
scrape_channel_info(args)
|
scrape_channel_info(args)
|
||||||
elif args.command == "transform":
|
elif args.command == "transform":
|
||||||
logger.add("logs/transform.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip")
|
logger.add(
|
||||||
|
"logs/transform.log",
|
||||||
|
level="DEBUG",
|
||||||
|
rotation="100 MB",
|
||||||
|
retention="2 weeks",
|
||||||
|
compression="zip",
|
||||||
|
)
|
||||||
logger.add("logs/transform_trace.log", level="TRACE", retention="7 days")
|
logger.add("logs/transform_trace.log", level="TRACE", retention="7 days")
|
||||||
transform(args)
|
transform(args)
|
||||||
elif args.command == "transform-info":
|
elif args.command == "transform-info":
|
||||||
logger.add("logs/transform-info.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip")
|
logger.add(
|
||||||
|
"logs/transform-info.log",
|
||||||
|
level="DEBUG",
|
||||||
|
rotation="100 MB",
|
||||||
|
retention="2 weeks",
|
||||||
|
compression="zip",
|
||||||
|
)
|
||||||
transform_info(args)
|
transform_info(args)
|
||||||
elif args.command == "transform-media":
|
elif args.command == "transform-media":
|
||||||
logger.add("logs/transform-media.log", level="DEBUG", rotation="100 MB", retention="2 weeks", compression="zip")
|
logger.add(
|
||||||
|
"logs/transform-media.log",
|
||||||
|
level="DEBUG",
|
||||||
|
rotation="100 MB",
|
||||||
|
retention="2 weeks",
|
||||||
|
compression="zip",
|
||||||
|
)
|
||||||
transform_media(args)
|
transform_media(args)
|
||||||
else:
|
else:
|
||||||
logger.error(f"Unrecognized command {args.command}")
|
logger.error(f"Unrecognized command {args.command}")
|
||||||
|
|||||||
@@ -6,7 +6,17 @@ import json
|
|||||||
import io
|
import io
|
||||||
|
|
||||||
from sqlalchemy.orm import registry
|
from sqlalchemy.orm import registry
|
||||||
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean, Index
|
from sqlalchemy import (
|
||||||
|
Table,
|
||||||
|
Column,
|
||||||
|
Integer,
|
||||||
|
String,
|
||||||
|
JSON,
|
||||||
|
DateTime,
|
||||||
|
ForeignKey,
|
||||||
|
Boolean,
|
||||||
|
Index,
|
||||||
|
)
|
||||||
from sqlalchemy.dialects.postgresql import JSONB
|
from sqlalchemy.dialects.postgresql import JSONB
|
||||||
import pytesseract
|
import pytesseract
|
||||||
import PIL
|
import PIL
|
||||||
@@ -22,10 +32,10 @@ from .utils import make_request
|
|||||||
# Raise the decompression bomb pixel limit (the check itself stays enabled)
|
# Raise the decompression bomb pixel limit (the check itself stays enabled)
|
||||||
PIL.Image.MAX_IMAGE_PIXELS = 1024 * 1024 * 256
|
PIL.Image.MAX_IMAGE_PIXELS = 1024 * 1024 * 256
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class ScraperResult:
|
class ScraperResult:
|
||||||
"""Minimally processed set of information from a scraper about one post
|
"""Minimally processed set of information from a scraper about one post"""
|
||||||
"""
|
|
||||||
|
|
||||||
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
|
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
|
||||||
scraper: str
|
scraper: str
|
||||||
@@ -54,10 +64,10 @@ class ScraperResult:
|
|||||||
#: What date was the media archived? (None if not archived)
|
#: What date was the media archived? (None if not archived)
|
||||||
media_archived: datetime
|
media_archived: datetime
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Channel:
|
class Channel:
|
||||||
"""Information about a specific channel to be scraped.
|
"""Information about a specific channel to be scraped."""
|
||||||
"""
|
|
||||||
|
|
||||||
#: Name of channel (different from username because it can be non-unique and contain emojis), e.g. ``"T🕊Редакция Президент Гордон🕊"``.
|
#: Name of channel (different from username because it can be non-unique and contain emojis), e.g. ``"T🕊Редакция Президент Гордон🕊"``.
|
||||||
name: str
|
name: str
|
||||||
@@ -98,10 +108,10 @@ class Channel:
|
|||||||
def hydrate(self):
|
def hydrate(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class RawChannelInfo:
|
class RawChannelInfo:
|
||||||
"""Minimally processed set of information from a scraper about one channel
|
"""Minimally processed set of information from a scraper about one channel"""
|
||||||
"""
|
|
||||||
|
|
||||||
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
|
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
|
||||||
scraper: str
|
scraper: str
|
||||||
@@ -118,10 +128,10 @@ class RawChannelInfo:
|
|||||||
#: Datetime (relative to UTC) that the scraped channel information was archived at.
|
#: Datetime (relative to UTC) that the scraped channel information was archived at.
|
||||||
date_archived: datetime
|
date_archived: datetime
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class ChannelInfo:
|
class ChannelInfo:
|
||||||
"""A processed set of information about a channel.
|
"""A processed set of information about a channel."""
|
||||||
"""
|
|
||||||
|
|
||||||
# Foreign key from the raw_channel_info table
|
# Foreign key from the raw_channel_info table
|
||||||
raw_channel_info_id: int
|
raw_channel_info_id: int
|
||||||
@@ -161,13 +171,15 @@ class ChannelInfo:
|
|||||||
def hydrate(self):
|
def hydrate(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
nlp_en = spacy.load('en_core_web_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
|
|
||||||
nlp_de = spacy.load('de_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
|
nlp_en = spacy.load("en_core_web_sm", disable=["parser", "tok2vec", "attribute_ruler"])
|
||||||
nlp_it = spacy.load('it_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
|
nlp_de = spacy.load("de_core_news_sm", disable=["parser", "tok2vec", "attribute_ruler"])
|
||||||
nlp_fr = spacy.load('fr_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
|
nlp_it = spacy.load("it_core_news_sm", disable=["parser", "tok2vec", "attribute_ruler"])
|
||||||
nlp_ru = spacy.load('ru_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
|
nlp_fr = spacy.load("fr_core_news_sm", disable=["parser", "tok2vec", "attribute_ruler"])
|
||||||
nlp_nl = spacy.load('nl_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
|
nlp_ru = spacy.load("ru_core_news_sm", disable=["parser", "tok2vec", "attribute_ruler"])
|
||||||
nlp_xx = spacy.load('xx_ent_wiki_sm')
|
nlp_nl = spacy.load("nl_core_news_sm", disable=["parser", "tok2vec", "attribute_ruler"])
|
||||||
|
nlp_xx = spacy.load("xx_ent_wiki_sm")
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Post:
|
class Post:
|
||||||
@@ -258,7 +270,9 @@ class Post:
|
|||||||
URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/
?(?!@)))"""
|
URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/
?(?!@)))"""
|
||||||
|
|
||||||
# replace is here in order to prevent catastrophic backtracking
|
# replace is here in order to prevent catastrophic backtracking
|
||||||
urls = re.findall(URL_REGEX, self.content.replace("::::::::", "").replace("........", ""))
|
urls = re.findall(
|
||||||
|
URL_REGEX, self.content.replace("::::::::", "").replace("........", "")
|
||||||
|
)
|
||||||
self.outlinks += urls
|
self.outlinks += urls
|
||||||
self.outlinks = list(set(outlink for outlink in self.outlinks))
|
self.outlinks = list(set(outlink for outlink in self.outlinks))
|
||||||
|
|
||||||
@@ -269,10 +283,12 @@ class Post:
|
|||||||
self.hashtags = list(set(hashtag.lower() for hashtag in self.hashtags))
|
self.hashtags = list(set(hashtag.lower() for hashtag in self.hashtags))
|
||||||
|
|
||||||
# regex patterns for finding crypto addresses
|
# regex patterns for finding crypto addresses
|
||||||
BTC_REGEX = r'\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b'
|
BTC_REGEX = r"\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b"
|
||||||
ETHER_REGEX = r'(0x[a-fA-F0-9]{40})'
|
ETHER_REGEX = r"(0x[a-fA-F0-9]{40})"
|
||||||
|
|
||||||
self.cryptocurrency_addresses = [m[0] for m in re.findall(BTC_REGEX, self.content)] + re.findall(ETHER_REGEX, self.content)
|
self.cryptocurrency_addresses = [
|
||||||
|
m[0] for m in re.findall(BTC_REGEX, self.content)
|
||||||
|
] + re.findall(ETHER_REGEX, self.content)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.detected_language = detect(self.content)
|
self.detected_language = detect(self.content)
|
||||||
@@ -288,17 +304,17 @@ class Post:
|
|||||||
def hydrate_spacy(self):
|
def hydrate_spacy(self):
|
||||||
ner_only = False
|
ner_only = False
|
||||||
|
|
||||||
if self.detected_language == 'en':
|
if self.detected_language == "en":
|
||||||
nlp = nlp_en
|
nlp = nlp_en
|
||||||
elif self.detected_language == 'de':
|
elif self.detected_language == "de":
|
||||||
nlp = nlp_de
|
nlp = nlp_de
|
||||||
elif self.detected_language == 'it':
|
elif self.detected_language == "it":
|
||||||
nlp = nlp_it
|
nlp = nlp_it
|
||||||
elif self.detected_language == 'fr':
|
elif self.detected_language == "fr":
|
||||||
nlp = nlp_fr
|
nlp = nlp_fr
|
||||||
elif self.detected_language == 'ru':
|
elif self.detected_language == "ru":
|
||||||
nlp = nlp_ru
|
nlp = nlp_ru
|
||||||
elif self.detected_language == 'nl':
|
elif self.detected_language == "nl":
|
||||||
nlp = nlp_nl
|
nlp = nlp_nl
|
||||||
else:
|
else:
|
||||||
nlp = nlp_xx
|
nlp = nlp_xx
|
||||||
@@ -307,19 +323,36 @@ class Post:
|
|||||||
doc = nlp(self.content)
|
doc = nlp(self.content)
|
||||||
|
|
||||||
if not ner_only:
|
if not ner_only:
|
||||||
punctuation = ['?',':','!',',','.',';','|','(',')','--','#','=','+']
|
punctuation = [
|
||||||
tokens = [t.lemma_ for t in doc if not t.is_stop and t.lemma_ not in punctuation]
|
"?",
|
||||||
self.normalized_content = ' '.join(tokens)
|
":",
|
||||||
|
"!",
|
||||||
|
",",
|
||||||
|
".",
|
||||||
|
";",
|
||||||
|
"|",
|
||||||
|
"(",
|
||||||
|
")",
|
||||||
|
"--",
|
||||||
|
"#",
|
||||||
|
"=",
|
||||||
|
"+",
|
||||||
|
]
|
||||||
|
tokens = [
|
||||||
|
t.lemma_ for t in doc if not t.is_stop and t.lemma_ not in punctuation
|
||||||
|
]
|
||||||
|
self.normalized_content = " ".join(tokens)
|
||||||
else:
|
else:
|
||||||
self.normalized_content = ''
|
self.normalized_content = ""
|
||||||
|
|
||||||
self.named_entities = [{'text': ent.text, 'type': ent.label_} for ent in doc.ents]
|
self.named_entities = [
|
||||||
|
{"text": ent.text, "type": ent.label_} for ent in doc.ents
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Media:
|
class Media:
|
||||||
"""Base class for organizing information about a media file.
|
"""Base class for organizing information about a media file."""
|
||||||
"""
|
|
||||||
|
|
||||||
#: ID number of the media's corresponding scraped post in the ``raw_posts`` table.
|
#: ID number of the media's corresponding scraped post in the ``raw_posts`` table.
|
||||||
raw_id: int
|
raw_id: int
|
||||||
@@ -355,16 +388,14 @@ class Media:
|
|||||||
exif: str = None
|
exif: str = None
|
||||||
|
|
||||||
def get_blob(self):
|
def get_blob(self):
|
||||||
"""Download media file as bytes blob.
|
"""Download media file as bytes blob."""
|
||||||
"""
|
|
||||||
|
|
||||||
blob = make_request(self.url)
|
blob = make_request(self.url)
|
||||||
return blob.content
|
return blob.content
|
||||||
|
|
||||||
@logger.catch
|
@logger.catch
|
||||||
def hydrate(self, blob=None):
|
def hydrate(self, blob=None):
|
||||||
"""Download media file as bytes blob and extract data from content.
|
"""Download media file as bytes blob and extract data from content."""
|
||||||
"""
|
|
||||||
|
|
||||||
if blob is None:
|
if blob is None:
|
||||||
blob = self.get_blob()
|
blob = self.get_blob()
|
||||||
@@ -372,8 +403,7 @@ class Media:
|
|||||||
self.hydrate_exif(blob)
|
self.hydrate_exif(blob)
|
||||||
|
|
||||||
def hydrate_exif(self, blob):
|
def hydrate_exif(self, blob):
|
||||||
"""Extract Exif metadata from bytes blob.
|
"""Extract Exif metadata from bytes blob."""
|
||||||
"""
|
|
||||||
|
|
||||||
with tempfile.NamedTemporaryFile() as temp_file:
|
with tempfile.NamedTemporaryFile() as temp_file:
|
||||||
temp_file.write(blob)
|
temp_file.write(blob)
|
||||||
@@ -382,10 +412,10 @@ class Media:
|
|||||||
exif = et.get_metadata(temp_file.name)
|
exif = et.get_metadata(temp_file.name)
|
||||||
self.exif = json.dumps(exif)
|
self.exif = json.dumps(exif)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Image(Media):
|
class Image(Media):
|
||||||
"""Class for organizing information about an image file.
|
"""Class for organizing information about an image file."""
|
||||||
"""
|
|
||||||
|
|
||||||
#: Extracted OCR content from image
|
#: Extracted OCR content from image
|
||||||
ocr: str = None
|
ocr: str = None
|
||||||
@@ -403,135 +433,152 @@ class Image(Media):
|
|||||||
self.hydrate_ocr(blob)
|
self.hydrate_ocr(blob)
|
||||||
|
|
||||||
def hydrate_ocr(self, blob):
|
def hydrate_ocr(self, blob):
|
||||||
"""Extract OCR (optical character recognition) data from image bytes blob.
|
"""Extract OCR (optical character recognition) data from image bytes blob."""
|
||||||
"""
|
|
||||||
|
|
||||||
image = PIL.Image.open(io.BytesIO(blob))
|
image = PIL.Image.open(io.BytesIO(blob))
|
||||||
self.ocr = pytesseract.image_to_string(image)
|
self.ocr = pytesseract.image_to_string(image)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Video(Media):
|
class Video(Media):
|
||||||
"""Class for organizing information about a video file.
|
"""Class for organizing information about a video file."""
|
||||||
"""
|
|
||||||
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Audio(Media):
|
class Audio(Media):
|
||||||
"""Class for organizing information about an audio file.
|
"""Class for organizing information about an audio file."""
|
||||||
"""
|
|
||||||
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
mapper_registry = registry()
|
mapper_registry = registry()
|
||||||
|
|
||||||
raw_posts_table = Table('raw_posts', mapper_registry.metadata,
|
raw_posts_table = Table(
|
||||||
Column('id', Integer, primary_key=True,
|
"raw_posts",
|
||||||
autoincrement=True),
|
mapper_registry.metadata,
|
||||||
Column('scraper', String),
|
Column("id", Integer, primary_key=True, autoincrement=True),
|
||||||
Column('platform', String),
|
Column("scraper", String),
|
||||||
Column('channel', Integer, ForeignKey('channels.id'), index=True),
|
Column("platform", String),
|
||||||
Column('platform_id', String, index=True),
|
Column("channel", Integer, ForeignKey("channels.id"), index=True),
|
||||||
Column('date', DateTime, index=True),
|
Column("platform_id", String, index=True),
|
||||||
Column('raw_data', String),
|
Column("date", DateTime, index=True),
|
||||||
Column('date_archived', DateTime, index=True),
|
Column("raw_data", String),
|
||||||
Column('archived_urls', JSON),
|
Column("date_archived", DateTime, index=True),
|
||||||
Column('media_archived', DateTime, index=True))
|
Column("archived_urls", JSON),
|
||||||
|
Column("media_archived", DateTime, index=True),
|
||||||
raw_channel_info_table = Table('raw_channel_info', mapper_registry.metadata,
|
|
||||||
Column('id', Integer, primary_key=True),
|
|
||||||
Column('scraper', String),
|
|
||||||
Column('platform', String),
|
|
||||||
Column('channel', Integer, ForeignKey('channels.id'), index=True),
|
|
||||||
Column('raw_data', String),
|
|
||||||
Column('date_archived', DateTime, index=True))
|
|
||||||
|
|
||||||
channel_info_table = Table('channel_info', mapper_registry.metadata,
|
|
||||||
Column('id', Integer, primary_key=True, autoincrement=True),
|
|
||||||
Column('raw_channel_info_id', Integer, ForeignKey('raw_channel_info.id'), index=True),
|
|
||||||
Column('channel', Integer, ForeignKey('channels.id'), index=True),
|
|
||||||
Column('platform_id', String),
|
|
||||||
Column('scraper', String),
|
|
||||||
Column('transformer', String),
|
|
||||||
Column('platform', String),
|
|
||||||
Column('screenname', String),
|
|
||||||
Column('name', String),
|
|
||||||
Column('description', String),
|
|
||||||
Column('description_url', String),
|
|
||||||
Column('description_location', String),
|
|
||||||
Column('followers', Integer),
|
|
||||||
Column('following', Integer),
|
|
||||||
Column('verified', Boolean),
|
|
||||||
Column('date_created', DateTime),
|
|
||||||
Column('date_archived', DateTime, index=True),
|
|
||||||
Column('date_transformed', DateTime, index=True),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
channel_table = Table('channels', mapper_registry.metadata,
|
raw_channel_info_table = Table(
|
||||||
Column('id', Integer, primary_key=True, autoincrement=True),
|
"raw_channel_info",
|
||||||
Column('name', String),
|
mapper_registry.metadata,
|
||||||
Column('platform_id', String),
|
Column("id", Integer, primary_key=True),
|
||||||
Column('category', String),
|
Column("scraper", String),
|
||||||
Column('platform', String),
|
Column("platform", String),
|
||||||
Column('url', String),
|
Column("channel", Integer, ForeignKey("channels.id"), index=True),
|
||||||
Column('screenname', String),
|
Column("raw_data", String),
|
||||||
Column('country', JSONB, index = True),
|
Column("date_archived", DateTime, index=True),
|
||||||
Column('influencer', String),
|
|
||||||
Column('public', Boolean),
|
|
||||||
Column('chat', Boolean),
|
|
||||||
Column('notes', String),
|
|
||||||
Column('source', String)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
post_table = Table('posts', mapper_registry.metadata,
|
channel_info_table = Table(
|
||||||
Column('id', Integer, primary_key=True,
|
"channel_info",
|
||||||
autoincrement=True),
|
mapper_registry.metadata,
|
||||||
Column('raw_id', Integer, ForeignKey('raw_posts.id'), index=True),
|
Column("id", Integer, primary_key=True, autoincrement=True),
|
||||||
Column('platform_id', String, index=True),
|
Column(
|
||||||
Column('scraper', String),
|
"raw_channel_info_id", Integer, ForeignKey("raw_channel_info.id"), index=True
|
||||||
Column('transformer', String),
|
),
|
||||||
Column('platform', String),
|
Column("channel", Integer, ForeignKey("channels.id"), index=True),
|
||||||
Column('channel', Integer, ForeignKey('channels.id'), index=True),
|
Column("platform_id", String),
|
||||||
Column('date', DateTime, index=True),
|
Column("scraper", String),
|
||||||
Column('date_archived', DateTime, index=True),
|
Column("transformer", String),
|
||||||
Column('date_transformed', DateTime, index=True),
|
Column("platform", String),
|
||||||
Column('url', String),
|
Column("screenname", String),
|
||||||
Column('author_id', String),
|
Column("name", String),
|
||||||
Column('author_username', String),
|
Column("description", String),
|
||||||
Column('content', String),
|
Column("description_url", String),
|
||||||
Column('forwarded_from', Integer, ForeignKey('channels.id'), index=True),
|
Column("description_location", String),
|
||||||
Column('reply_to', Integer, ForeignKey('posts.id'), index=True),
|
Column("followers", Integer),
|
||||||
Column('named_entities', JSON),
|
Column("following", Integer),
|
||||||
Column('cryptocurrency_addresses', JSON),
|
Column("verified", Boolean),
|
||||||
Column('hashtags', JSON),
|
Column("date_created", DateTime),
|
||||||
Column('outlinks', JSON),
|
Column("date_archived", DateTime, index=True),
|
||||||
Column('mentions', JSON),
|
Column("date_transformed", DateTime, index=True),
|
||||||
Column('likes', Integer),
|
|
||||||
Column('forwards', Integer),
|
|
||||||
Column('views', Integer),
|
|
||||||
Column('video_title', String),
|
|
||||||
Column('video_duration', Integer),
|
|
||||||
Column('detected_language', String, index = True),
|
|
||||||
Column('normalized_content', String)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
posts_forwarded_from_channel_index = Index('posts_channel_forwarded_from_idx', post_table.c.channel, post_table.c.forwarded_from)
|
channel_table = Table(
|
||||||
|
"channels",
|
||||||
|
mapper_registry.metadata,
|
||||||
|
Column("id", Integer, primary_key=True, autoincrement=True),
|
||||||
|
Column("name", String),
|
||||||
|
Column("platform_id", String),
|
||||||
|
Column("category", String),
|
||||||
|
Column("platform", String),
|
||||||
|
Column("url", String),
|
||||||
|
Column("screenname", String),
|
||||||
|
Column("country", JSONB, index=True),
|
||||||
|
Column("influencer", String),
|
||||||
|
Column("public", Boolean),
|
||||||
|
Column("chat", Boolean),
|
||||||
|
Column("notes", String),
|
||||||
|
Column("source", String),
|
||||||
|
)
|
||||||
|
|
||||||
media_table = Table('media', mapper_registry.metadata,
|
post_table = Table(
|
||||||
Column('id', Integer, primary_key=True,
|
"posts",
|
||||||
autoincrement=True),
|
mapper_registry.metadata,
|
||||||
Column('type', String),
|
Column("id", Integer, primary_key=True, autoincrement=True),
|
||||||
Column('raw_id', Integer, ForeignKey('raw_posts.id'), index=True),
|
Column("raw_id", Integer, ForeignKey("raw_posts.id"), index=True),
|
||||||
Column('post', Integer, ForeignKey('posts.id'), index=True),
|
Column("platform_id", String, index=True),
|
||||||
Column('url', String),
|
Column("scraper", String),
|
||||||
Column('original_url', String),
|
Column("transformer", String),
|
||||||
Column('exif', String),
|
Column("platform", String),
|
||||||
Column('ocr', String),
|
Column("channel", Integer, ForeignKey("channels.id"), index=True),
|
||||||
Column('date', DateTime, index=True),
|
Column("date", DateTime, index=True),
|
||||||
Column('date_archived', DateTime, index=True),
|
Column("date_archived", DateTime, index=True),
|
||||||
Column('date_transformed', DateTime, index=True),
|
Column("date_transformed", DateTime, index=True),
|
||||||
Column('scraper', String),
|
Column("url", String),
|
||||||
Column('transformer', String)
|
Column("author_id", String),
|
||||||
|
Column("author_username", String),
|
||||||
|
Column("content", String),
|
||||||
|
Column("forwarded_from", Integer, ForeignKey("channels.id"), index=True),
|
||||||
|
Column("reply_to", Integer, ForeignKey("posts.id"), index=True),
|
||||||
|
Column("named_entities", JSON),
|
||||||
|
Column("cryptocurrency_addresses", JSON),
|
||||||
|
Column("hashtags", JSON),
|
||||||
|
Column("outlinks", JSON),
|
||||||
|
Column("mentions", JSON),
|
||||||
|
Column("likes", Integer),
|
||||||
|
Column("forwards", Integer),
|
||||||
|
Column("views", Integer),
|
||||||
|
Column("video_title", String),
|
||||||
|
Column("video_duration", Integer),
|
||||||
|
Column("detected_language", String, index=True),
|
||||||
|
Column("normalized_content", String),
|
||||||
|
)
|
||||||
|
|
||||||
|
posts_forwarded_from_channel_index = Index(
|
||||||
|
"posts_channel_forwarded_from_idx",
|
||||||
|
post_table.c.channel,
|
||||||
|
post_table.c.forwarded_from,
|
||||||
|
)
|
||||||
|
|
||||||
|
media_table = Table(
|
||||||
|
"media",
|
||||||
|
mapper_registry.metadata,
|
||||||
|
Column("id", Integer, primary_key=True, autoincrement=True),
|
||||||
|
Column("type", String),
|
||||||
|
Column("raw_id", Integer, ForeignKey("raw_posts.id"), index=True),
|
||||||
|
Column("post", Integer, ForeignKey("posts.id"), index=True),
|
||||||
|
Column("url", String),
|
||||||
|
Column("original_url", String),
|
||||||
|
Column("exif", String),
|
||||||
|
Column("ocr", String),
|
||||||
|
Column("date", DateTime, index=True),
|
||||||
|
Column("date_archived", DateTime, index=True),
|
||||||
|
Column("date_transformed", DateTime, index=True),
|
||||||
|
Column("scraper", String),
|
||||||
|
Column("transformer", String),
|
||||||
)
|
)
|
||||||
|
|
||||||
mapper_registry.map_imperatively(Post, post_table)
|
mapper_registry.map_imperatively(Post, post_table)
|
||||||
@@ -539,7 +586,27 @@ mapper_registry.map_imperatively(Channel, channel_table)
|
|||||||
mapper_registry.map_imperatively(ScraperResult, raw_posts_table)
|
mapper_registry.map_imperatively(ScraperResult, raw_posts_table)
|
||||||
mapper_registry.map_imperatively(RawChannelInfo, raw_channel_info_table)
|
mapper_registry.map_imperatively(RawChannelInfo, raw_channel_info_table)
|
||||||
mapper_registry.map_imperatively(ChannelInfo, channel_info_table)
|
mapper_registry.map_imperatively(ChannelInfo, channel_info_table)
|
||||||
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
|
mapper_registry.map_imperatively(
|
||||||
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
|
Media, media_table, polymorphic_on="type", polymorphic_identity="media"
|
||||||
mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')
|
)
|
||||||
mapper_registry.map_imperatively(Audio, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='audio')
|
mapper_registry.map_imperatively(
|
||||||
|
Image,
|
||||||
|
media_table,
|
||||||
|
inherits=Media,
|
||||||
|
polymorphic_on="type",
|
||||||
|
polymorphic_identity="image",
|
||||||
|
)
|
||||||
|
mapper_registry.map_imperatively(
|
||||||
|
Video,
|
||||||
|
media_table,
|
||||||
|
inherits=Media,
|
||||||
|
polymorphic_on="type",
|
||||||
|
polymorphic_identity="video",
|
||||||
|
)
|
||||||
|
mapper_registry.map_imperatively(
|
||||||
|
Audio,
|
||||||
|
media_table,
|
||||||
|
inherits=Media,
|
||||||
|
polymorphic_on="type",
|
||||||
|
polymorphic_identity="audio",
|
||||||
|
)
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ from sqlalchemy import nullsfirst
|
|||||||
from cisticola.base import Channel, RawChannelInfo, ScraperResult, mapper_registry
|
from cisticola.base import Channel, RawChannelInfo, ScraperResult, mapper_registry
|
||||||
from cisticola.utils import make_request
|
from cisticola.utils import make_request
|
||||||
|
|
||||||
|
|
||||||
class Scraper:
|
class Scraper:
|
||||||
"""Base class for defining platform-specific scrapers for scraping all posts
|
"""Base class for defining platform-specific scrapers for scraping all posts
|
||||||
from a given channel on that specific platform.
|
from a given channel on that specific platform.
|
||||||
@@ -25,23 +26,26 @@ class Scraper:
|
|||||||
|
|
||||||
__version__ = "Scraper 0.0.0"
|
__version__ = "Scraper 0.0.0"
|
||||||
|
|
||||||
cookiestring = os.environ["YOUTUBE_COOKIESTRING"].replace(r'\n', '\n').replace(r'\t', '\t')
|
cookiestring = (
|
||||||
cookiefilename = 'cookiefile.txt'
|
os.environ["YOUTUBE_COOKIESTRING"].replace(r"\n", "\n").replace(r"\t", "\t")
|
||||||
|
)
|
||||||
|
cookiefilename = "cookiefile.txt"
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
|
||||||
# Initialize client to transfer files to the storage archive
|
# Initialize client to transfer files to the storage archive
|
||||||
self.s3_client = boto3.client(
|
self.s3_client = boto3.client(
|
||||||
service_name='s3',
|
service_name="s3",
|
||||||
region_name=os.environ['DO_SPACES_REGION'],
|
region_name=os.environ["DO_SPACES_REGION"],
|
||||||
endpoint_url=f'https://{os.environ["DO_SPACES_REGION"]}.digitaloceanspaces.com',
|
endpoint_url=f'https://{os.environ["DO_SPACES_REGION"]}.digitaloceanspaces.com',
|
||||||
aws_access_key_id=os.environ['DO_SPACES_KEY'],
|
aws_access_key_id=os.environ["DO_SPACES_KEY"],
|
||||||
aws_secret_access_key=os.environ['DO_SPACES_SECRET'])
|
aws_secret_access_key=os.environ["DO_SPACES_SECRET"],
|
||||||
|
)
|
||||||
|
|
||||||
# Define request headers (necessary to bypass scraping protection
|
# Define request headers (necessary to bypass scraping protection
|
||||||
# for several platform scrapers)
|
# for several platform scrapers)
|
||||||
self.headers = {
|
self.headers = {
|
||||||
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'}
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"
|
||||||
|
}
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return self.__version__
|
return self.__version__
|
||||||
@@ -83,7 +87,7 @@ class Scraper:
|
|||||||
the original post URL and the media's Content-Type.
|
the original post URL and the media's Content-Type.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
key = urlparse(url).path.split('/')[-1]
|
key = urlparse(url).path.split("/")[-1]
|
||||||
return key
|
return key
|
||||||
|
|
||||||
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
||||||
@@ -111,7 +115,7 @@ class Scraper:
|
|||||||
r = make_request(url, headers=self.headers)
|
r = make_request(url, headers=self.headers)
|
||||||
|
|
||||||
blob = r.content
|
blob = r.content
|
||||||
content_type = r.headers.get('Content-Type')
|
content_type = r.headers.get("Content-Type")
|
||||||
|
|
||||||
if key is None:
|
if key is None:
|
||||||
key = self.url_to_key(url, content_type)
|
key = self.url_to_key(url, content_type)
|
||||||
@@ -141,17 +145,16 @@ class Scraper:
|
|||||||
Unique identifier for the media file.
|
Unique identifier for the media file.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
content_type = 'video/mp4'
|
content_type = "video/mp4"
|
||||||
ext = '.' + content_type.split('/')[-1]
|
ext = "." + content_type.split("/")[-1]
|
||||||
|
|
||||||
with tempfile.NamedTemporaryFile(suffix=ext) as temp_file:
|
with tempfile.NamedTemporaryFile(suffix=ext) as temp_file:
|
||||||
|
|
||||||
(
|
(
|
||||||
ffmpeg
|
ffmpeg.input(url)
|
||||||
.input(url)
|
.output(temp_file.name, vcodec="copy")
|
||||||
.output(temp_file.name, vcodec='copy')
|
.global_args("-loglevel", "error")
|
||||||
.global_args('-loglevel', 'error')
|
.run(overwrite_output=True)
|
||||||
.run(overwrite_output=True))
|
)
|
||||||
|
|
||||||
temp_file.seek(0)
|
temp_file.seek(0)
|
||||||
blob = temp_file.read()
|
blob = temp_file.read()
|
||||||
@@ -184,11 +187,11 @@ class Scraper:
|
|||||||
Unique identifier for the media file.
|
Unique identifier for the media file.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
content_type = 'video/mp4'
|
content_type = "video/mp4"
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as temp_dir:
|
with tempfile.TemporaryDirectory() as temp_dir:
|
||||||
cookiefile = Path(temp_dir) / self.cookiefilename
|
cookiefile = Path(temp_dir) / self.cookiefilename
|
||||||
with open(cookiefile, 'w') as f:
|
with open(cookiefile, "w") as f:
|
||||||
f.write(self.cookiestring)
|
f.write(self.cookiestring)
|
||||||
|
|
||||||
ydl_opts = {
|
ydl_opts = {
|
||||||
@@ -199,14 +202,16 @@ class Scraper:
|
|||||||
"quiet": True,
|
"quiet": True,
|
||||||
"verbose": False,
|
"verbose": False,
|
||||||
"retries": 5,
|
"retries": 5,
|
||||||
"cookiefile": cookiefile}
|
"cookiefile": cookiefile,
|
||||||
|
}
|
||||||
|
|
||||||
ydl = yt_dlp.YoutubeDL(ydl_opts)
|
ydl = yt_dlp.YoutubeDL(ydl_opts)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
meta = ydl.extract_info(
|
meta = ydl.extract_info(
|
||||||
url,
|
url,
|
||||||
download=True,)
|
download=True,
|
||||||
|
)
|
||||||
except yt_dlp.utils.DownloadError as e:
|
except yt_dlp.utils.DownloadError as e:
|
||||||
raise e
|
raise e
|
||||||
else:
|
else:
|
||||||
@@ -240,12 +245,16 @@ class Scraper:
|
|||||||
URL specifying the file on the storage archive.
|
URL specifying the file on the storage archive.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
filename = self.__version__.replace(' ', '_') + '/' + key
|
filename = self.__version__.replace(" ", "_") + "/" + key
|
||||||
|
|
||||||
self.s3_client.upload_fileobj(BytesIO(blob), Bucket=os.environ[
|
self.s3_client.upload_fileobj(
|
||||||
'DO_BUCKET'], Key=filename, ExtraArgs={'ACL': 'public-read', 'ContentType': content_type})
|
BytesIO(blob),
|
||||||
|
Bucket=os.environ["DO_BUCKET"],
|
||||||
|
Key=filename,
|
||||||
|
ExtraArgs={"ACL": "public-read", "ContentType": content_type},
|
||||||
|
)
|
||||||
|
|
||||||
archived_url = os.environ['DO_URL'] + '/' + filename
|
archived_url = os.environ["DO_URL"] + "/" + filename
|
||||||
|
|
||||||
return archived_url
|
return archived_url
|
||||||
|
|
||||||
@@ -292,7 +301,9 @@ class Scraper:
|
|||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
@logger.catch
|
@logger.catch
|
||||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
def get_posts(
|
||||||
|
self, channel: Channel, since: ScraperResult = None
|
||||||
|
) -> Generator[ScraperResult, None, None]:
|
||||||
"""Scrape all posts from the specified Channel.
|
"""Scrape all posts from the specified Channel.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
@@ -342,8 +353,7 @@ class ScraperController:
|
|||||||
self.scrapers.extend(scrapers)
|
self.scrapers.extend(scrapers)
|
||||||
|
|
||||||
def remove_all_scrapers(self):
|
def remove_all_scrapers(self):
|
||||||
"""Reset the ScraperController so that it doesn't control any scrapers
|
"""Reset the ScraperController so that it doesn't control any scrapers"""
|
||||||
"""
|
|
||||||
self.scrapers = []
|
self.scrapers = []
|
||||||
|
|
||||||
def scrape_all_channels(self, fetch_old: bool = False):
|
def scrape_all_channels(self, fetch_old: bool = False):
|
||||||
@@ -362,15 +372,23 @@ class ScraperController:
|
|||||||
session = self.session()
|
session = self.session()
|
||||||
|
|
||||||
# TODO there should be a better/more generic way of selecting scrapeable channels
|
# TODO there should be a better/more generic way of selecting scrapeable channels
|
||||||
channels = session.query(Channel).filter((Channel.source=='researcher')|(Channel.source=='snowball_it')|(Channel.source=='snowball_complete')|(Channel.source=='linked_channel')).all()
|
channels = (
|
||||||
|
session.query(Channel)
|
||||||
|
.filter(
|
||||||
|
(Channel.source == "researcher")
|
||||||
|
| (Channel.source == "snowball_it")
|
||||||
|
| (Channel.source == "snowball_complete")
|
||||||
|
| (Channel.source == "linked_channel")
|
||||||
|
)
|
||||||
|
.all()
|
||||||
|
)
|
||||||
|
|
||||||
session.close()
|
session.close()
|
||||||
|
|
||||||
return self.scrape_channels(channels, fetch_old=fetch_old)
|
return self.scrape_channels(channels, fetch_old=fetch_old)
|
||||||
|
|
||||||
def scrape_all_channel_info(self):
|
def scrape_all_channel_info(self):
|
||||||
"""Scrape profile information from all channels in the database.
|
"""Scrape profile information from all channels in the database."""
|
||||||
"""
|
|
||||||
if self.session is None:
|
if self.session is None:
|
||||||
logger.error("No DB session")
|
logger.error("No DB session")
|
||||||
return
|
return
|
||||||
@@ -379,11 +397,28 @@ class ScraperController:
|
|||||||
|
|
||||||
# Because of rate limiting, we may not be able to succesfully scrape info for all of these channels.
|
# Because of rate limiting, we may not be able to succesfully scrape info for all of these channels.
|
||||||
# This will sort the channels by the least recently scraped.
|
# This will sort the channels by the least recently scraped.
|
||||||
most_recently_archived = session.query(func.max(RawChannelInfo.date_archived).label("date"), RawChannelInfo.channel.label("channel")).group_by(RawChannelInfo.channel).subquery()
|
most_recently_archived = (
|
||||||
channels = session.query(Channel).\
|
session.query(
|
||||||
filter((Channel.source=='researcher')|(Channel.source=='snowball_it')|(Channel.source=='snowball_complete')|(Channel.source=='linked_channel')).\
|
func.max(RawChannelInfo.date_archived).label("date"),
|
||||||
outerjoin(most_recently_archived, Channel.id == most_recently_archived.c.channel).\
|
RawChannelInfo.channel.label("channel"),
|
||||||
order_by(nullsfirst(most_recently_archived.c.date.asc())).all()
|
)
|
||||||
|
.group_by(RawChannelInfo.channel)
|
||||||
|
.subquery()
|
||||||
|
)
|
||||||
|
channels = (
|
||||||
|
session.query(Channel)
|
||||||
|
.filter(
|
||||||
|
(Channel.source == "researcher")
|
||||||
|
| (Channel.source == "snowball_it")
|
||||||
|
| (Channel.source == "snowball_complete")
|
||||||
|
| (Channel.source == "linked_channel")
|
||||||
|
)
|
||||||
|
.outerjoin(
|
||||||
|
most_recently_archived, Channel.id == most_recently_archived.c.channel
|
||||||
|
)
|
||||||
|
.order_by(nullsfirst(most_recently_archived.c.date.asc()))
|
||||||
|
.all()
|
||||||
|
)
|
||||||
|
|
||||||
session.close()
|
session.close()
|
||||||
return self.scrape_channel_info(channels)
|
return self.scrape_channel_info(channels)
|
||||||
@@ -408,12 +443,17 @@ class ScraperController:
|
|||||||
|
|
||||||
# If any channels are not already in the database, add them
|
# If any channels are not already in the database, add them
|
||||||
for channel in channels:
|
for channel in channels:
|
||||||
|
|
||||||
platform_id = None
|
platform_id = None
|
||||||
if channel.platform_id not in (None, ''):
|
if channel.platform_id not in (None, ""):
|
||||||
platform_id = channel.platform_id
|
platform_id = channel.platform_id
|
||||||
|
|
||||||
channel_in_db = session.query(Channel).filter_by(platform_id=platform_id, platform=channel.platform, url=channel.url).first()
|
channel_in_db = (
|
||||||
|
session.query(Channel)
|
||||||
|
.filter_by(
|
||||||
|
platform_id=platform_id, platform=channel.platform, url=channel.url
|
||||||
|
)
|
||||||
|
.first()
|
||||||
|
)
|
||||||
|
|
||||||
if not channel_in_db:
|
if not channel_in_db:
|
||||||
logger.debug(f"{channel} does not exist in database, adding")
|
logger.debug(f"{channel} does not exist in database, adding")
|
||||||
@@ -429,13 +469,17 @@ class ScraperController:
|
|||||||
handled = True
|
handled = True
|
||||||
added = 0
|
added = 0
|
||||||
|
|
||||||
if fetch_old and channel.platform == 'Telegram':
|
if fetch_old and channel.platform == "Telegram":
|
||||||
# get oldest post (currently only for Telegram)
|
# get oldest post (currently only for Telegram)
|
||||||
# TODO fix this so that it doesn't have an explicit check on channel.platform (should be generic)
|
# TODO fix this so that it doesn't have an explicit check on channel.platform (should be generic)
|
||||||
# TODO implement until on all scrapers
|
# TODO implement until on all scrapers
|
||||||
rows = session.query(ScraperResult).where(
|
rows = (
|
||||||
ScraperResult.channel == channel.id).order_by(
|
session.query(ScraperResult)
|
||||||
ScraperResult.date.asc(), ScraperResult.id.desc()).limit(10).all()
|
.where(ScraperResult.channel == channel.id)
|
||||||
|
.order_by(ScraperResult.date.asc(), ScraperResult.id.desc())
|
||||||
|
.limit(10)
|
||||||
|
.all()
|
||||||
|
)
|
||||||
|
|
||||||
if len(rows) > 0:
|
if len(rows) > 0:
|
||||||
until = rows[0]
|
until = rows[0]
|
||||||
@@ -449,9 +493,13 @@ class ScraperController:
|
|||||||
# Note: a "bug" in Postgres can cause this query to hang for a really long time
|
# Note: a "bug" in Postgres can cause this query to hang for a really long time
|
||||||
# when searching for a single row, hence the limit(10).all() when we really just need
|
# when searching for a single row, hence the limit(10).all() when we really just need
|
||||||
# the first row.
|
# the first row.
|
||||||
rows = session.query(ScraperResult).where(
|
rows = (
|
||||||
ScraperResult.channel == channel.id).order_by(
|
session.query(ScraperResult)
|
||||||
ScraperResult.date.desc(), ScraperResult.id.asc()).limit(10).all()
|
.where(ScraperResult.channel == channel.id)
|
||||||
|
.order_by(ScraperResult.date.desc(), ScraperResult.id.asc())
|
||||||
|
.limit(10)
|
||||||
|
.all()
|
||||||
|
)
|
||||||
|
|
||||||
if len(rows) > 0:
|
if len(rows) > 0:
|
||||||
since = rows[0]
|
since = rows[0]
|
||||||
@@ -466,8 +514,7 @@ class ScraperController:
|
|||||||
added += 1
|
added += 1
|
||||||
|
|
||||||
session.commit()
|
session.commit()
|
||||||
logger.info(
|
logger.info(f"{scraper} found {added} new posts from {channel}")
|
||||||
f"{scraper} found {added} new posts from {channel}")
|
|
||||||
break
|
break
|
||||||
|
|
||||||
if not handled:
|
if not handled:
|
||||||
@@ -489,11 +536,24 @@ class ScraperController:
|
|||||||
if session is None:
|
if session is None:
|
||||||
session = self.session()
|
session = self.session()
|
||||||
if chronological:
|
if chronological:
|
||||||
posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).where(ScraperResult.id >= 0).order_by(ScraperResult.date.desc()).limit(5000).all()
|
posts = (
|
||||||
|
session.query(ScraperResult)
|
||||||
|
.where(ScraperResult.media_archived == None)
|
||||||
|
.where(ScraperResult.id >= 0)
|
||||||
|
.order_by(ScraperResult.date.desc())
|
||||||
|
.limit(5000)
|
||||||
|
.all()
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
# this query is really slow (~2.5 minutes) because of the shuffle. shuffling is so that multiple media archivers could work
|
# this query is really slow (~2.5 minutes) because of the shuffle. shuffling is so that multiple media archivers could work
|
||||||
# simultaneously with low risk of collision (at least while the number of unarchived items is very large)
|
# simultaneously with low risk of collision (at least while the number of unarchived items is very large)
|
||||||
posts = session.query(ScraperResult).where(ScraperResult.media_archived == None).order_by(func.random()).limit(5000).all()
|
posts = (
|
||||||
|
session.query(ScraperResult)
|
||||||
|
.where(ScraperResult.media_archived == None)
|
||||||
|
.order_by(func.random())
|
||||||
|
.limit(5000)
|
||||||
|
.all()
|
||||||
|
)
|
||||||
|
|
||||||
logger.info(f"Found {len(posts)} posts without media. Archiving now")
|
logger.info(f"Found {len(posts)} posts without media. Archiving now")
|
||||||
|
|
||||||
@@ -502,13 +562,23 @@ class ScraperController:
|
|||||||
|
|
||||||
for scraper in self.scrapers:
|
for scraper in self.scrapers:
|
||||||
# compare major versions
|
# compare major versions
|
||||||
if post.scraper is not None and scraper.__version__.split('.')[0] == post.scraper.split('.')[0]:
|
if (
|
||||||
|
post.scraper is not None
|
||||||
|
and scraper.__version__.split(".")[0] == post.scraper.split(".")[0]
|
||||||
|
):
|
||||||
handled = True
|
handled = True
|
||||||
logger.debug(f"{scraper} is archiving media for ID {post.id}")
|
logger.debug(f"{scraper} is archiving media for ID {post.id}")
|
||||||
post = scraper.archive_files(post)
|
post = scraper.archive_files(post)
|
||||||
|
|
||||||
if post:
|
if post:
|
||||||
session.query(ScraperResult).where(ScraperResult.id == post.id).update({'archived_urls': post.archived_urls, 'media_archived': post.media_archived})
|
session.query(ScraperResult).where(
|
||||||
|
ScraperResult.id == post.id
|
||||||
|
).update(
|
||||||
|
{
|
||||||
|
"archived_urls": post.archived_urls,
|
||||||
|
"media_archived": post.media_archived,
|
||||||
|
}
|
||||||
|
)
|
||||||
session.commit()
|
session.commit()
|
||||||
|
|
||||||
break
|
break
|
||||||
@@ -535,7 +605,9 @@ class ScraperController:
|
|||||||
session = self.session()
|
session = self.session()
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
self.archive_unarchived_media_batch(self, session=session, chronological=chronological)
|
self.archive_unarchived_media_batch(
|
||||||
|
self, session=session, chronological=chronological
|
||||||
|
)
|
||||||
|
|
||||||
@logger.catch(reraise=True)
|
@logger.catch(reraise=True)
|
||||||
def scrape_channel_info(self, channels: List[Channel]):
|
def scrape_channel_info(self, channels: List[Channel]):
|
||||||
@@ -571,8 +643,7 @@ class ScraperController:
|
|||||||
session.add(info)
|
session.add(info)
|
||||||
|
|
||||||
session.commit()
|
session.commit()
|
||||||
logger.info(
|
logger.info(f"{scraper} found {info}")
|
||||||
f"{scraper} found {info}")
|
|
||||||
break
|
break
|
||||||
except ChannelDoesNotExistError:
|
except ChannelDoesNotExistError:
|
||||||
logger.warning(f"ChannelDoesNotExist {channel}")
|
logger.warning(f"ChannelDoesNotExist {channel}")
|
||||||
@@ -599,13 +670,13 @@ class ScraperController:
|
|||||||
self.session.configure(bind=self.engine)
|
self.session.configure(bind=self.engine)
|
||||||
|
|
||||||
def reset_db(self):
|
def reset_db(self):
|
||||||
"""Drop all data from the connected SQLAlchemy database.
|
"""Drop all data from the connected SQLAlchemy database."""
|
||||||
"""
|
|
||||||
|
|
||||||
close_all_sessions()
|
close_all_sessions()
|
||||||
|
|
||||||
mapper_registry.metadata.drop_all(bind=self.engine)
|
mapper_registry.metadata.drop_all(bind=self.engine)
|
||||||
self.connect_to_db(self.engine)
|
self.connect_to_db(self.engine)
|
||||||
|
|
||||||
|
|
||||||
class ChannelDoesNotExistError(Exception):
|
class ChannelDoesNotExistError(Exception):
|
||||||
"""The specified channel does not exist or has been deleted."""
|
"""The specified channel does not exist or has been deleted."""
|
||||||
|
|||||||
@@ -14,105 +14,129 @@ from loguru import logger
|
|||||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||||
from cisticola.scraper.base import Scraper
|
from cisticola.scraper.base import Scraper
|
||||||
|
|
||||||
|
|
||||||
class BitchuteScraper(Scraper):
|
class BitchuteScraper(Scraper):
|
||||||
"""An implementation of a Scraper for Bitchute, using classes from the 4cat
|
"""An implementation of a Scraper for Bitchute, using classes from the 4cat
|
||||||
library"""
|
library"""
|
||||||
|
|
||||||
__version__ = "BitchuteScraper 0.0.1"
|
__version__ = "BitchuteScraper 0.0.1"
|
||||||
|
|
||||||
def get_username_from_url(self, url):
|
def get_username_from_url(self, url):
|
||||||
username = url.split('bitchute.com/channel/')[-1].strip('/')
|
username = url.split("bitchute.com/channel/")[-1].strip("/")
|
||||||
|
|
||||||
return username
|
return username
|
||||||
|
|
||||||
@logger.catch
|
@logger.catch
|
||||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
def get_posts(
|
||||||
|
self, channel: Channel, since: ScraperResult = None
|
||||||
|
) -> Generator[ScraperResult, None, None]:
|
||||||
session = requests.Session()
|
session = requests.Session()
|
||||||
session.headers.update(self.headers)
|
session.headers.update(self.headers)
|
||||||
request = session.get("https://www.bitchute.com/search")
|
request = session.get("https://www.bitchute.com/search")
|
||||||
csrftoken = BeautifulSoup(request.text, 'html.parser').findAll(
|
csrftoken = (
|
||||||
"input", {"name": "csrfmiddlewaretoken"})[0].get("value")
|
BeautifulSoup(request.text, "html.parser")
|
||||||
|
.findAll("input", {"name": "csrfmiddlewaretoken"})[0]
|
||||||
|
.get("value")
|
||||||
|
)
|
||||||
time.sleep(0.25)
|
time.sleep(0.25)
|
||||||
|
|
||||||
detail = 'comments'
|
detail = "comments"
|
||||||
|
|
||||||
username = self.get_username_from_url(channel.url)
|
username = self.get_username_from_url(channel.url)
|
||||||
scraper = get_videos_user(session, username, csrftoken, detail)
|
scraper = get_videos_user(session, username, csrftoken, detail)
|
||||||
|
|
||||||
for post in scraper:
|
for post in scraper:
|
||||||
|
if (
|
||||||
if since is not None and datetime.fromtimestamp(post['timestamp']) <= since.date:
|
since is not None
|
||||||
|
and datetime.fromtimestamp(post["timestamp"]) <= since.date
|
||||||
|
):
|
||||||
break
|
break
|
||||||
|
|
||||||
archived_urls = {}
|
archived_urls = {}
|
||||||
|
|
||||||
if 'video_url' in post:
|
if "video_url" in post:
|
||||||
url = post['video_url']
|
url = post["video_url"]
|
||||||
archived_urls[url] = None
|
archived_urls[url] = None
|
||||||
|
|
||||||
yield ScraperResult(
|
yield ScraperResult(
|
||||||
scraper=self.__version__,
|
scraper=self.__version__,
|
||||||
platform="Bitchute",
|
platform="Bitchute",
|
||||||
channel=channel.id,
|
channel=channel.id,
|
||||||
platform_id=post['id'],
|
platform_id=post["id"],
|
||||||
date=datetime.fromtimestamp(post['timestamp']),
|
date=datetime.fromtimestamp(post["timestamp"]),
|
||||||
date_archived=datetime.now(timezone.utc),
|
date_archived=datetime.now(timezone.utc),
|
||||||
raw_data=json.dumps(post),
|
raw_data=json.dumps(post),
|
||||||
archived_urls=archived_urls,
|
archived_urls=archived_urls,
|
||||||
media_archived=None)
|
media_archived=None,
|
||||||
|
)
|
||||||
|
|
||||||
def can_handle(self, channel):
|
def can_handle(self, channel):
|
||||||
if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
|
if (
|
||||||
|
channel.platform == "Bitchute"
|
||||||
|
and self.get_username_from_url(channel.url) is not None
|
||||||
|
):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@logger.catch
|
@logger.catch
|
||||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||||
|
|
||||||
base_url = channel.url
|
base_url = channel.url
|
||||||
|
|
||||||
session = requests.session()
|
session = requests.session()
|
||||||
response = session.get(base_url)
|
response = session.get(base_url)
|
||||||
soup = BeautifulSoup(response.content, 'html.parser')
|
soup = BeautifulSoup(response.content, "html.parser")
|
||||||
|
|
||||||
canonical_url = soup.find('link', {'id' : 'canonical'})['href']
|
canonical_url = soup.find("link", {"id": "canonical"})["href"]
|
||||||
csrftoken = session.cookies['csrftoken']
|
csrftoken = session.cookies["csrftoken"]
|
||||||
csrfmiddlewaretoken = soup.find('input', {'name' : 'csrfmiddlewaretoken'})['value']
|
csrfmiddlewaretoken = soup.find("input", {"name": "csrfmiddlewaretoken"})[
|
||||||
|
"value"
|
||||||
|
]
|
||||||
|
|
||||||
about_soup = soup.find('div', {'id' : 'channel-about'})
|
about_soup = soup.find("div", {"id": "channel-about"})
|
||||||
info_list = about_soup.find('div', {'class' : 'channel-about-details'}).find_all('p')
|
info_list = about_soup.find("div", {"class": "channel-about-details"}).find_all(
|
||||||
description_soup = about_soup.find('div', {'id' : 'channel-description'})
|
"p"
|
||||||
|
)
|
||||||
|
description_soup = about_soup.find("div", {"id": "channel-description"})
|
||||||
|
|
||||||
headers = {'Referer': base_url}
|
headers = {"Referer": base_url}
|
||||||
data = {
|
data = {"csrftoken": csrftoken, "csrfmiddlewaretoken": csrfmiddlewaretoken}
|
||||||
'csrftoken': csrftoken,
|
|
||||||
'csrfmiddlewaretoken': csrfmiddlewaretoken}
|
|
||||||
|
|
||||||
response = session.post(canonical_url + 'counts/', data = data, headers = headers)
|
response = session.post(canonical_url + "counts/", data=data, headers=headers)
|
||||||
counts = json.loads(response.text)
|
counts = json.loads(response.text)
|
||||||
|
|
||||||
owner_soup = soup.find('p', {'class' : 'owner'})
|
owner_soup = soup.find("p", {"class": "owner"})
|
||||||
if owner_soup.text == '[email\xa0protected]':
|
if owner_soup.text == "[email\xa0protected]":
|
||||||
owner_name = decode_cfemail(owner_soup.find('span', {'class': "__cf_email__"})['data-cfemail'])
|
owner_name = decode_cfemail(
|
||||||
|
owner_soup.find("span", {"class": "__cf_email__"})["data-cfemail"]
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
owner_name = owner_soup.text
|
owner_name = owner_soup.text
|
||||||
|
|
||||||
profile = {
|
profile = {
|
||||||
'description' : description_soup.text.strip(),
|
"description": description_soup.text.strip(),
|
||||||
'description_links' : [a['href'] for a in description_soup.find_all('a', href = True)],
|
"description_links": [
|
||||||
'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')),
|
a["href"] for a in description_soup.find_all("a", href=True)
|
||||||
'videos' : int(info_list[1].text.split('videos')[0].strip()),
|
],
|
||||||
'owner_url' : soup.find('p', {'class' : 'owner'}).find('a', href = True)['href'],
|
"created": re.sub(
|
||||||
'owner_name' : owner_name,
|
r"\s", " ", info_list[0].text.split("Created")[1].strip(". ")
|
||||||
'image' : about_soup.find('img', {'alt' : 'Channel Image'}).get('data-src'),
|
),
|
||||||
'subscribers': counts['subscriber_count'],
|
"videos": int(info_list[1].text.split("videos")[0].strip()),
|
||||||
'views': int(counts['about_view_count'].split(' ')[0])}
|
"owner_url": soup.find("p", {"class": "owner"}).find("a", href=True)[
|
||||||
|
"href"
|
||||||
|
],
|
||||||
|
"owner_name": owner_name,
|
||||||
|
"image": about_soup.find("img", {"alt": "Channel Image"}).get("data-src"),
|
||||||
|
"subscribers": counts["subscriber_count"],
|
||||||
|
"views": int(counts["about_view_count"].split(" ")[0]),
|
||||||
|
}
|
||||||
|
|
||||||
|
return RawChannelInfo(
|
||||||
return RawChannelInfo(scraper=self.__version__,
|
scraper=self.__version__,
|
||||||
platform=channel.platform,
|
platform=channel.platform,
|
||||||
channel=channel.id,
|
channel=channel.id,
|
||||||
raw_data=json.dumps(profile, default=str),
|
raw_data=json.dumps(profile, default=str),
|
||||||
date_archived=datetime.now(timezone.utc))
|
date_archived=datetime.now(timezone.utc),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def strip_tags(html, convert_newlines=True):
|
def strip_tags(html, convert_newlines=True):
|
||||||
r"""
|
r"""
|
||||||
@@ -149,6 +173,7 @@ def strip_tags(html, convert_newlines=True):
|
|||||||
stripper.feed(html)
|
stripper.feed(html)
|
||||||
return stripper.get_data()
|
return stripper.get_data()
|
||||||
|
|
||||||
|
|
||||||
def request_from_bitchute(session, method, url, headers=None, data=None):
|
def request_from_bitchute(session, method, url, headers=None, data=None):
|
||||||
"""
|
"""
|
||||||
Request something via the BitChute API (or non-API)
|
Request something via the BitChute API (or non-API)
|
||||||
@@ -176,7 +201,10 @@ def request_from_bitchute(session, method, url, headers=None, data=None):
|
|||||||
raise NotImplemented()
|
raise NotImplemented()
|
||||||
|
|
||||||
if request.status_code >= 300:
|
if request.status_code >= 300:
|
||||||
raise ValueError("Response %i from BitChute for URL %s, need to retry" % (request.status_code, url))
|
raise ValueError(
|
||||||
|
"Response %i from BitChute for URL %s, need to retry"
|
||||||
|
% (request.status_code, url)
|
||||||
|
)
|
||||||
|
|
||||||
response = request.json()
|
response = request.json()
|
||||||
return response
|
return response
|
||||||
@@ -193,6 +221,7 @@ def request_from_bitchute(session, method, url, headers=None, data=None):
|
|||||||
|
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
def append_details(video, detail):
|
def append_details(video, detail):
|
||||||
"""
|
"""
|
||||||
Append extra metadata to video data
|
Append extra metadata to video data
|
||||||
@@ -214,7 +243,7 @@ def append_details(video, detail):
|
|||||||
"comments": "",
|
"comments": "",
|
||||||
"hashtags": "",
|
"hashtags": "",
|
||||||
"parent_id": "",
|
"parent_id": "",
|
||||||
"video_url": ""
|
"video_url": "",
|
||||||
}
|
}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -223,15 +252,23 @@ def append_details(video, detail):
|
|||||||
video_session = requests.session()
|
video_session = requests.session()
|
||||||
video_page = video_session.get(video["url"])
|
video_page = video_session.get(video["url"])
|
||||||
|
|
||||||
if "<h1 class=\"page-title\">Video Restricted</h1>" in video_page.text or \
|
if (
|
||||||
"<h1 class=\"page-title\">Video Blocked</h1>" in video_page.text or \
|
'<h1 class="page-title">Video Restricted</h1>' in video_page.text
|
||||||
"<h1 class=\"page-title\">Channel Blocked</h1>" in video_page.text or \
|
or '<h1 class="page-title">Video Blocked</h1>' in video_page.text
|
||||||
"<h1 class=\"page-title\">Channel Restricted</h1>" in video_page.text:
|
or '<h1 class="page-title">Channel Blocked</h1>' in video_page.text
|
||||||
if "This video is unavailable as the contents have been deemed potentially illegal" in video_page.text:
|
or '<h1 class="page-title">Channel Restricted</h1>' in video_page.text
|
||||||
|
):
|
||||||
|
if (
|
||||||
|
"This video is unavailable as the contents have been deemed potentially illegal"
|
||||||
|
in video_page.text
|
||||||
|
):
|
||||||
video["category"] = "moderated-illegal"
|
video["category"] = "moderated-illegal"
|
||||||
return (video, [])
|
return (video, [])
|
||||||
|
|
||||||
elif "Viewing of this video is restricted, as it has been marked as Not Safe For Life" in video_page.text:
|
elif (
|
||||||
|
"Viewing of this video is restricted, as it has been marked as Not Safe For Life"
|
||||||
|
in video_page.text
|
||||||
|
):
|
||||||
video["category"] = "moderated-nsfl"
|
video["category"] = "moderated-nsfl"
|
||||||
return (video, [])
|
return (video, [])
|
||||||
|
|
||||||
@@ -255,39 +292,47 @@ def append_details(video, detail):
|
|||||||
video["category"] = "moderated-other"
|
video["category"] = "moderated-other"
|
||||||
return (video, [])
|
return (video, [])
|
||||||
|
|
||||||
elif "<iframe class=\"rumble\"" in video_page.text:
|
elif '<iframe class="rumble"' in video_page.text:
|
||||||
# some videos are actually embeds from rumble?
|
# some videos are actually embeds from rumble?
|
||||||
# these are iframes, so at the moment we cannot simply extract
|
# these are iframes, so at the moment we cannot simply extract
|
||||||
# their info from the page, so we skip them. In the future we
|
# their info from the page, so we skip them. In the future we
|
||||||
# could add an extra request to get the relevant info, but so
|
# could add an extra request to get the relevant info, but so
|
||||||
# far the only examples I've seen are actually 'video not found'
|
# far the only examples I've seen are actually 'video not found'
|
||||||
video = {
|
video = {**video, "category": "error-embed-from-rumble"}
|
||||||
**video,
|
|
||||||
"category": "error-embed-from-rumble"
|
|
||||||
}
|
|
||||||
return (video, [])
|
return (video, [])
|
||||||
|
|
||||||
elif video_page.status_code != 200:
|
elif video_page.status_code != 200:
|
||||||
video = {
|
video = {**video, "category": "error-%i" % video_page.status_code}
|
||||||
**video,
|
|
||||||
"category": "error-%i" % video_page.status_code
|
|
||||||
}
|
|
||||||
return (video, [])
|
return (video, [])
|
||||||
|
|
||||||
soup = BeautifulSoup(video_page.text, 'html.parser')
|
soup = BeautifulSoup(video_page.text, "html.parser")
|
||||||
video_csfrtoken = soup.findAll("input", {"name": "csrfmiddlewaretoken"})[0].get("value")
|
video_csfrtoken = soup.findAll("input", {"name": "csrfmiddlewaretoken"})[0].get(
|
||||||
|
"value"
|
||||||
|
)
|
||||||
|
|
||||||
video["video_url"] = soup.select_one("video#player source").get("src")
|
video["video_url"] = soup.select_one("video#player source").get("src")
|
||||||
video["thumbnail_image"] = soup.select_one("video#player").get("poster")
|
video["thumbnail_image"] = soup.select_one("video#player").get("poster")
|
||||||
video["subject"] = soup.select_one("h1#video-title").text
|
video["subject"] = soup.select_one("h1#video-title").text
|
||||||
video["author_id"] = soup.select_one("p.owner a").get("href").split("/")[2]
|
video["author_id"] = soup.select_one("p.owner a").get("href").split("/")[2]
|
||||||
video["author"] = soup.select_one("div.channel-banner p.name a").get("href").split("/")[2]
|
video["author"] = (
|
||||||
video["body"] = soup.select_one("div#video-description").encode_contents().decode("utf-8").strip()
|
soup.select_one("div.channel-banner p.name a").get("href").split("/")[2]
|
||||||
|
)
|
||||||
|
video["body"] = (
|
||||||
|
soup.select_one("div#video-description")
|
||||||
|
.encode_contents()
|
||||||
|
.decode("utf-8")
|
||||||
|
.strip()
|
||||||
|
)
|
||||||
|
|
||||||
# we need *two more requests* to get the comment count and like/dislike counts
|
# we need *two more requests* to get the comment count and like/dislike counts
|
||||||
# this seems to be because bitchute uses a third-party comment widget
|
# this seems to be because bitchute uses a third-party comment widget
|
||||||
video_session.headers = {'Referer': video["url"], 'Origin': video["url"]}
|
video_session.headers = {"Referer": video["url"], "Origin": video["url"]}
|
||||||
counts = request_from_bitchute(video_session, "POST", "https://www.bitchute.com/video/%s/counts/" % video["id"], data={"csrfmiddlewaretoken": video_csfrtoken})
|
counts = request_from_bitchute(
|
||||||
|
video_session,
|
||||||
|
"POST",
|
||||||
|
"https://www.bitchute.com/video/%s/counts/" % video["id"],
|
||||||
|
data={"csrfmiddlewaretoken": video_csfrtoken},
|
||||||
|
)
|
||||||
|
|
||||||
if detail == "comments":
|
if detail == "comments":
|
||||||
# if comments are also to be scraped, this is anothe request to make, which returns
|
# if comments are also to be scraped, this is anothe request to make, which returns
|
||||||
@@ -308,7 +353,12 @@ def append_details(video, detail):
|
|||||||
comment_count = 0
|
comment_count = 0
|
||||||
url = comment_script.split("'")[1]
|
url = comment_script.split("'")[1]
|
||||||
comment_csrf = comment_script.split("'")[3]
|
comment_csrf = comment_script.split("'")[3]
|
||||||
comments_data = request_from_bitchute(video_session, "POST", url + "/api/get_comments/", data={"cf_auth": comment_csrf, "commentCount": 0})
|
comments_data = request_from_bitchute(
|
||||||
|
video_session,
|
||||||
|
"POST",
|
||||||
|
url + "/api/get_comments/",
|
||||||
|
data={"cf_auth": comment_csrf, "commentCount": 0},
|
||||||
|
)
|
||||||
|
|
||||||
for comment in comments_data:
|
for comment in comments_data:
|
||||||
comment_count += 1
|
comment_count += 1
|
||||||
@@ -318,14 +368,17 @@ def append_details(video, detail):
|
|||||||
else:
|
else:
|
||||||
thumbnail_image = ""
|
thumbnail_image = ""
|
||||||
|
|
||||||
comments.append({
|
comments.append(
|
||||||
|
{
|
||||||
"id": comment["id"],
|
"id": comment["id"],
|
||||||
"thread_id": video["id"],
|
"thread_id": video["id"],
|
||||||
"subject": "",
|
"subject": "",
|
||||||
"body": comment["content"],
|
"body": comment["content"],
|
||||||
"author": comment["fullname"],
|
"author": comment["fullname"],
|
||||||
"author_id": comment["creator"],
|
"author_id": comment["creator"],
|
||||||
"timestamp": int(dateparser.parse(comment["created"]).timestamp()),
|
"timestamp": int(
|
||||||
|
dateparser.parse(comment["created"]).timestamp()
|
||||||
|
),
|
||||||
"url": "",
|
"url": "",
|
||||||
"views": "",
|
"views": "",
|
||||||
"length": "",
|
"length": "",
|
||||||
@@ -336,16 +389,24 @@ def append_details(video, detail):
|
|||||||
"dislikes": "",
|
"dislikes": "",
|
||||||
"channel_subscribers": "",
|
"channel_subscribers": "",
|
||||||
"comments": "",
|
"comments": "",
|
||||||
"parent_id": comment.get("parent", "") if "parent" in comment else video["id"],
|
"parent_id": comment.get("parent", "")
|
||||||
})
|
if "parent" in comment
|
||||||
|
else video["id"],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# if we don't need the full comments, we still need another request to get the *amount*
|
# if we don't need the full comments, we still need another request to get the *amount*
|
||||||
# of comments,
|
# of comments,
|
||||||
comment_count = request_from_bitchute(video_session, "POST",
|
comment_count = request_from_bitchute(
|
||||||
|
video_session,
|
||||||
|
"POST",
|
||||||
"https://commentfreely.bitchute.com/api/get_comment_count/",
|
"https://commentfreely.bitchute.com/api/get_comment_count/",
|
||||||
data={"csrfmiddlewaretoken": video_csfrtoken,
|
data={
|
||||||
"cf_thread": "bc_" + video["id"]})["commentCount"]
|
"csrfmiddlewaretoken": video_csfrtoken,
|
||||||
|
"cf_thread": "bc_" + video["id"],
|
||||||
|
},
|
||||||
|
)["commentCount"]
|
||||||
|
|
||||||
except RuntimeError as e:
|
except RuntimeError as e:
|
||||||
# we wrap this in one big try-catch because doing it for each request separarely is tedious
|
# we wrap this in one big try-catch because doing it for each request separarely is tedious
|
||||||
@@ -358,7 +419,10 @@ def append_details(video, detail):
|
|||||||
# exact day it was uploaded
|
# exact day it was uploaded
|
||||||
try:
|
try:
|
||||||
published = dateparser.parse(
|
published = dateparser.parse(
|
||||||
soup.find(class_="video-publish-date").text.split("published at")[1].strip()[:-1])
|
soup.find(class_="video-publish-date")
|
||||||
|
.text.split("published at")[1]
|
||||||
|
.strip()[:-1]
|
||||||
|
)
|
||||||
except AttributeError as e:
|
except AttributeError as e:
|
||||||
# publication date not on page?
|
# publication date not on page?
|
||||||
published = None
|
published = None
|
||||||
@@ -373,7 +437,7 @@ def append_details(video, detail):
|
|||||||
"comments": comment_count,
|
"comments": comment_count,
|
||||||
"parent_id": "",
|
"parent_id": "",
|
||||||
"hashtags": ",".join([tag.text for tag in soup.select("#video-hashtags li a")]),
|
"hashtags": ",".join([tag.text for tag in soup.select("#video-hashtags li a")]),
|
||||||
"views": counts["view_count"]
|
"views": counts["view_count"],
|
||||||
}
|
}
|
||||||
|
|
||||||
if published:
|
if published:
|
||||||
@@ -383,6 +447,7 @@ def append_details(video, detail):
|
|||||||
time.sleep(0.25)
|
time.sleep(0.25)
|
||||||
return (video, comments)
|
return (video, comments)
|
||||||
|
|
||||||
|
|
||||||
def get_videos_user(session, user, csrftoken, detail):
|
def get_videos_user(session, user, csrftoken, detail):
|
||||||
"""
|
"""
|
||||||
Scrape videos for given BitChute user
|
Scrape videos for given BitChute user
|
||||||
@@ -402,23 +467,27 @@ def get_videos_user(session, user, csrftoken, detail):
|
|||||||
url = base_url + "extend/"
|
url = base_url + "extend/"
|
||||||
|
|
||||||
container = session.get(base_url)
|
container = session.get(base_url)
|
||||||
container_soup = BeautifulSoup(container.text, 'html.parser')
|
container_soup = BeautifulSoup(container.text, "html.parser")
|
||||||
headers = {'Referer': base_url, 'Origin': "https://www.bitchute.com/"}
|
headers = {"Referer": base_url, "Origin": "https://www.bitchute.com/"}
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
|
post_data = {
|
||||||
|
"csrfmiddlewaretoken": csrftoken,
|
||||||
|
"name": "",
|
||||||
|
"offset": str(offset),
|
||||||
|
}
|
||||||
|
|
||||||
post_data = {"csrfmiddlewaretoken": csrftoken, "name": "", "offset": str(offset)}
|
response = request_from_bitchute(
|
||||||
|
session, "POST", url, headers=headers, data=post_data
|
||||||
|
)
|
||||||
|
|
||||||
response = request_from_bitchute(session, "POST", url, headers=headers, data=post_data)
|
soup = BeautifulSoup(response["html"], "html.parser")
|
||||||
|
|
||||||
soup = BeautifulSoup(response["html"], 'html.parser')
|
|
||||||
videos = soup.select(".channel-videos-container")
|
videos = soup.select(".channel-videos-container")
|
||||||
comments = []
|
comments = []
|
||||||
|
|
||||||
if len(videos) == 0 or num_items >= max_items:
|
if len(videos) == 0 or num_items >= max_items:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
||||||
for video_element in videos:
|
for video_element in videos:
|
||||||
if num_items >= max_items:
|
if num_items >= max_items:
|
||||||
break
|
break
|
||||||
@@ -432,16 +501,26 @@ def get_videos_user(session, user, csrftoken, detail):
|
|||||||
"id": link["href"].split("/")[-2],
|
"id": link["href"].split("/")[-2],
|
||||||
"thread_id": link["href"].split("/")[-2],
|
"thread_id": link["href"].split("/")[-2],
|
||||||
"subject": link.text,
|
"subject": link.text,
|
||||||
"body": strip_tags(video_element.select_one(".channel-videos-text").text),
|
"body": strip_tags(
|
||||||
|
video_element.select_one(".channel-videos-text").text
|
||||||
|
),
|
||||||
"author": container_soup.select_one(".details .name a").text,
|
"author": container_soup.select_one(".details .name a").text,
|
||||||
"author_id": container_soup.select_one(".details .name a")["href"].split("/")[2],
|
"author_id": container_soup.select_one(".details .name a")[
|
||||||
|
"href"
|
||||||
|
].split("/")[2],
|
||||||
"timestamp": int(
|
"timestamp": int(
|
||||||
dateparser.parse(
|
dateparser.parse(
|
||||||
video_element.select_one(".channel-videos-details.text-right.hidden-xs").text).timestamp()),
|
video_element.select_one(
|
||||||
|
".channel-videos-details.text-right.hidden-xs"
|
||||||
|
).text
|
||||||
|
).timestamp()
|
||||||
|
),
|
||||||
"url": "https://www.bitchute.com" + link["href"],
|
"url": "https://www.bitchute.com" + link["href"],
|
||||||
"views": video_element.select_one(".video-views").text.strip(),
|
"views": video_element.select_one(".video-views").text.strip(),
|
||||||
"length": video_element.select_one(".video-duration").text.strip(),
|
"length": video_element.select_one(".video-duration").text.strip(),
|
||||||
"thumbnail_image": video_element.select_one(".channel-videos-image img")["src"],
|
"thumbnail_image": video_element.select_one(
|
||||||
|
".channel-videos-image img"
|
||||||
|
)["src"],
|
||||||
}
|
}
|
||||||
|
|
||||||
if detail != "basic":
|
if detail != "basic":
|
||||||
@@ -456,10 +535,9 @@ def get_videos_user(session, user, csrftoken, detail):
|
|||||||
# before the video, which is weird
|
# before the video, which is weird
|
||||||
yield comment
|
yield comment
|
||||||
|
|
||||||
def decode_cfemail(cfemail):
|
|
||||||
|
|
||||||
"""https://stackoverflow.com/questions/36911296/scraping-of-protected-email
|
def decode_cfemail(cfemail):
|
||||||
"""
|
"""https://stackoverflow.com/questions/36911296/scraping-of-protected-email"""
|
||||||
|
|
||||||
email = ""
|
email = ""
|
||||||
k = int(cfemail[:2], 16)
|
k = int(cfemail[:2], 16)
|
||||||
|
|||||||
@@ -9,8 +9,10 @@ from gogettr import PublicClient
|
|||||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||||
from cisticola.scraper.base import Scraper
|
from cisticola.scraper.base import Scraper
|
||||||
|
|
||||||
|
|
||||||
class GettrScraper(Scraper):
|
class GettrScraper(Scraper):
|
||||||
"""An implementation of a Scraper for Gettr, using gogettr library"""
|
"""An implementation of a Scraper for Gettr, using gogettr library"""
|
||||||
|
|
||||||
__version__ = "GettrScraper 0.0.1"
|
__version__ = "GettrScraper 0.0.1"
|
||||||
|
|
||||||
def get_username_from_url(self, url):
|
def get_username_from_url(self, url):
|
||||||
@@ -21,48 +23,57 @@ class GettrScraper(Scraper):
|
|||||||
return username
|
return username
|
||||||
|
|
||||||
@logger.catch
|
@logger.catch
|
||||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
def get_posts(
|
||||||
|
self, channel: Channel, since: ScraperResult = None
|
||||||
|
) -> Generator[ScraperResult, None, None]:
|
||||||
client = PublicClient()
|
client = PublicClient()
|
||||||
username = self.get_username_from_url(channel.url).lower()
|
username = self.get_username_from_url(channel.url).lower()
|
||||||
scraper = client.user_activity(username=username, type="posts")
|
scraper = client.user_activity(username=username, type="posts")
|
||||||
|
|
||||||
for post in scraper:
|
for post in scraper:
|
||||||
if since is not None and datetime.fromtimestamp(post['cdate']*0.001) <= since.date:
|
if (
|
||||||
|
since is not None
|
||||||
|
and datetime.fromtimestamp(post["cdate"] * 0.001) <= since.date
|
||||||
|
):
|
||||||
break
|
break
|
||||||
|
|
||||||
archived_urls = {}
|
archived_urls = {}
|
||||||
|
|
||||||
if 'imgs' in post:
|
if "imgs" in post:
|
||||||
for img in post['imgs']:
|
for img in post["imgs"]:
|
||||||
url = "https://media.gettr.com/" + img
|
url = "https://media.gettr.com/" + img
|
||||||
archived_urls[url] = None
|
archived_urls[url] = None
|
||||||
|
|
||||||
if 'main' in post:
|
if "main" in post:
|
||||||
url = "https://media.gettr.com/" + post['main']
|
url = "https://media.gettr.com/" + post["main"]
|
||||||
archived_urls[url] = None
|
archived_urls[url] = None
|
||||||
|
|
||||||
if 'ovid' in post:
|
if "ovid" in post:
|
||||||
url = "https://media.gettr.com/" + post['ovid']
|
url = "https://media.gettr.com/" + post["ovid"]
|
||||||
archived_urls[url] = None
|
archived_urls[url] = None
|
||||||
|
|
||||||
yield ScraperResult(
|
yield ScraperResult(
|
||||||
scraper=self.__version__,
|
scraper=self.__version__,
|
||||||
platform="Gettr",
|
platform="Gettr",
|
||||||
channel=channel.id,
|
channel=channel.id,
|
||||||
platform_id=post['_id'],
|
platform_id=post["_id"],
|
||||||
date=datetime.fromtimestamp(post['cdate']/1000.),
|
date=datetime.fromtimestamp(post["cdate"] / 1000.0),
|
||||||
date_archived=datetime.now(timezone.utc),
|
date_archived=datetime.now(timezone.utc),
|
||||||
raw_data=json.dumps(post),
|
raw_data=json.dumps(post),
|
||||||
archived_urls=archived_urls,
|
archived_urls=archived_urls,
|
||||||
media_archived=None)
|
media_archived=None,
|
||||||
|
)
|
||||||
|
|
||||||
def can_handle(self, channel):
|
def can_handle(self, channel):
|
||||||
if channel.platform == "Gettr" and self.get_username_from_url(channel.url) is not None:
|
if (
|
||||||
|
channel.platform == "Gettr"
|
||||||
|
and self.get_username_from_url(channel.url) is not None
|
||||||
|
):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def url_to_key(self, url: str, content_type: str) -> str:
|
def url_to_key(self, url: str, content_type: str) -> str:
|
||||||
ext = '.' + content_type.split('/')[-1]
|
ext = "." + content_type.split("/")[-1]
|
||||||
key = urlparse(url).path.split('/')[-2] + ext
|
key = urlparse(url).path.split("/")[-2] + ext
|
||||||
return key
|
return key
|
||||||
|
|
||||||
@logger.catch
|
@logger.catch
|
||||||
@@ -71,8 +82,10 @@ class GettrScraper(Scraper):
|
|||||||
username = self.get_username_from_url(channel.url)
|
username = self.get_username_from_url(channel.url)
|
||||||
profile = client.user_info(username)
|
profile = client.user_info(username)
|
||||||
|
|
||||||
return RawChannelInfo(scraper=self.__version__,
|
return RawChannelInfo(
|
||||||
|
scraper=self.__version__,
|
||||||
platform=channel.platform,
|
platform=channel.platform,
|
||||||
channel=channel.id,
|
channel=channel.id,
|
||||||
raw_data=json.dumps(profile),
|
raw_data=json.dumps(profile),
|
||||||
date_archived=datetime.now(timezone.utc))
|
date_archived=datetime.now(timezone.utc),
|
||||||
|
)
|
||||||
|
|||||||
@@ -10,25 +10,32 @@ import os
|
|||||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||||
from cisticola.scraper import Scraper, make_request
|
from cisticola.scraper import Scraper, make_request
|
||||||
|
|
||||||
BASE_URL = 'https://rumble.com'
|
BASE_URL = "https://rumble.com"
|
||||||
|
|
||||||
|
|
||||||
class RumbleScraper(Scraper):
|
class RumbleScraper(Scraper):
|
||||||
"""An implementation of a Scraper for Rumble, using custom functions"""
|
"""An implementation of a Scraper for Rumble, using custom functions"""
|
||||||
|
|
||||||
__version__ = "RumbleScraper 0.0.2"
|
__version__ = "RumbleScraper 0.0.2"
|
||||||
|
|
||||||
cookiestring = os.environ["YOUTUBE_COOKIESTRING"].replace(r'\n', '\n').replace(r'\t', '\t')
|
cookiestring = (
|
||||||
cookiefilename = 'cookiefile.txt'
|
os.environ["YOUTUBE_COOKIESTRING"].replace(r"\n", "\n").replace(r"\t", "\t")
|
||||||
|
)
|
||||||
|
cookiefilename = "cookiefile.txt"
|
||||||
|
|
||||||
@logger.catch
|
@logger.catch
|
||||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
def get_posts(
|
||||||
|
self, channel: Channel, since: ScraperResult = None
|
||||||
|
) -> Generator[ScraperResult, None, None]:
|
||||||
scraper = get_channel_videos(channel.url)
|
scraper = get_channel_videos(channel.url)
|
||||||
|
|
||||||
for post in scraper:
|
for post in scraper:
|
||||||
if since is not None and post['datetime'].replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
|
if since is not None and post["datetime"].replace(
|
||||||
|
tzinfo=timezone.utc
|
||||||
|
) <= since.date.replace(tzinfo=timezone.utc):
|
||||||
break
|
break
|
||||||
|
|
||||||
url = post['media_url']
|
url = post["media_url"]
|
||||||
|
|
||||||
archived_urls = {url: None}
|
archived_urls = {url: None}
|
||||||
|
|
||||||
@@ -36,16 +43,17 @@ class RumbleScraper(Scraper):
|
|||||||
scraper=self.__version__,
|
scraper=self.__version__,
|
||||||
platform="Rumble",
|
platform="Rumble",
|
||||||
channel=channel.id,
|
channel=channel.id,
|
||||||
platform_id=post['media_url'].split('/')[-2],
|
platform_id=post["media_url"].split("/")[-2],
|
||||||
date=post['datetime'].replace(tzinfo=timezone.utc),
|
date=post["datetime"].replace(tzinfo=timezone.utc),
|
||||||
date_archived=datetime.now(timezone.utc),
|
date_archived=datetime.now(timezone.utc),
|
||||||
raw_data=json.dumps(post, default=str),
|
raw_data=json.dumps(post, default=str),
|
||||||
archived_urls=archived_urls,
|
archived_urls=archived_urls,
|
||||||
media_archived=None)
|
media_archived=None,
|
||||||
|
)
|
||||||
|
|
||||||
def url_to_key(self, url: str, content_type: str) -> str:
|
def url_to_key(self, url: str, content_type: str) -> str:
|
||||||
ext = '.' + content_type.split('/')[-1]
|
ext = "." + content_type.split("/")[-1]
|
||||||
key = urlparse(url).path.split('/')[-2] + ext
|
key = urlparse(url).path.split("/")[-2] + ext
|
||||||
return key
|
return key
|
||||||
|
|
||||||
@logger.catch
|
@logger.catch
|
||||||
@@ -65,74 +73,77 @@ class RumbleScraper(Scraper):
|
|||||||
|
|
||||||
@logger.catch
|
@logger.catch
|
||||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||||
|
|
||||||
profile = get_channel_profile(url=channel.url)
|
profile = get_channel_profile(url=channel.url)
|
||||||
|
|
||||||
return RawChannelInfo(scraper=self.__version__,
|
return RawChannelInfo(
|
||||||
|
scraper=self.__version__,
|
||||||
platform=channel.platform,
|
platform=channel.platform,
|
||||||
channel=channel.id,
|
channel=channel.id,
|
||||||
raw_data=json.dumps(profile),
|
raw_data=json.dumps(profile),
|
||||||
date_archived=datetime.now(timezone.utc))
|
date_archived=datetime.now(timezone.utc),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_media_url(url):
|
def get_media_url(url):
|
||||||
|
|
||||||
r = make_request(url=url)
|
r = make_request(url=url)
|
||||||
soup = BeautifulSoup(r.content, features = 'html.parser')
|
soup = BeautifulSoup(r.content, features="html.parser")
|
||||||
|
|
||||||
script = json.loads(''.join(soup.find('script', {'type':'application/ld+json'}).text))
|
script = json.loads(
|
||||||
media_url = script[0]['embedUrl']
|
"".join(soup.find("script", {"type": "application/ld+json"}).text)
|
||||||
|
)
|
||||||
|
media_url = script[0]["embedUrl"]
|
||||||
|
|
||||||
return media_url
|
return media_url
|
||||||
|
|
||||||
|
|
||||||
def process_video(video):
|
def process_video(video):
|
||||||
|
rumble_soup = video.find("span", {"class": "video-item--rumbles"})
|
||||||
rumble_soup = video.find('span', {'class' : 'video-item--rumbles'})
|
|
||||||
if rumble_soup is None:
|
if rumble_soup is None:
|
||||||
rumbles = '0'
|
rumbles = "0"
|
||||||
else:
|
else:
|
||||||
rumbles = rumble_soup['data-value']
|
rumbles = rumble_soup["data-value"]
|
||||||
|
|
||||||
view_span = video.find('span', {'class' : 'video-item--views'})
|
view_span = video.find("span", {"class": "video-item--views"})
|
||||||
if view_span is None:
|
if view_span is None:
|
||||||
views = None
|
views = None
|
||||||
else:
|
else:
|
||||||
views = view_span.get('data-value')
|
views = view_span.get("data-value")
|
||||||
|
|
||||||
author_a = video.find('a', {'rel': 'author'})
|
author_a = video.find("a", {"rel": "author"})
|
||||||
if author_a is None:
|
if author_a is None:
|
||||||
author_id = None
|
author_id = None
|
||||||
author_name = None
|
author_name = None
|
||||||
else:
|
else:
|
||||||
author_id = author_a['href'].split('/')[-1]
|
author_id = author_a["href"].split("/")[-1]
|
||||||
author_name = author_a.text
|
author_name = author_a.text
|
||||||
|
|
||||||
video_link = BASE_URL + video.find('a', href = True)['href']
|
video_link = BASE_URL + video.find("a", href=True)["href"]
|
||||||
r = make_request(url=video_link)
|
r = make_request(url=video_link)
|
||||||
soup = BeautifulSoup(r.content, features = 'html.parser')
|
soup = BeautifulSoup(r.content, features="html.parser")
|
||||||
|
|
||||||
content_div = soup.find('div', {'class': 'container content media-description'})
|
content_div = soup.find("div", {"class": "container content media-description"})
|
||||||
|
|
||||||
info = {
|
info = {
|
||||||
'title' : video.find('h3').text,
|
"title": video.find("h3").text,
|
||||||
'thumbnail' : video.find('img')['src'],
|
"thumbnail": video.find("img")["src"],
|
||||||
'link' : video_link,
|
"link": video_link,
|
||||||
'views' : views,
|
"views": views,
|
||||||
'rumbles' : rumbles,
|
"rumbles": rumbles,
|
||||||
'content': '' if content_div is None else content_div.get_text('\n'),
|
"content": "" if content_div is None else content_div.get_text("\n"),
|
||||||
'duration' : video.find('span', {'class' : 'video-item--duration'})['data-value'],
|
"duration": video.find("span", {"class": "video-item--duration"})["data-value"],
|
||||||
'datetime' : datetime.fromisoformat(video.find('time')['datetime']),
|
"datetime": datetime.fromisoformat(video.find("time")["datetime"]),
|
||||||
'author_id': author_id,
|
"author_id": author_id,
|
||||||
'author_name': author_name}
|
"author_name": author_name,
|
||||||
|
}
|
||||||
|
|
||||||
info['media_url'] = get_media_url(info['link'])
|
info["media_url"] = get_media_url(info["link"])
|
||||||
|
|
||||||
return info
|
return info
|
||||||
|
|
||||||
|
|
||||||
def get_channel_videos(url):
|
def get_channel_videos(url):
|
||||||
|
|
||||||
page = 1
|
page = 1
|
||||||
channel_url = f'{url}?page='
|
channel_url = f"{url}?page="
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
url = channel_url + str(page)
|
url = channel_url + str(page)
|
||||||
@@ -141,37 +152,38 @@ def get_channel_videos(url):
|
|||||||
if r.status_code == 404:
|
if r.status_code == 404:
|
||||||
break
|
break
|
||||||
|
|
||||||
soup = BeautifulSoup(r.content, features = 'html.parser')
|
soup = BeautifulSoup(r.content, features="html.parser")
|
||||||
|
|
||||||
video_list = soup.find_all('li', {'class' : 'video-listing-entry'})
|
video_list = soup.find_all("li", {"class": "video-listing-entry"})
|
||||||
|
|
||||||
for video in video_list:
|
for video in video_list:
|
||||||
yield process_video(video)
|
yield process_video(video)
|
||||||
|
|
||||||
page += 1
|
page += 1
|
||||||
|
|
||||||
|
|
||||||
def get_channel_profile(url):
|
def get_channel_profile(url):
|
||||||
|
channel_url = f"{url}"
|
||||||
channel_url = f'{url}'
|
|
||||||
r = make_request(url=channel_url)
|
r = make_request(url=channel_url)
|
||||||
soup = BeautifulSoup(r.content, features = 'lxml')
|
soup = BeautifulSoup(r.content, features="lxml")
|
||||||
|
|
||||||
verified_svg = soup.find('h1').find('svg', {'class' : 'listing-header--verified'})
|
verified_svg = soup.find("h1").find("svg", {"class": "listing-header--verified"})
|
||||||
thumbnail_soup = soup.find('img', {'class' : 'listing-header--thumb'})
|
thumbnail_soup = soup.find("img", {"class": "listing-header--thumb"})
|
||||||
cover_soup = soup.find('img', {'class' : 'listing-header--backsplash-img'})
|
cover_soup = soup.find("img", {"class": "listing-header--backsplash-img"})
|
||||||
|
|
||||||
author_a = soup.find('a', {'rel': 'author'})
|
author_a = soup.find("a", {"rel": "author"})
|
||||||
if author_a is None:
|
if author_a is None:
|
||||||
author_id = None
|
author_id = None
|
||||||
else:
|
else:
|
||||||
author_id = author_a['href'].split('/')[-1]
|
author_id = author_a["href"].split("/")[-1]
|
||||||
|
|
||||||
profile = {
|
profile = {
|
||||||
'name': soup.find('h1').text,
|
"name": soup.find("h1").text,
|
||||||
'id': author_id,
|
"id": author_id,
|
||||||
'verified': verified_svg is not None,
|
"verified": verified_svg is not None,
|
||||||
'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None,
|
"thumbnail": thumbnail_soup.get("src") if thumbnail_soup else None,
|
||||||
'cover': cover_soup.get('src') if cover_soup else None,
|
"cover": cover_soup.get("src") if cover_soup else None,
|
||||||
'subscribers': soup.find('span', {'class' : 'subscribe-button-count'}).text}
|
"subscribers": soup.find("span", {"class": "subscribe-button-count"}).text,
|
||||||
|
}
|
||||||
|
|
||||||
return profile
|
return profile
|
||||||
@@ -14,19 +14,21 @@ from telethon.tl import types
|
|||||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||||
from cisticola.scraper.base import Scraper
|
from cisticola.scraper.base import Scraper
|
||||||
|
|
||||||
MEDIA_TYPES = ['photo', 'video', 'document', 'webpage']
|
MEDIA_TYPES = ["photo", "video", "document", "webpage"]
|
||||||
|
|
||||||
|
|
||||||
class TelegramTelethonScraper(Scraper):
|
class TelegramTelethonScraper(Scraper):
|
||||||
"""An implementation of a Scraper for Telegram, using Telethon library"""
|
"""An implementation of a Scraper for Telegram, using Telethon library"""
|
||||||
|
|
||||||
__version__ = "TelegramTelethonScraper 0.0.4"
|
__version__ = "TelegramTelethonScraper 0.0.4"
|
||||||
client = None
|
client = None
|
||||||
|
|
||||||
def __init__(self, telethon_session_name=None):
|
def __init__(self, telethon_session_name=None):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
api_id = os.environ['TELEGRAM_API_ID']
|
api_id = os.environ["TELEGRAM_API_ID"]
|
||||||
api_hash = os.environ['TELEGRAM_API_HASH']
|
api_hash = os.environ["TELEGRAM_API_HASH"]
|
||||||
phone = os.environ['TELEGRAM_PHONE']
|
phone = os.environ["TELEGRAM_PHONE"]
|
||||||
|
|
||||||
if telethon_session_name is None:
|
if telethon_session_name is None:
|
||||||
telethon_session_name = phone
|
telethon_session_name = phone
|
||||||
@@ -40,9 +42,9 @@ class TelegramTelethonScraper(Scraper):
|
|||||||
self.client.disconnect()
|
self.client.disconnect()
|
||||||
|
|
||||||
def get_username_from_url(url):
|
def get_username_from_url(url):
|
||||||
username = url.split('https://t.me/')[1]
|
username = url.split("https://t.me/")[1]
|
||||||
if username.startswith('s/'):
|
if username.startswith("s/"):
|
||||||
username = username.split('s/')[1]
|
username = username.split("s/")[1]
|
||||||
return username
|
return username
|
||||||
|
|
||||||
def get_channel_identifier(channel: Channel):
|
def get_channel_identifier(channel: Channel):
|
||||||
@@ -63,14 +65,18 @@ class TelegramTelethonScraper(Scraper):
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
if len(list(result.archived_urls.keys())) != 1:
|
if len(list(result.archived_urls.keys())) != 1:
|
||||||
logger.warning(f"Expected 1 key in archived_urls, found {result.archived_keys}")
|
logger.warning(
|
||||||
|
f"Expected 1 key in archived_urls, found {result.archived_keys}"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
key = list(result.archived_urls.keys())[0]
|
key = list(result.archived_urls.keys())[0]
|
||||||
|
|
||||||
if result.archived_urls[key] is None:
|
if result.archived_urls[key] is None:
|
||||||
raw = json.loads(result.raw_data)
|
raw = json.loads(result.raw_data)
|
||||||
|
|
||||||
message = self.client.get_messages(raw['peer_id']['channel_id'], ids=[raw['id']])
|
message = self.client.get_messages(
|
||||||
|
raw["peer_id"]["channel_id"], ids=[raw["id"]]
|
||||||
|
)
|
||||||
|
|
||||||
blob = None
|
blob = None
|
||||||
output_file_with_ext = None
|
output_file_with_ext = None
|
||||||
@@ -81,12 +87,16 @@ class TelegramTelethonScraper(Scraper):
|
|||||||
|
|
||||||
if blob is not None:
|
if blob is not None:
|
||||||
# TODO specify Content-Type
|
# TODO specify Content-Type
|
||||||
archived_url = self.archive_blob(blob = blob, content_type = '', key = output_file_with_ext)
|
archived_url = self.archive_blob(
|
||||||
|
blob=blob, content_type="", key=output_file_with_ext
|
||||||
|
)
|
||||||
result.archived_urls[key] = archived_url
|
result.archived_urls[key] = archived_url
|
||||||
result.media_archived = datetime.now(timezone.utc)
|
result.media_archived = datetime.now(timezone.utc)
|
||||||
else:
|
else:
|
||||||
if output_file_with_ext == 'largefile':
|
if output_file_with_ext == "largefile":
|
||||||
logger.info("Because this was a large file, not clearing media data")
|
logger.info(
|
||||||
|
"Because this was a large file, not clearing media data"
|
||||||
|
)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
logger.warning("Downloaded blob was None")
|
logger.warning("Downloaded blob was None")
|
||||||
@@ -102,14 +112,18 @@ class TelegramTelethonScraper(Scraper):
|
|||||||
|
|
||||||
if type(post.media) == types.MessageMediaDocument:
|
if type(post.media) == types.MessageMediaDocument:
|
||||||
if post.media.document.size / (1024 * 1024) > 50:
|
if post.media.document.size / (1024 * 1024) > 50:
|
||||||
logger.info(f"Skipping archive of large {type(post.media)} with size {post.media.document.size/(1024*1024)} MB")
|
logger.info(
|
||||||
|
f"Skipping archive of large {type(post.media)} with size {post.media.document.size/(1024*1024)} MB"
|
||||||
|
)
|
||||||
return (None, "largefile")
|
return (None, "largefile")
|
||||||
|
|
||||||
logger.debug(f"Archiving {type(post.media)} with size {post.media.document.size/(1024*1024)} MB")
|
logger.debug(
|
||||||
|
f"Archiving {type(post.media)} with size {post.media.document.size/(1024*1024)} MB"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
logger.debug(f"Archiving {type(post.media)}")
|
logger.debug(f"Archiving {type(post.media)}")
|
||||||
|
|
||||||
key = f'{post.peer_id.channel_id}_{post.id}'
|
key = f"{post.peer_id.channel_id}_{post.id}"
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as temp_dir:
|
with tempfile.TemporaryDirectory() as temp_dir:
|
||||||
output_file = Path(temp_dir, key)
|
output_file = Path(temp_dir, key)
|
||||||
@@ -123,7 +137,7 @@ class TelegramTelethonScraper(Scraper):
|
|||||||
output_file_with_ext = os.listdir(temp_dir)[0]
|
output_file_with_ext = os.listdir(temp_dir)[0]
|
||||||
filename = Path(temp_dir, output_file_with_ext)
|
filename = Path(temp_dir, output_file_with_ext)
|
||||||
|
|
||||||
with open(filename, 'rb') as f:
|
with open(filename, "rb") as f:
|
||||||
blob = f.read()
|
blob = f.read()
|
||||||
return (blob, output_file_with_ext)
|
return (blob, output_file_with_ext)
|
||||||
|
|
||||||
@@ -132,22 +146,35 @@ class TelegramTelethonScraper(Scraper):
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
# @logger.catch
|
# @logger.catch
|
||||||
def get_posts(self, channel: Channel, since: ScraperResult = None, until: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
def get_posts(
|
||||||
|
self, channel: Channel, since: ScraperResult = None, until: ScraperResult = None
|
||||||
|
) -> Generator[ScraperResult, None, None]:
|
||||||
username = TelegramTelethonScraper.get_channel_identifier(channel)
|
username = TelegramTelethonScraper.get_channel_identifier(channel)
|
||||||
if until is not None:
|
if until is not None:
|
||||||
logger.info(f"Only getting old posts, up to ID {until.platform_id.split('/')[-1]}")
|
logger.info(
|
||||||
iterator = self.client.iter_messages(username, max_id=int(until.platform_id.split('/')[-1]), wait_time=0, limit=None)
|
f"Only getting old posts, up to ID {until.platform_id.split('/')[-1]}"
|
||||||
|
)
|
||||||
|
iterator = self.client.iter_messages(
|
||||||
|
username,
|
||||||
|
max_id=int(until.platform_id.split("/")[-1]),
|
||||||
|
wait_time=0,
|
||||||
|
limit=None,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
iterator = self.client.iter_messages(username)
|
iterator = self.client.iter_messages(username)
|
||||||
|
|
||||||
post = None
|
post = None
|
||||||
for post in iterator:
|
for post in iterator:
|
||||||
post_url = f'{channel.url}/{post.id}'
|
post_url = f"{channel.url}/{post.id}"
|
||||||
|
|
||||||
logger.trace(f"Archiving post {post_url} from {post.date}")
|
logger.trace(f"Archiving post {post_url} from {post.date}")
|
||||||
|
|
||||||
if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
|
if since is not None and post.date.replace(
|
||||||
logger.info(f'Timestamp of post {post} is earlier than the previous archived timestamp {post.date.replace(tzinfo=timezone.utc)}')
|
tzinfo=timezone.utc
|
||||||
|
) <= since.date.replace(tzinfo=timezone.utc):
|
||||||
|
logger.info(
|
||||||
|
f"Timestamp of post {post} is earlier than the previous archived timestamp {post.date.replace(tzinfo=timezone.utc)}"
|
||||||
|
)
|
||||||
break
|
break
|
||||||
|
|
||||||
archived_urls = {}
|
archived_urls = {}
|
||||||
@@ -166,10 +193,18 @@ class TelegramTelethonScraper(Scraper):
|
|||||||
date_archived=datetime.now(timezone.utc),
|
date_archived=datetime.now(timezone.utc),
|
||||||
raw_data=json.dumps(post.to_dict(), default=str),
|
raw_data=json.dumps(post.to_dict(), default=str),
|
||||||
archived_urls=archived_urls,
|
archived_urls=archived_urls,
|
||||||
media_archived=media_archived)
|
media_archived=media_archived,
|
||||||
|
)
|
||||||
|
|
||||||
if (post is not None and post.id > 1 and since is None) or (post is not None and since is not None and post.date.replace(tzinfo=timezone.utc) > since.date.replace(tzinfo=timezone.utc)):
|
if (post is not None and post.id > 1 and since is None) or (
|
||||||
logger.info(f"Last post ID is {post.id} / {post.date}, since is {since.date if since is not None else None}, until is {until.platform_id if until is not None else None}, starting again")
|
post is not None
|
||||||
|
and since is not None
|
||||||
|
and post.date.replace(tzinfo=timezone.utc)
|
||||||
|
> since.date.replace(tzinfo=timezone.utc)
|
||||||
|
):
|
||||||
|
logger.info(
|
||||||
|
f"Last post ID is {post.id} / {post.date}, since is {since.date if since is not None else None}, until is {until.platform_id if until is not None else None}, starting again"
|
||||||
|
)
|
||||||
new_until = ScraperResult(
|
new_until = ScraperResult(
|
||||||
scraper=self.__version__,
|
scraper=self.__version__,
|
||||||
platform="Telegram",
|
platform="Telegram",
|
||||||
@@ -179,19 +214,21 @@ class TelegramTelethonScraper(Scraper):
|
|||||||
date_archived=datetime.now(timezone.utc),
|
date_archived=datetime.now(timezone.utc),
|
||||||
raw_data=json.dumps(post.to_dict(), default=str),
|
raw_data=json.dumps(post.to_dict(), default=str),
|
||||||
archived_urls=archived_urls,
|
archived_urls=archived_urls,
|
||||||
media_archived=media_archived)
|
media_archived=media_archived,
|
||||||
|
)
|
||||||
for p in self.get_posts(channel, since=since, until=new_until):
|
for p in self.get_posts(channel, since=since, until=new_until):
|
||||||
yield p
|
yield p
|
||||||
|
|
||||||
|
|
||||||
@logger.catch
|
@logger.catch
|
||||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||||
username = TelegramTelethonScraper.get_channel_identifier(channel)
|
username = TelegramTelethonScraper.get_channel_identifier(channel)
|
||||||
full_channel = self.client(GetFullChannelRequest(channel=username))
|
full_channel = self.client(GetFullChannelRequest(channel=username))
|
||||||
profile = full_channel.to_dict()
|
profile = full_channel.to_dict()
|
||||||
|
|
||||||
return RawChannelInfo(scraper=self.__version__,
|
return RawChannelInfo(
|
||||||
|
scraper=self.__version__,
|
||||||
platform=channel.platform,
|
platform=channel.platform,
|
||||||
channel=channel.id,
|
channel=channel.id,
|
||||||
raw_data=json.dumps(profile, default=str),
|
raw_data=json.dumps(profile, default=str),
|
||||||
date_archived=datetime.now(timezone.utc))
|
date_archived=datetime.now(timezone.utc),
|
||||||
|
)
|
||||||
|
|||||||
@@ -7,7 +7,18 @@ from sqlalchemy.sql.expression import func
|
|||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Media, Channel, mapper_registry, Image, Video, Audio
|
from cisticola.base import (
|
||||||
|
RawChannelInfo,
|
||||||
|
ChannelInfo,
|
||||||
|
ScraperResult,
|
||||||
|
Post,
|
||||||
|
Media,
|
||||||
|
Channel,
|
||||||
|
mapper_registry,
|
||||||
|
Image,
|
||||||
|
Video,
|
||||||
|
Audio,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class Transformer:
|
class Transformer:
|
||||||
@@ -35,7 +46,9 @@ class Transformer:
|
|||||||
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def transform(data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]:
|
def transform(
|
||||||
|
data: ScraperResult, insert: Callable
|
||||||
|
) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||||
"""Transform a ScraperResult into objects with additional parameters for analysis. This function can
|
"""Transform a ScraperResult into objects with additional parameters for analysis. This function can
|
||||||
yield multiple objects, as it will find references to quoted/replied posts, media objects, and Channel
|
yield multiple objects, as it will find references to quoted/replied posts, media objects, and Channel
|
||||||
objects and provide all of these to be inserted into the database.
|
objects and provide all of these to be inserted into the database.
|
||||||
@@ -67,16 +80,27 @@ class Transformer:
|
|||||||
for k in data.archived_urls:
|
for k in data.archived_urls:
|
||||||
if data.archived_urls[k]:
|
if data.archived_urls[k]:
|
||||||
archived_url = data.archived_urls[k]
|
archived_url = data.archived_urls[k]
|
||||||
filename = archived_url.split('/')[-1]
|
filename = archived_url.split("/")[-1]
|
||||||
ext = None if '.' not in filename else filename.split('.')[-1].lower()
|
ext = None if "." not in filename else filename.split(".")[-1].lower()
|
||||||
|
|
||||||
media_kwargs = dict(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k, date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), transformer=self.__version__, scraper=data.scraper, platform=data.platform)
|
media_kwargs = dict(
|
||||||
|
url=archived_url,
|
||||||
|
post=transformed.id,
|
||||||
|
raw_id=data.id,
|
||||||
|
original_url=k,
|
||||||
|
date=data.date,
|
||||||
|
date_archived=data.date_archived,
|
||||||
|
date_transformed=datetime.now(timezone.utc),
|
||||||
|
transformer=self.__version__,
|
||||||
|
scraper=data.scraper,
|
||||||
|
platform=data.platform,
|
||||||
|
)
|
||||||
|
|
||||||
if ext in ('mp4', 'mov', 'avi', 'mkv'):
|
if ext in ("mp4", "mov", "avi", "mkv"):
|
||||||
media_class = Video
|
media_class = Video
|
||||||
elif ext in ('oga', 'mp3', "wav", 'aif', 'aiff', 'aac'):
|
elif ext in ("oga", "mp3", "wav", "aif", "aiff", "aac"):
|
||||||
media_class = Audio
|
media_class = Audio
|
||||||
elif ext in ('jpg', 'jpeg', 'png', 'gif', 'bmp', 'heic', 'tiff'):
|
elif ext in ("jpg", "jpeg", "png", "gif", "bmp", "heic", "tiff"):
|
||||||
media_class = Image
|
media_class = Image
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Unknown file extension {ext}")
|
logger.warning(f"Unknown file extension {ext}")
|
||||||
@@ -202,11 +226,31 @@ class ETLController:
|
|||||||
|
|
||||||
# This is using some adhoc unique constraints that might be worth formalizing at some point
|
# This is using some adhoc unique constraints that might be worth formalizing at some point
|
||||||
if type(obj) == Channel:
|
if type(obj) == Channel:
|
||||||
instance = session.query(Channel).filter(
|
instance = (
|
||||||
(((Channel.url==obj.url)&(Channel.url!='')&(Channel.url is not None)&(Channel.url!='https://t.me/s/'))|
|
session.query(Channel)
|
||||||
((Channel.platform_id==str(obj.platform_id))&(Channel.platform_id!='')&(Channel.platform_id is not None))|
|
.filter(
|
||||||
((Channel.screenname==obj.screenname)&(Channel.screenname!='')&(Channel.screenname is not None)))&
|
(
|
||||||
(Channel.platform==obj.platform)).first()
|
(
|
||||||
|
(Channel.url == obj.url)
|
||||||
|
& (Channel.url != "")
|
||||||
|
& (Channel.url is not None)
|
||||||
|
& (Channel.url != "https://t.me/s/")
|
||||||
|
)
|
||||||
|
| (
|
||||||
|
(Channel.platform_id == str(obj.platform_id))
|
||||||
|
& (Channel.platform_id != "")
|
||||||
|
& (Channel.platform_id is not None)
|
||||||
|
)
|
||||||
|
| (
|
||||||
|
(Channel.screenname == obj.screenname)
|
||||||
|
& (Channel.screenname != "")
|
||||||
|
& (Channel.screenname is not None)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
& (Channel.platform == obj.platform)
|
||||||
|
)
|
||||||
|
.first()
|
||||||
|
)
|
||||||
|
|
||||||
elif type(obj) == Post:
|
elif type(obj) == Post:
|
||||||
return self.insert_post(obj, session, hydrate)
|
return self.insert_post(obj, session, hydrate)
|
||||||
@@ -240,7 +284,12 @@ class ETLController:
|
|||||||
logger.info(f"Found matching DB entry for {obj}: {instance}")
|
logger.info(f"Found matching DB entry for {obj}: {instance}")
|
||||||
|
|
||||||
if type(obj) == Channel:
|
if type(obj) == Channel:
|
||||||
if obj.source != instance.source and obj.source == 'linked_channel' and instance.source != 'researcher' and (instance.source is None or instance.source[:4] != 'snow'):
|
if (
|
||||||
|
obj.source != instance.source
|
||||||
|
and obj.source == "linked_channel"
|
||||||
|
and instance.source != "researcher"
|
||||||
|
and (instance.source is None or instance.source[:4] != "snow")
|
||||||
|
):
|
||||||
logger.info(f"Updating source to linked channel")
|
logger.info(f"Updating source to linked channel")
|
||||||
instance.source = obj.source
|
instance.source = obj.source
|
||||||
instance.notes = obj.notes
|
instance.notes = obj.notes
|
||||||
@@ -251,7 +300,7 @@ class ETLController:
|
|||||||
session.flush()
|
session.flush()
|
||||||
session.commit()
|
session.commit()
|
||||||
|
|
||||||
if (instance.platform_id is None or instance.platform_id == ''):
|
if instance.platform_id is None or instance.platform_id == "":
|
||||||
instance.platform_id = obj.platform_id
|
instance.platform_id = obj.platform_id
|
||||||
session.flush()
|
session.flush()
|
||||||
session.commit()
|
session.commit()
|
||||||
@@ -293,22 +342,35 @@ class ETLController:
|
|||||||
handled = False
|
handled = False
|
||||||
|
|
||||||
if transformer.can_handle(result):
|
if transformer.can_handle(result):
|
||||||
logger.trace(f"{transformer} is handling result {result.id} ({result.date})")
|
logger.trace(
|
||||||
|
f"{transformer} is handling result {result.id} ({result.date})"
|
||||||
|
)
|
||||||
handled = True
|
handled = True
|
||||||
|
|
||||||
transformer.transform(result, lambda obj: self.insert_or_select(obj, session, hydrate), session, lambda obj: self.insert_post(obj, session, hydrate, flush=False), lambda: self.flush_posts(session))
|
transformer.transform(
|
||||||
|
result,
|
||||||
|
lambda obj: self.insert_or_select(obj, session, hydrate),
|
||||||
|
session,
|
||||||
|
lambda obj: self.insert_post(
|
||||||
|
obj, session, hydrate, flush=False
|
||||||
|
),
|
||||||
|
lambda: self.flush_posts(session),
|
||||||
|
)
|
||||||
|
|
||||||
break
|
break
|
||||||
|
|
||||||
if handled == False:
|
if handled == False:
|
||||||
logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})")
|
logger.warning(
|
||||||
|
f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})"
|
||||||
|
)
|
||||||
|
|
||||||
self.flush_posts(session)
|
self.flush_posts(session)
|
||||||
session.commit()
|
session.commit()
|
||||||
|
|
||||||
|
|
||||||
@logger.catch(reraise=True)
|
@logger.catch(reraise=True)
|
||||||
def transform_all_untransformed(self, hydrate: bool = True, min_date=datetime(2010, 1, 1)):
|
def transform_all_untransformed(
|
||||||
|
self, hydrate: bool = True, min_date=datetime(2010, 1, 1)
|
||||||
|
):
|
||||||
"""Transform all ScraperResult objects in the database that do not have an
|
"""Transform all ScraperResult objects in the database that do not have an
|
||||||
equivalent Post object stored.
|
equivalent Post object stored.
|
||||||
|
|
||||||
@@ -331,7 +393,8 @@ class ETLController:
|
|||||||
|
|
||||||
logger.info(f"Fetching first untransformed post batch of {BATCH_SIZE}")
|
logger.info(f"Fetching first untransformed post batch of {BATCH_SIZE}")
|
||||||
|
|
||||||
batch = (session.query(ScraperResult)
|
batch = (
|
||||||
|
session.query(ScraperResult)
|
||||||
.join(Post, isouter=True)
|
.join(Post, isouter=True)
|
||||||
.where(ScraperResult.date > min_date)
|
.where(ScraperResult.date > min_date)
|
||||||
.where(Post.raw_id == None)
|
.where(Post.raw_id == None)
|
||||||
@@ -344,9 +407,12 @@ class ETLController:
|
|||||||
|
|
||||||
self.transform_results(batch, hydrate=hydrate)
|
self.transform_results(batch, hydrate=hydrate)
|
||||||
|
|
||||||
logger.info(f"Fetching untransformed posts batch of {BATCH_SIZE}, offset {max(batch, key=lambda v: v.date).date}")
|
logger.info(
|
||||||
|
f"Fetching untransformed posts batch of {BATCH_SIZE}, offset {max(batch, key=lambda v: v.date).date}"
|
||||||
|
)
|
||||||
|
|
||||||
batch = (session.query(ScraperResult)
|
batch = (
|
||||||
|
session.query(ScraperResult)
|
||||||
.join(Post, isouter=True)
|
.join(Post, isouter=True)
|
||||||
.where(ScraperResult.date > min_date)
|
.where(ScraperResult.date > min_date)
|
||||||
.where(Post.raw_id == None)
|
.where(Post.raw_id == None)
|
||||||
@@ -356,7 +422,6 @@ class ETLController:
|
|||||||
.limit(BATCH_SIZE)
|
.limit(BATCH_SIZE)
|
||||||
).all()
|
).all()
|
||||||
|
|
||||||
|
|
||||||
@logger.catch(reraise=True)
|
@logger.catch(reraise=True)
|
||||||
def transform_info(self, results: List[ChannelInfo]):
|
def transform_info(self, results: List[ChannelInfo]):
|
||||||
"""Transform raw RawChannelInfo objects into ChannelInfo objects.
|
"""Transform raw RawChannelInfo objects into ChannelInfo objects.
|
||||||
@@ -380,17 +445,25 @@ class ETLController:
|
|||||||
|
|
||||||
for transformer in self.transformers:
|
for transformer in self.transformers:
|
||||||
if transformer.can_handle(result):
|
if transformer.can_handle(result):
|
||||||
logger.trace(f"{transformer} is handling raw info result {result.id} ({result.date_archived})")
|
logger.trace(
|
||||||
|
f"{transformer} is handling raw info result {result.id} ({result.date_archived})"
|
||||||
|
)
|
||||||
handled = True
|
handled = True
|
||||||
|
|
||||||
transformer.transform_info(result, lambda obj: self.insert_or_select(obj, session, False), session, channel=data.Channel)
|
transformer.transform_info(
|
||||||
|
result,
|
||||||
|
lambda obj: self.insert_or_select(obj, session, False),
|
||||||
|
session,
|
||||||
|
channel=data.Channel,
|
||||||
|
)
|
||||||
|
|
||||||
session.commit()
|
session.commit()
|
||||||
break
|
break
|
||||||
|
|
||||||
if handled == False:
|
if handled == False:
|
||||||
logger.warning(f"No Transformer could handle raw channel info ID {result.id} with platform {result.platform} ({result.date_archived})")
|
logger.warning(
|
||||||
|
f"No Transformer could handle raw channel info ID {result.id} with platform {result.platform} ({result.date_archived})"
|
||||||
|
)
|
||||||
|
|
||||||
@logger.catch(reraise=True)
|
@logger.catch(reraise=True)
|
||||||
def transform_all_untransformed_info(self):
|
def transform_all_untransformed_info(self):
|
||||||
@@ -407,7 +480,8 @@ class ETLController:
|
|||||||
offset = 0
|
offset = 0
|
||||||
batch = []
|
batch = []
|
||||||
|
|
||||||
query = (session.query(RawChannelInfo, Channel)
|
query = (
|
||||||
|
session.query(RawChannelInfo, Channel)
|
||||||
.select_from(RawChannelInfo)
|
.select_from(RawChannelInfo)
|
||||||
.join(ChannelInfo, isouter=True)
|
.join(ChannelInfo, isouter=True)
|
||||||
.join(Channel, RawChannelInfo.channel == Channel.id)
|
.join(Channel, RawChannelInfo.channel == Channel.id)
|
||||||
@@ -416,12 +490,16 @@ class ETLController:
|
|||||||
)
|
)
|
||||||
|
|
||||||
while len(batch) > 0 or offset == 0:
|
while len(batch) > 0 or offset == 0:
|
||||||
logger.info(f"Fetching untransformed info batch of {BATCH_SIZE}, offset {offset}")
|
logger.info(
|
||||||
|
f"Fetching untransformed info batch of {BATCH_SIZE}, offset {offset}"
|
||||||
|
)
|
||||||
|
|
||||||
batch = query.slice(offset, offset + BATCH_SIZE).all()
|
batch = query.slice(offset, offset + BATCH_SIZE).all()
|
||||||
offset += BATCH_SIZE
|
offset += BATCH_SIZE
|
||||||
|
|
||||||
logger.info(f"Found {len(batch)} info items to ETL ({offset} already processed)")
|
logger.info(
|
||||||
|
f"Found {len(batch)} info items to ETL ({offset} already processed)"
|
||||||
|
)
|
||||||
|
|
||||||
self.transform_info(batch)
|
self.transform_info(batch)
|
||||||
|
|
||||||
@@ -450,16 +528,24 @@ class ETLController:
|
|||||||
handled = False
|
handled = False
|
||||||
|
|
||||||
if transformer.can_handle(result):
|
if transformer.can_handle(result):
|
||||||
logger.trace(f"{transformer} is handling result {result.id} ({result.date})")
|
logger.trace(
|
||||||
|
f"{transformer} is handling result {result.id} ({result.date})"
|
||||||
|
)
|
||||||
handled = True
|
handled = True
|
||||||
|
|
||||||
transformer.transform_media(result, total_result.Post, lambda obj: self.insert_or_select(obj, session, hydrate))
|
transformer.transform_media(
|
||||||
|
result,
|
||||||
|
total_result.Post,
|
||||||
|
lambda obj: self.insert_or_select(obj, session, hydrate),
|
||||||
|
)
|
||||||
|
|
||||||
session.commit()
|
session.commit()
|
||||||
break
|
break
|
||||||
|
|
||||||
if handled == False:
|
if handled == False:
|
||||||
logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})")
|
logger.warning(
|
||||||
|
f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})"
|
||||||
|
)
|
||||||
|
|
||||||
@logger.catch(reraise=True)
|
@logger.catch(reraise=True)
|
||||||
def transform_all_untransformed_media(self, hydrate=True):
|
def transform_all_untransformed_media(self, hydrate=True):
|
||||||
@@ -482,10 +568,15 @@ class ETLController:
|
|||||||
|
|
||||||
logger.info(f"Fetching first untransformed post media batch of {BATCH_SIZE}")
|
logger.info(f"Fetching first untransformed post media batch of {BATCH_SIZE}")
|
||||||
|
|
||||||
batch = (session.query(ScraperResult, Post)
|
batch = (
|
||||||
|
session.query(ScraperResult, Post)
|
||||||
.join(Post)
|
.join(Post)
|
||||||
.join(Media, isouter=True)
|
.join(Media, isouter=True)
|
||||||
.filter((ScraperResult.media_archived != None) & (cast(ScraperResult.archived_urls, String) != '{}') & (Media.id == None))
|
.filter(
|
||||||
|
(ScraperResult.media_archived != None)
|
||||||
|
& (cast(ScraperResult.archived_urls, String) != "{}")
|
||||||
|
& (Media.id == None)
|
||||||
|
)
|
||||||
.order_by(ScraperResult.date.desc())
|
.order_by(ScraperResult.date.desc())
|
||||||
.limit(BATCH_SIZE)
|
.limit(BATCH_SIZE)
|
||||||
).all()
|
).all()
|
||||||
@@ -495,13 +586,23 @@ class ETLController:
|
|||||||
|
|
||||||
self.transform_media(batch, hydrate=hydrate)
|
self.transform_media(batch, hydrate=hydrate)
|
||||||
|
|
||||||
logger.info(f"Fetching untransformed post media batch of {BATCH_SIZE}, offset {min(batch, key=lambda v: v.ScraperResult.date).ScraperResult.date}")
|
logger.info(
|
||||||
|
f"Fetching untransformed post media batch of {BATCH_SIZE}, offset {min(batch, key=lambda v: v.ScraperResult.date).ScraperResult.date}"
|
||||||
|
)
|
||||||
|
|
||||||
batch = (session.query(ScraperResult, Post)
|
batch = (
|
||||||
|
session.query(ScraperResult, Post)
|
||||||
.join(Post)
|
.join(Post)
|
||||||
.join(Media, isouter=True)
|
.join(Media, isouter=True)
|
||||||
.where(ScraperResult.date <= min(batch, key=lambda v: v.ScraperResult.date).ScraperResult.date)
|
.where(
|
||||||
.filter((ScraperResult.media_archived != None) & (cast(ScraperResult.archived_urls, String) != '{}') & (Media.id == None))
|
ScraperResult.date
|
||||||
|
<= min(batch, key=lambda v: v.ScraperResult.date).ScraperResult.date
|
||||||
|
)
|
||||||
|
.filter(
|
||||||
|
(ScraperResult.media_archived != None)
|
||||||
|
& (cast(ScraperResult.archived_urls, String) != "{}")
|
||||||
|
& (Media.id == None)
|
||||||
|
)
|
||||||
.order_by(ScraperResult.date.desc())
|
.order_by(ScraperResult.date.desc())
|
||||||
.limit(BATCH_SIZE)
|
.limit(BATCH_SIZE)
|
||||||
).all()
|
).all()
|
||||||
|
|||||||
@@ -7,7 +7,17 @@ from dateutil.relativedelta import relativedelta
|
|||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from cisticola.transformer.base import Transformer
|
from cisticola.transformer.base import Transformer
|
||||||
from cisticola.base import RawChannelInfo, ScraperResult, Post, Image, Video, Media, Channel, ChannelInfo
|
from cisticola.base import (
|
||||||
|
RawChannelInfo,
|
||||||
|
ScraperResult,
|
||||||
|
Post,
|
||||||
|
Image,
|
||||||
|
Video,
|
||||||
|
Media,
|
||||||
|
Channel,
|
||||||
|
ChannelInfo,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class BitchuteTransformer(Transformer):
|
class BitchuteTransformer(Transformer):
|
||||||
"""A Bitchute specific ScraperResult, with a method ETL/transforming"""
|
"""A Bitchute specific ScraperResult, with a method ETL/transforming"""
|
||||||
@@ -15,61 +25,86 @@ class BitchuteTransformer(Transformer):
|
|||||||
__version__ = "BitchuteTransformer 0.0.2"
|
__version__ = "BitchuteTransformer 0.0.2"
|
||||||
|
|
||||||
def can_handle(self, data: ScraperResult) -> bool:
|
def can_handle(self, data: ScraperResult) -> bool:
|
||||||
scraper = data.scraper.split(' ')
|
scraper = data.scraper.split(" ")
|
||||||
if scraper[0] == "BitchuteScraper":
|
if scraper[0] == "BitchuteScraper":
|
||||||
return True
|
return True
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def transform_media(self, data: ScraperResult, transformed: Post, insert: Callable) -> Generator[Media, None, None]:
|
def transform_media(
|
||||||
|
self, data: ScraperResult, transformed: Post, insert: Callable
|
||||||
|
) -> Generator[Media, None, None]:
|
||||||
raw = json.loads(data.raw_data)
|
raw = json.loads(data.raw_data)
|
||||||
|
|
||||||
orig = raw['video_url']
|
orig = raw["video_url"]
|
||||||
new = data.archived_urls[orig]
|
new = data.archived_urls[orig]
|
||||||
|
|
||||||
m = Video(url=new, post=transformed.id, raw_id=data.id, original_url=orig, date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), transformer=self.__version__, scraper=data.scraper, platform=data.platform)
|
m = Video(
|
||||||
|
url=new,
|
||||||
|
post=transformed.id,
|
||||||
|
raw_id=data.id,
|
||||||
|
original_url=orig,
|
||||||
|
date=data.date,
|
||||||
|
date_archived=data.date_archived,
|
||||||
|
date_transformed=datetime.now(timezone.utc),
|
||||||
|
transformer=self.__version__,
|
||||||
|
scraper=data.scraper,
|
||||||
|
platform=data.platform,
|
||||||
|
)
|
||||||
|
|
||||||
insert(m)
|
insert(m)
|
||||||
|
|
||||||
def transform_info(self, data: RawChannelInfo, insert: Callable, session, channel=None) -> Generator[Union[Post, Channel, Media], None, None]:
|
def transform_info(
|
||||||
|
self, data: RawChannelInfo, insert: Callable, session, channel=None
|
||||||
|
) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||||
raw = json.loads(data.raw_data)
|
raw = json.loads(data.raw_data)
|
||||||
|
|
||||||
transformed = ChannelInfo(
|
transformed = ChannelInfo(
|
||||||
raw_channel_info_id=data.id,
|
raw_channel_info_id=data.id,
|
||||||
channel=data.channel,
|
channel=data.channel,
|
||||||
platform_id=raw['owner_url'].strip('/').split('/')[-1],
|
platform_id=raw["owner_url"].strip("/").split("/")[-1],
|
||||||
platform=data.platform,
|
platform=data.platform,
|
||||||
scraper=data.scraper,
|
scraper=data.scraper,
|
||||||
transformer=self.__version__,
|
transformer=self.__version__,
|
||||||
screenname=raw['owner_name'],
|
screenname=raw["owner_name"],
|
||||||
name=raw['owner_name'],
|
name=raw["owner_name"],
|
||||||
description=raw['description'],
|
description=raw["description"],
|
||||||
description_url='', # does not exist for Bitchute
|
description_url="", # does not exist for Bitchute
|
||||||
description_location='', # does not exist for Bitchute
|
description_location="", # does not exist for Bitchute
|
||||||
followers=raw['subscribers'],
|
followers=raw["subscribers"],
|
||||||
following=-1, # does not exist for Bitchute
|
following=-1, # does not exist for Bitchute
|
||||||
verified=False, # does not exist for Bitchute
|
verified=False, # does not exist for Bitchute
|
||||||
date_created=parse_created(raw['created'], data.date_archived),
|
date_created=parse_created(raw["created"], data.date_archived),
|
||||||
date_archived=data.date_archived,
|
date_archived=data.date_archived,
|
||||||
date_transformed=datetime.now(timezone.utc)
|
date_transformed=datetime.now(timezone.utc),
|
||||||
)
|
)
|
||||||
|
|
||||||
transformed = insert(transformed)
|
transformed = insert(transformed)
|
||||||
|
|
||||||
def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]:
|
def transform(
|
||||||
|
self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts
|
||||||
|
) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||||
raw = json.loads(data.raw_data)
|
raw = json.loads(data.raw_data)
|
||||||
|
|
||||||
if raw['category'] == 'comment':
|
if raw["category"] == "comment":
|
||||||
if raw['parent_id'] is None:
|
if raw["parent_id"] is None:
|
||||||
reply_to_id = raw['thread_id']
|
reply_to_id = raw["thread_id"]
|
||||||
else:
|
else:
|
||||||
reply_to_id = raw['parent_id']
|
reply_to_id = raw["parent_id"]
|
||||||
flush_posts()
|
flush_posts()
|
||||||
post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first()
|
post = (
|
||||||
|
session.query(Post)
|
||||||
|
.filter_by(channel=data.channel, platform_id=reply_to_id)
|
||||||
|
.first()
|
||||||
|
)
|
||||||
if post is None:
|
if post is None:
|
||||||
if raw['parent_id'] is not None:
|
if raw["parent_id"] is not None:
|
||||||
# this block is for comments whose parent_ids correspond to deleted comments
|
# this block is for comments whose parent_ids correspond to deleted comments
|
||||||
post = session.query(Post).filter_by(channel=data.channel, platform_id=raw['thread_id']).first()
|
post = (
|
||||||
|
session.query(Post)
|
||||||
|
.filter_by(channel=data.channel, platform_id=raw["thread_id"])
|
||||||
|
.first()
|
||||||
|
)
|
||||||
if post is None:
|
if post is None:
|
||||||
reply_to = -1
|
reply_to = -1
|
||||||
else:
|
else:
|
||||||
@@ -78,18 +113,18 @@ class BitchuteTransformer(Transformer):
|
|||||||
reply_to = -1
|
reply_to = -1
|
||||||
else:
|
else:
|
||||||
reply_to = post.id
|
reply_to = post.id
|
||||||
content = raw['body'].strip()
|
content = raw["body"].strip()
|
||||||
else:
|
else:
|
||||||
reply_to = -1
|
reply_to = -1
|
||||||
soup = BeautifulSoup(raw['body'], features = 'html.parser')
|
soup = BeautifulSoup(raw["body"], features="html.parser")
|
||||||
soup.find('div', {'class': 'teaser'}).decompose()
|
soup.find("div", {"class": "teaser"}).decompose()
|
||||||
soup.find('span', {'class': 'more'}).decompose()
|
soup.find("span", {"class": "more"}).decompose()
|
||||||
soup.find('span', {'class': 'less hidden'}).decompose()
|
soup.find("span", {"class": "less hidden"}).decompose()
|
||||||
content = soup.text.strip()
|
content = soup.text.strip()
|
||||||
|
|
||||||
transformed = Post(
|
transformed = Post(
|
||||||
raw_id=data.id,
|
raw_id=data.id,
|
||||||
platform_id=raw['id'],
|
platform_id=raw["id"],
|
||||||
scraper=data.scraper,
|
scraper=data.scraper,
|
||||||
transformer=self.__version__,
|
transformer=self.__version__,
|
||||||
platform=data.platform,
|
platform=data.platform,
|
||||||
@@ -97,20 +132,24 @@ class BitchuteTransformer(Transformer):
|
|||||||
date=data.date,
|
date=data.date,
|
||||||
date_archived=data.date_archived,
|
date_archived=data.date_archived,
|
||||||
date_transformed=datetime.now(timezone.utc),
|
date_transformed=datetime.now(timezone.utc),
|
||||||
url=raw['url'] if raw['url'] else None,
|
url=raw["url"] if raw["url"] else None,
|
||||||
content=content,
|
content=content,
|
||||||
author_id=raw['author_id'],
|
author_id=raw["author_id"],
|
||||||
author_username=raw['author'],
|
author_username=raw["author"],
|
||||||
reply_to=reply_to,
|
reply_to=reply_to,
|
||||||
hashtags = list(filter(None, [h.strip('#') for h in raw['hashtags'].split(',')])),
|
hashtags=list(
|
||||||
likes = raw['likes'],
|
filter(None, [h.strip("#") for h in raw["hashtags"].split(",")])
|
||||||
views = int(raw['views']) if raw.get('views') else None,
|
),
|
||||||
video_title = raw['subject'],
|
likes=raw["likes"],
|
||||||
video_duration = _parse_duration_str(raw['length']))
|
views=int(raw["views"]) if raw.get("views") else None,
|
||||||
|
video_title=raw["subject"],
|
||||||
|
video_duration=_parse_duration_str(raw["length"]),
|
||||||
|
)
|
||||||
|
|
||||||
# insert_post
|
# insert_post
|
||||||
transformed = insert_post(transformed)
|
transformed = insert_post(transformed)
|
||||||
|
|
||||||
|
|
||||||
def parse_created(created: str, date_archived: datetime) -> datetime:
|
def parse_created(created: str, date_archived: datetime) -> datetime:
|
||||||
"""Convert a created string (e.g. ``"1 year, 10 months ago"``) to a datetime
|
"""Convert a created string (e.g. ``"1 year, 10 months ago"``) to a datetime
|
||||||
object relative to the specified ``date_archived``.
|
object relative to the specified ``date_archived``.
|
||||||
@@ -119,19 +158,26 @@ def parse_created(created: str, date_archived: datetime) -> datetime:
|
|||||||
# handle case where `created` string has already been parsed into a datetime
|
# handle case where `created` string has already been parsed into a datetime
|
||||||
return datetime.fromisoformat(created)
|
return datetime.fromisoformat(created)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
period_list = ['year', 'month', 'week', 'day']
|
period_list = ["year", "month", "week", "day"]
|
||||||
|
|
||||||
periods = [period.strip() for period in created.split('ago')[0].strip().split(',')]
|
periods = [
|
||||||
_kwargs = {period : int(number) for period, number in dict(reversed(p.split(' ')) for p in periods).items()}
|
period.strip() for period in created.split("ago")[0].strip().split(",")
|
||||||
kwargs = {(k + 's' if k in period_list else k) : v for k, v in _kwargs.items()}
|
]
|
||||||
|
_kwargs = {
|
||||||
|
period: int(number)
|
||||||
|
for period, number in dict(reversed(p.split(" ")) for p in periods).items()
|
||||||
|
}
|
||||||
|
kwargs = {(k + "s" if k in period_list else k): v for k, v in _kwargs.items()}
|
||||||
|
|
||||||
return date_archived - relativedelta(**kwargs)
|
return date_archived - relativedelta(**kwargs)
|
||||||
|
|
||||||
|
|
||||||
def _parse_duration_str(duration_str: str) -> int:
|
def _parse_duration_str(duration_str: str) -> int:
|
||||||
"""Convert duration string (e.g. '2:27:04') to the number of seconds (e.g. 8824).
|
"""Convert duration string (e.g. '2:27:04') to the number of seconds (e.g. 8824)."""
|
||||||
"""
|
|
||||||
if not duration_str:
|
if not duration_str:
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
duration_list = duration_str.split(':')
|
duration_list = duration_str.split(":")
|
||||||
return sum([int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(duration_list))])
|
return sum(
|
||||||
|
[int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(duration_list))]
|
||||||
|
)
|
||||||
|
|||||||
@@ -8,7 +8,17 @@ from gogettr import PublicClient
|
|||||||
from gogettr.api import GettrApiError
|
from gogettr.api import GettrApiError
|
||||||
|
|
||||||
from cisticola.transformer.base import Transformer
|
from cisticola.transformer.base import Transformer
|
||||||
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
|
from cisticola.base import (
|
||||||
|
RawChannelInfo,
|
||||||
|
ChannelInfo,
|
||||||
|
ScraperResult,
|
||||||
|
Post,
|
||||||
|
Image,
|
||||||
|
Video,
|
||||||
|
Media,
|
||||||
|
Channel,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class GettrTransformer(Transformer):
|
class GettrTransformer(Transformer):
|
||||||
"""A Gettr specific ScraperResult, with a method ETL/transforming"""
|
"""A Gettr specific ScraperResult, with a method ETL/transforming"""
|
||||||
@@ -16,50 +26,58 @@ class GettrTransformer(Transformer):
|
|||||||
__version__ = "GettrTransformer 0.0.1"
|
__version__ = "GettrTransformer 0.0.1"
|
||||||
|
|
||||||
def can_handle(self, data: ScraperResult) -> bool:
|
def can_handle(self, data: ScraperResult) -> bool:
|
||||||
scraper = data.scraper.split(' ')
|
scraper = data.scraper.split(" ")
|
||||||
if scraper[0] == "GettrScraper":
|
if scraper[0] == "GettrScraper":
|
||||||
return True
|
return True
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def transform_info(self, data: RawChannelInfo, insert: Callable, session, channel=None) -> Generator[Union[Post, Channel, Media], None, None]:
|
def transform_info(
|
||||||
|
self, data: RawChannelInfo, insert: Callable, session, channel=None
|
||||||
|
) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||||
raw = json.loads(data.raw_data)
|
raw = json.loads(data.raw_data)
|
||||||
|
|
||||||
transformed = ChannelInfo(
|
transformed = ChannelInfo(
|
||||||
raw_channel_info_id=data.id,
|
raw_channel_info_id=data.id,
|
||||||
channel=data.channel,
|
channel=data.channel,
|
||||||
platform_id=raw['_id'],
|
platform_id=raw["_id"],
|
||||||
platform=data.platform,
|
platform=data.platform,
|
||||||
scraper=data.scraper,
|
scraper=data.scraper,
|
||||||
transformer=self.__version__,
|
transformer=self.__version__,
|
||||||
screenname=raw['username'],
|
screenname=raw["username"],
|
||||||
name=raw['nickname'],
|
name=raw["nickname"],
|
||||||
description=raw.get('dsc'),
|
description=raw.get("dsc"),
|
||||||
description_url=raw.get('website'),
|
description_url=raw.get("website"),
|
||||||
description_location=raw.get('location'),
|
description_location=raw.get("location"),
|
||||||
followers=int(raw['flg']),
|
followers=int(raw["flg"]),
|
||||||
following=int(raw['flw']),
|
following=int(raw["flw"]),
|
||||||
verified=True if raw.get('infl') else False,
|
verified=True if raw.get("infl") else False,
|
||||||
date_created=datetime.fromtimestamp(int(raw['cdate'])*0.001),
|
date_created=datetime.fromtimestamp(int(raw["cdate"]) * 0.001),
|
||||||
date_archived=data.date_archived,
|
date_archived=data.date_archived,
|
||||||
date_transformed=datetime.now(timezone.utc)
|
date_transformed=datetime.now(timezone.utc),
|
||||||
)
|
)
|
||||||
|
|
||||||
transformed = insert(transformed)
|
transformed = insert(transformed)
|
||||||
|
|
||||||
def _get_channel_id(self, username: str, category: str, insert: Callable, session):
|
def _get_channel_id(self, username: str, category: str, insert: Callable, session):
|
||||||
|
channel = (
|
||||||
channel = session.query(Channel).where((func.lower(Channel.screenname)==func.lower(username)) & (Channel.platform == 'Gettr')).first()
|
session.query(Channel)
|
||||||
|
.where(
|
||||||
|
(func.lower(Channel.screenname) == func.lower(username))
|
||||||
|
& (Channel.platform == "Gettr")
|
||||||
|
)
|
||||||
|
.first()
|
||||||
|
)
|
||||||
|
|
||||||
if channel is None:
|
if channel is None:
|
||||||
try:
|
try:
|
||||||
client = PublicClient()
|
client = PublicClient()
|
||||||
profile = client.user_info(username.lower())
|
profile = client.user_info(username.lower())
|
||||||
screenname = profile.get('_id')
|
screenname = profile.get("_id")
|
||||||
channel = Channel(
|
channel = Channel(
|
||||||
name=profile.get('nickname'),
|
name=profile.get("nickname"),
|
||||||
platform_id=screenname,
|
platform_id=screenname,
|
||||||
platform='Gettr',
|
platform="Gettr",
|
||||||
url="https://gettr.com/user/" + screenname,
|
url="https://gettr.com/user/" + screenname,
|
||||||
screenname=screenname,
|
screenname=screenname,
|
||||||
category=category,
|
category=category,
|
||||||
@@ -69,31 +87,41 @@ class GettrTransformer(Transformer):
|
|||||||
channel = Channel(
|
channel = Channel(
|
||||||
name=None,
|
name=None,
|
||||||
platform_id=None,
|
platform_id=None,
|
||||||
platform = 'Gettr',
|
platform="Gettr",
|
||||||
url=None,
|
url=None,
|
||||||
screenname=username,
|
screenname=username,
|
||||||
category=category,
|
category=category,
|
||||||
source=self.__version__,
|
source=self.__version__,
|
||||||
notes='GettrApiError'
|
notes="GettrApiError",
|
||||||
)
|
)
|
||||||
|
|
||||||
channel = insert(channel)
|
channel = insert(channel)
|
||||||
|
|
||||||
return channel.id
|
return channel.id
|
||||||
|
|
||||||
def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]:
|
def transform(
|
||||||
|
self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts
|
||||||
|
) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||||
raw = json.loads(data.raw_data)
|
raw = json.loads(data.raw_data)
|
||||||
|
|
||||||
if raw["activity"]["action"] == "shares_pst":
|
if raw["activity"]["action"] == "shares_pst":
|
||||||
forwarded_from = self._get_channel_id(
|
forwarded_from = self._get_channel_id(
|
||||||
username = str(raw["activity"]["uid"]), category = 'forwarded', insert = insert, session = session)
|
username=str(raw["activity"]["uid"]),
|
||||||
|
category="forwarded",
|
||||||
|
insert=insert,
|
||||||
|
session=session,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
forwarded_from = None
|
forwarded_from = None
|
||||||
|
|
||||||
mentions = []
|
mentions = []
|
||||||
for mentioned_user in raw.get("utgs", []):
|
for mentioned_user in raw.get("utgs", []):
|
||||||
mentioned_id = self._get_channel_id(
|
mentioned_id = self._get_channel_id(
|
||||||
username = mentioned_user, category = 'mentioned', insert = insert, session = session)
|
username=mentioned_user,
|
||||||
|
category="mentioned",
|
||||||
|
insert=insert,
|
||||||
|
session=session,
|
||||||
|
)
|
||||||
mentions.append(mentioned_id)
|
mentions.append(mentioned_id)
|
||||||
|
|
||||||
transformed = Post(
|
transformed = Post(
|
||||||
@@ -114,9 +142,9 @@ class GettrTransformer(Transformer):
|
|||||||
outlinks=list(filter(None, [raw.get("prevsrc")])),
|
outlinks=list(filter(None, [raw.get("prevsrc")])),
|
||||||
forwarded_from=forwarded_from,
|
forwarded_from=forwarded_from,
|
||||||
mentions=mentions,
|
mentions=mentions,
|
||||||
likes = raw.get('lkbpst'),
|
likes=raw.get("lkbpst"),
|
||||||
forwards=raw.get("shbpst"),
|
forwards=raw.get("shbpst"),
|
||||||
views = raw.get('vfpst')
|
views=raw.get("vfpst"),
|
||||||
)
|
)
|
||||||
|
|
||||||
# insert_post
|
# insert_post
|
||||||
|
|||||||
@@ -6,7 +6,17 @@ from datetime import datetime, timezone
|
|||||||
from sqlalchemy import func, JSON, String, cast, text
|
from sqlalchemy import func, JSON, String, cast, text
|
||||||
|
|
||||||
from cisticola.transformer.base import Transformer
|
from cisticola.transformer.base import Transformer
|
||||||
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
|
from cisticola.base import (
|
||||||
|
RawChannelInfo,
|
||||||
|
ChannelInfo,
|
||||||
|
ScraperResult,
|
||||||
|
Post,
|
||||||
|
Image,
|
||||||
|
Video,
|
||||||
|
Media,
|
||||||
|
Channel,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class RumbleTransformer(Transformer):
|
class RumbleTransformer(Transformer):
|
||||||
"""A Rumble specific ScraperResult, with a method ETL/transforming"""
|
"""A Rumble specific ScraperResult, with a method ETL/transforming"""
|
||||||
@@ -14,25 +24,36 @@ class RumbleTransformer(Transformer):
|
|||||||
__version__ = "RumbleTransformer 0.0.1"
|
__version__ = "RumbleTransformer 0.0.1"
|
||||||
|
|
||||||
def can_handle(self, data: ScraperResult) -> bool:
|
def can_handle(self, data: ScraperResult) -> bool:
|
||||||
scraper = data.scraper.split(' ')
|
scraper = data.scraper.split(" ")
|
||||||
if scraper[0] == "RumbleScraper":
|
if scraper[0] == "RumbleScraper":
|
||||||
return True
|
return True
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def transform_info(self, data: RawChannelInfo, insert: Callable, session, channel=None) -> Generator[Union[Post, Channel, Media], None, None]:
|
def transform_info(
|
||||||
|
self, data: RawChannelInfo, insert: Callable, session, channel=None
|
||||||
|
) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||||
raw = json.loads(data.raw_data)
|
raw = json.loads(data.raw_data)
|
||||||
|
|
||||||
if 'id' not in raw:
|
if "id" not in raw:
|
||||||
# The first version of the Rumble ChannelInfo scraper didn't return
|
# The first version of the Rumble ChannelInfo scraper didn't return
|
||||||
# the platform_id, so this is a workaround.
|
# the platform_id, so this is a workaround.
|
||||||
channel = session.query(RawChannelInfo).filter(text("raw_channel_info.raw_data::jsonb ->> 'name'=:name"), RawChannelInfo.platform == 'Rumble').params(name=raw['name']).order_by(RawChannelInfo.date_archived.desc()).first()
|
channel = (
|
||||||
|
session.query(RawChannelInfo)
|
||||||
|
.filter(
|
||||||
|
text("raw_channel_info.raw_data::jsonb ->> 'name'=:name"),
|
||||||
|
RawChannelInfo.platform == "Rumble",
|
||||||
|
)
|
||||||
|
.params(name=raw["name"])
|
||||||
|
.order_by(RawChannelInfo.date_archived.desc())
|
||||||
|
.first()
|
||||||
|
)
|
||||||
if channel is None:
|
if channel is None:
|
||||||
platform_id = None
|
platform_id = None
|
||||||
else:
|
else:
|
||||||
platform_id = json.loads(channel.raw_data)['id']
|
platform_id = json.loads(channel.raw_data)["id"]
|
||||||
else:
|
else:
|
||||||
platform_id = raw['id']
|
platform_id = raw["id"]
|
||||||
|
|
||||||
transformed = ChannelInfo(
|
transformed = ChannelInfo(
|
||||||
raw_channel_info_id=data.id,
|
raw_channel_info_id=data.id,
|
||||||
@@ -42,63 +63,67 @@ class RumbleTransformer(Transformer):
|
|||||||
scraper=data.scraper,
|
scraper=data.scraper,
|
||||||
transformer=self.__version__,
|
transformer=self.__version__,
|
||||||
screenname=platform_id,
|
screenname=platform_id,
|
||||||
name=raw['name'],
|
name=raw["name"],
|
||||||
description='', # does not exist for Rumble
|
description="", # does not exist for Rumble
|
||||||
description_url='', # does not exist for Rumble
|
description_url="", # does not exist for Rumble
|
||||||
description_location='', # does not exist for Rumble
|
description_location="", # does not exist for Rumble
|
||||||
followers=_process_number(raw['subscribers']),
|
followers=_process_number(raw["subscribers"]),
|
||||||
following=-1, # does not exist for Rumble
|
following=-1, # does not exist for Rumble
|
||||||
verified=raw['verified'],
|
verified=raw["verified"],
|
||||||
date_created=None, # does not exist for Rumble
|
date_created=None, # does not exist for Rumble
|
||||||
date_archived=data.date_archived,
|
date_archived=data.date_archived,
|
||||||
date_transformed=datetime.now(timezone.utc)
|
date_transformed=datetime.now(timezone.utc),
|
||||||
)
|
)
|
||||||
|
|
||||||
transformed = insert(transformed)
|
transformed = insert(transformed)
|
||||||
|
|
||||||
|
def transform(
|
||||||
def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]:
|
self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts
|
||||||
|
) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||||
raw = json.loads(data.raw_data)
|
raw = json.loads(data.raw_data)
|
||||||
|
|
||||||
transformed = Post(
|
transformed = Post(
|
||||||
raw_id=data.id,
|
raw_id=data.id,
|
||||||
platform_id=raw['media_url'].strip('/').split('/')[-1],
|
platform_id=raw["media_url"].strip("/").split("/")[-1],
|
||||||
scraper=data.scraper,
|
scraper=data.scraper,
|
||||||
transformer=self.__version__,
|
transformer=self.__version__,
|
||||||
platform=data.platform,
|
platform=data.platform,
|
||||||
channel=data.channel,
|
channel=data.channel,
|
||||||
date=dateutil.parser.parse(raw['datetime']),
|
date=dateutil.parser.parse(raw["datetime"]),
|
||||||
date_archived=data.date_archived,
|
date_archived=data.date_archived,
|
||||||
date_transformed=datetime.now(timezone.utc),
|
date_transformed=datetime.now(timezone.utc),
|
||||||
url=raw['link'],
|
url=raw["link"],
|
||||||
content=raw['content'],
|
content=raw["content"],
|
||||||
author_id=raw['author_id'],
|
author_id=raw["author_id"],
|
||||||
author_username=raw['author_name'],
|
author_username=raw["author_name"],
|
||||||
views = _process_number(raw.get('views')),
|
views=_process_number(raw.get("views")),
|
||||||
likes = _process_number(raw.get('rumbles')),
|
likes=_process_number(raw.get("rumbles")),
|
||||||
video_title = raw['title'],
|
video_title=raw["title"],
|
||||||
video_duration=_parse_duration_str(raw['duration']))
|
video_duration=_parse_duration_str(raw["duration"]),
|
||||||
|
)
|
||||||
|
|
||||||
# insert_post
|
# insert_post
|
||||||
insert_post(transformed)
|
insert_post(transformed)
|
||||||
|
|
||||||
def _process_number(s):
|
|
||||||
|
|
||||||
|
def _process_number(s):
|
||||||
if s is None:
|
if s is None:
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
s = s.replace(' ', '').replace(',','')
|
s = s.replace(" ", "").replace(",", "")
|
||||||
if s.endswith('M'):
|
if s.endswith("M"):
|
||||||
return int(float(s[:-1]) * 1e6)
|
return int(float(s[:-1]) * 1e6)
|
||||||
elif s.endswith('K'):
|
elif s.endswith("K"):
|
||||||
return int(float(s[:-1]) * 1000)
|
return int(float(s[:-1]) * 1000)
|
||||||
return int(s)
|
return int(s)
|
||||||
|
|
||||||
|
|
||||||
def _parse_duration_str(duration_str: str) -> int:
|
def _parse_duration_str(duration_str: str) -> int:
|
||||||
"""Convert duration string (e.g. '2:27:04') to the number of seconds (e.g. 8824).
|
"""Convert duration string (e.g. '2:27:04') to the number of seconds (e.g. 8824)."""
|
||||||
"""
|
|
||||||
if not duration_str:
|
if not duration_str:
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
duration_list = duration_str.split(':')
|
duration_list = duration_str.split(":")
|
||||||
return sum([int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(duration_list))])
|
return sum(
|
||||||
|
[int(s) * int(g) for s, g in zip([1, 60, 3600], reversed(duration_list))]
|
||||||
|
)
|
||||||
|
|||||||
@@ -17,11 +17,21 @@ from datetime import datetime, timezone
|
|||||||
from sqlalchemy import func
|
from sqlalchemy import func
|
||||||
|
|
||||||
from cisticola.transformer.base import Transformer
|
from cisticola.transformer.base import Transformer
|
||||||
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Audio, Media, Channel
|
from cisticola.base import (
|
||||||
|
RawChannelInfo,
|
||||||
|
ChannelInfo,
|
||||||
|
ScraperResult,
|
||||||
|
Post,
|
||||||
|
Image,
|
||||||
|
Video,
|
||||||
|
Audio,
|
||||||
|
Media,
|
||||||
|
Channel,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class TelegramTelethonTransformer(Transformer):
|
class TelegramTelethonTransformer(Transformer):
|
||||||
__version__ = 'TelegramTelethonTransformer 0.0.4'
|
__version__ = "TelegramTelethonTransformer 0.0.4"
|
||||||
|
|
||||||
# TODO cache
|
# TODO cache
|
||||||
# cache channels for which we cannot get the name from the web interface
|
# cache channels for which we cannot get the name from the web interface
|
||||||
@@ -38,7 +48,7 @@ class TelegramTelethonTransformer(Transformer):
|
|||||||
get_screenname_cache = {}
|
get_screenname_cache = {}
|
||||||
|
|
||||||
def can_handle(self, data: ScraperResult) -> bool:
|
def can_handle(self, data: ScraperResult) -> bool:
|
||||||
scraper = data.scraper.split(' ')
|
scraper = data.scraper.split(" ")
|
||||||
if scraper[0] == "TelegramTelethonScraper":
|
if scraper[0] == "TelegramTelethonScraper":
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@@ -47,9 +57,9 @@ class TelegramTelethonTransformer(Transformer):
|
|||||||
def __init__(self, telethon_session_name=None):
|
def __init__(self, telethon_session_name=None):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
api_id = os.environ['TELEGRAM_API_ID']
|
api_id = os.environ["TELEGRAM_API_ID"]
|
||||||
api_hash = os.environ['TELEGRAM_API_HASH']
|
api_hash = os.environ["TELEGRAM_API_HASH"]
|
||||||
phone = os.environ['TELEGRAM_PHONE']
|
phone = os.environ["TELEGRAM_PHONE"]
|
||||||
|
|
||||||
if telethon_session_name is None:
|
if telethon_session_name is None:
|
||||||
telethon_session_name = phone
|
telethon_session_name = phone
|
||||||
@@ -67,7 +77,11 @@ class TelegramTelethonTransformer(Transformer):
|
|||||||
try:
|
try:
|
||||||
data = self.client.get_entity(channel_id)
|
data = self.client.get_entity(channel_id)
|
||||||
if isinstance(data, types.User):
|
if isinstance(data, types.User):
|
||||||
output = (data.username, str(data.first_name or "") + " " + str(data.last_name or ""), "")
|
output = (
|
||||||
|
data.username,
|
||||||
|
str(data.first_name or "") + " " + str(data.last_name or ""),
|
||||||
|
"",
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
output = (data.username, data.title, "")
|
output = (data.username, data.title, "")
|
||||||
except ChannelPrivateError:
|
except ChannelPrivateError:
|
||||||
@@ -85,7 +99,9 @@ class TelegramTelethonTransformer(Transformer):
|
|||||||
|
|
||||||
# this doesn't work for chat channels
|
# this doesn't work for chat channels
|
||||||
if orig_screenname in self.bad_channels:
|
if orig_screenname in self.bad_channels:
|
||||||
logger.debug(f"Skipping screenname because it is not accessible for channel {orig_screenname}")
|
logger.debug(
|
||||||
|
f"Skipping screenname because it is not accessible for channel {orig_screenname}"
|
||||||
|
)
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
logger.info(f"Finding channel from URL {url}")
|
logger.info(f"Finding channel from URL {url}")
|
||||||
@@ -95,7 +111,7 @@ class TelegramTelethonTransformer(Transformer):
|
|||||||
self.bad_channels[orig_screenname] = True
|
self.bad_channels[orig_screenname] = True
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
soup = BeautifulSoup(r.content, features = 'lxml')
|
soup = BeautifulSoup(r.content, features="lxml")
|
||||||
post = soup.findAll("div", {"data-post": orig_screenname + "/" + str(id)})
|
post = soup.findAll("div", {"data-post": orig_screenname + "/" + str(id)})
|
||||||
name = ""
|
name = ""
|
||||||
|
|
||||||
@@ -106,127 +122,173 @@ class TelegramTelethonTransformer(Transformer):
|
|||||||
if decrement > 8:
|
if decrement > 8:
|
||||||
break
|
break
|
||||||
|
|
||||||
logger.info(f"Could not find post from {url}, looking for id {id - decrement}")
|
logger.info(
|
||||||
post = soup.findAll("div", {"data-post" : orig_screenname + "/" + str(id - decrement)})
|
f"Could not find post from {url}, looking for id {id - decrement}"
|
||||||
|
)
|
||||||
|
post = soup.findAll(
|
||||||
|
"div", {"data-post": orig_screenname + "/" + str(id - decrement)}
|
||||||
|
)
|
||||||
|
|
||||||
if len(post) == 0:
|
if len(post) == 0:
|
||||||
logger.warning(f"Could not find post from {url}")
|
logger.warning(f"Could not find post from {url}")
|
||||||
else:
|
else:
|
||||||
fwd_tag = post[0].findAll("a", {"class", "tgme_widget_message_forwarded_from_name"})
|
fwd_tag = post[0].findAll(
|
||||||
|
"a", {"class", "tgme_widget_message_forwarded_from_name"}
|
||||||
|
)
|
||||||
|
|
||||||
if len(fwd_tag) == 0:
|
if len(fwd_tag) == 0:
|
||||||
fwd_tag = post[0].findAll("span", {"class", "tgme_widget_message_forwarded_from_name"})
|
fwd_tag = post[0].findAll(
|
||||||
|
"span", {"class", "tgme_widget_message_forwarded_from_name"}
|
||||||
|
)
|
||||||
|
|
||||||
if len(fwd_tag) >= 1:
|
if len(fwd_tag) >= 1:
|
||||||
name = fwd_tag[0].text
|
name = fwd_tag[0].text
|
||||||
|
|
||||||
return name
|
return name
|
||||||
|
|
||||||
def transform_info(self, data: RawChannelInfo, insert: Callable, session, channel=None) -> Generator[Union[Post, Channel, Media], None, None]:
|
def transform_info(
|
||||||
|
self, data: RawChannelInfo, insert: Callable, session, channel=None
|
||||||
|
) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||||
raw = json.loads(data.raw_data)
|
raw = json.loads(data.raw_data)
|
||||||
|
|
||||||
chat_raw = raw['chats'][0]
|
chat_raw = raw["chats"][0]
|
||||||
|
|
||||||
transformed = ChannelInfo(
|
transformed = ChannelInfo(
|
||||||
raw_channel_info_id=data.id,
|
raw_channel_info_id=data.id,
|
||||||
channel=data.channel,
|
channel=data.channel,
|
||||||
platform_id=raw['full_chat']['id'],
|
platform_id=raw["full_chat"]["id"],
|
||||||
platform=data.platform,
|
platform=data.platform,
|
||||||
scraper=data.scraper,
|
scraper=data.scraper,
|
||||||
transformer=self.__version__,
|
transformer=self.__version__,
|
||||||
screenname=chat_raw['username'],
|
screenname=chat_raw["username"],
|
||||||
name=chat_raw['title'],
|
name=chat_raw["title"],
|
||||||
description=raw['full_chat']['about'],
|
description=raw["full_chat"]["about"],
|
||||||
description_url='', # does not exist for Telegram
|
description_url="", # does not exist for Telegram
|
||||||
description_location='', # does not exist for Telegram
|
description_location="", # does not exist for Telegram
|
||||||
followers=raw['full_chat']['participants_count'],
|
followers=raw["full_chat"]["participants_count"],
|
||||||
following=-1, # does not exist for Telegram
|
following=-1, # does not exist for Telegram
|
||||||
verified=False, # does not exist for Telegram
|
verified=False, # does not exist for Telegram
|
||||||
date_created=dateutil.parser.parse(chat_raw['date']),
|
date_created=dateutil.parser.parse(chat_raw["date"]),
|
||||||
date_archived=data.date_archived,
|
date_archived=data.date_archived,
|
||||||
date_transformed=datetime.now(timezone.utc)
|
date_transformed=datetime.now(timezone.utc),
|
||||||
)
|
)
|
||||||
|
|
||||||
transformed = insert(transformed)
|
transformed = insert(transformed)
|
||||||
|
|
||||||
if channel.platform_id is None:
|
if channel.platform_id is None:
|
||||||
logger.info(f"Missing platform ID on {channel}, setting to {raw['full_chat']['id']}")
|
logger.info(
|
||||||
|
f"Missing platform ID on {channel}, setting to {raw['full_chat']['id']}"
|
||||||
|
)
|
||||||
|
|
||||||
new_channel = session.query(Channel).where(Channel.id == channel.id).one()
|
new_channel = session.query(Channel).where(Channel.id == channel.id).one()
|
||||||
new_channel.platform_id = raw['full_chat']['id']
|
new_channel.platform_id = raw["full_chat"]["id"]
|
||||||
session.flush()
|
session.flush()
|
||||||
session.commit()
|
session.commit()
|
||||||
|
|
||||||
if len(raw['chats']) > 1:
|
if len(raw["chats"]) > 1:
|
||||||
for chat in raw['chats'][1:]:
|
for chat in raw["chats"][1:]:
|
||||||
new_chat = Channel(
|
new_chat = Channel(
|
||||||
name=chat["title"],
|
name=chat["title"],
|
||||||
platform_id=chat["id"],
|
platform_id=chat["id"],
|
||||||
category=channel.category, # this should be the same as the "parent"
|
category=channel.category, # this should be the same as the "parent"
|
||||||
platform=channel.platform, # this should be the same as the "parent"
|
platform=channel.platform, # this should be the same as the "parent"
|
||||||
url=("https://t.me/s/" + chat["username"]) if "username" in chat else "",
|
url=("https://t.me/s/" + chat["username"])
|
||||||
|
if "username" in chat
|
||||||
|
else "",
|
||||||
screenname=chat["username"] if "username" in chat else "",
|
screenname=chat["username"] if "username" in chat else "",
|
||||||
country=channel.country, # this should be the same as the "parent"
|
country=channel.country, # this should be the same as the "parent"
|
||||||
influencer=channel.influencer, # this should be the same as the "parent"
|
influencer=channel.influencer, # this should be the same as the "parent"
|
||||||
public=None,
|
public=None,
|
||||||
chat=not chat["broadcast"],
|
chat=not chat["broadcast"],
|
||||||
notes=channel.id, # this should be the channel ID of the parent
|
notes=channel.id, # this should be the channel ID of the parent
|
||||||
source="linked_channel"
|
source="linked_channel",
|
||||||
)
|
)
|
||||||
|
|
||||||
insert(new_chat)
|
insert(new_chat)
|
||||||
|
|
||||||
# TODO this method API is chaotic and could be cleaned up
|
# TODO this method API is chaotic and could be cleaned up
|
||||||
def transform(self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts) -> Generator[Union[Post, Channel, Media], None, None]:
|
def transform(
|
||||||
|
self, data: ScraperResult, insert: Callable, session, insert_post, flush_posts
|
||||||
|
) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||||
raw = json.loads(data.raw_data)
|
raw = json.loads(data.raw_data)
|
||||||
|
|
||||||
if raw['_'] != 'Message':
|
if raw["_"] != "Message":
|
||||||
logger.warning(f"Cannot convert type {raw['_']} to post")
|
logger.warning(f"Cannot convert type {raw['_']} to post")
|
||||||
return
|
return
|
||||||
|
|
||||||
fwd_from = None
|
fwd_from = None
|
||||||
|
|
||||||
if raw['fwd_from'] and raw['fwd_from']['from_id'] and 'channel_id' in raw['fwd_from']['from_id']:
|
if (
|
||||||
|
raw["fwd_from"]
|
||||||
|
and raw["fwd_from"]["from_id"]
|
||||||
|
and "channel_id" in raw["fwd_from"]["from_id"]
|
||||||
|
):
|
||||||
# use cache to look up channel instead of a DB request if possible
|
# use cache to look up channel instead of a DB request if possible
|
||||||
if str(raw['fwd_from']['from_id']['channel_id']) not in self.channels_cache_by_platformid:
|
if (
|
||||||
channel = session.query(Channel).filter_by(platform_id=str(raw['fwd_from']['from_id']['channel_id']), platform = 'Telegram').first()
|
str(raw["fwd_from"]["from_id"]["channel_id"])
|
||||||
|
not in self.channels_cache_by_platformid
|
||||||
|
):
|
||||||
|
channel = (
|
||||||
|
session.query(Channel)
|
||||||
|
.filter_by(
|
||||||
|
platform_id=str(raw["fwd_from"]["from_id"]["channel_id"]),
|
||||||
|
platform="Telegram",
|
||||||
|
)
|
||||||
|
.first()
|
||||||
|
)
|
||||||
|
|
||||||
if channel is None:
|
if channel is None:
|
||||||
(screenname, name, notes) = self.get_screenname_from_id(raw['fwd_from']['from_id']['channel_id'])
|
(screenname, name, notes) = self.get_screenname_from_id(
|
||||||
|
raw["fwd_from"]["from_id"]["channel_id"]
|
||||||
|
)
|
||||||
|
|
||||||
if name == "":
|
if name == "":
|
||||||
logger.info("Trying fallback web interface")
|
logger.info("Trying fallback web interface")
|
||||||
orig_channel = session.query(Channel).filter_by(id=data.channel).first()
|
orig_channel = (
|
||||||
|
session.query(Channel).filter_by(id=data.channel).first()
|
||||||
|
)
|
||||||
if orig_channel.screenname is not None:
|
if orig_channel.screenname is not None:
|
||||||
name = self.get_name_from_web_interface(orig_channel.screenname, raw['id'])
|
name = self.get_name_from_web_interface(
|
||||||
|
orig_channel.screenname, raw["id"]
|
||||||
|
)
|
||||||
|
|
||||||
channel = Channel(
|
channel = Channel(
|
||||||
name=name,
|
name=name,
|
||||||
platform_id=raw['fwd_from']['from_id']['channel_id'],
|
platform_id=raw["fwd_from"]["from_id"]["channel_id"],
|
||||||
platform=data.platform,
|
platform=data.platform,
|
||||||
url="https://t.me/s/" + screenname if screenname is not None else "",
|
url="https://t.me/s/" + screenname
|
||||||
|
if screenname is not None
|
||||||
|
else "",
|
||||||
screenname=screenname,
|
screenname=screenname,
|
||||||
category='forwarded',
|
category="forwarded",
|
||||||
source=self.__version__,
|
source=self.__version__,
|
||||||
notes=notes
|
notes=notes,
|
||||||
)
|
)
|
||||||
|
|
||||||
channel = insert(channel)
|
channel = insert(channel)
|
||||||
logger.info(f"Added {channel}")
|
logger.info(f"Added {channel}")
|
||||||
|
|
||||||
self.channels_cache_by_platformid[str(raw['fwd_from']['from_id']['channel_id'])] = channel
|
self.channels_cache_by_platformid[
|
||||||
|
str(raw["fwd_from"]["from_id"]["channel_id"])
|
||||||
|
] = channel
|
||||||
|
|
||||||
fwd_from = self.channels_cache_by_platformid[str(raw['fwd_from']['from_id']['channel_id'])].id
|
fwd_from = self.channels_cache_by_platformid[
|
||||||
|
str(raw["fwd_from"]["from_id"]["channel_id"])
|
||||||
|
].id
|
||||||
|
|
||||||
reply_to = None
|
reply_to = None
|
||||||
if raw['reply_to']:
|
if raw["reply_to"]:
|
||||||
reply_to_id = str(raw['reply_to']['reply_to_msg_id'])
|
reply_to_id = str(raw["reply_to"]["reply_to_msg_id"])
|
||||||
|
|
||||||
# use cache to find post ID instead of a DB request, if possible
|
# use cache to find post ID instead of a DB request, if possible
|
||||||
if (data.channel, reply_to_id) not in self.posts_cache:
|
if (data.channel, reply_to_id) not in self.posts_cache:
|
||||||
session.commit()
|
session.commit()
|
||||||
flush_posts() # TODO this is necessary because the post we are looking for might have been added in the same session
|
flush_posts() # TODO this is necessary because the post we are looking for might have been added in the same session
|
||||||
post = session.query(Post).filter_by(channel=data.channel, platform_id=reply_to_id).first()
|
post = (
|
||||||
|
session.query(Post)
|
||||||
|
.filter_by(channel=data.channel, platform_id=reply_to_id)
|
||||||
|
.first()
|
||||||
|
)
|
||||||
if post is None:
|
if post is None:
|
||||||
reply_to = -1
|
reply_to = -1
|
||||||
else:
|
else:
|
||||||
@@ -238,25 +300,36 @@ class TelegramTelethonTransformer(Transformer):
|
|||||||
|
|
||||||
mentions = []
|
mentions = []
|
||||||
|
|
||||||
for mention_entity in [entity for entity in raw['entities'] if entity['_'] == 'MessageEntityMention']:
|
for mention_entity in [
|
||||||
offset = mention_entity['offset']
|
entity
|
||||||
length = mention_entity['length']
|
for entity in raw["entities"]
|
||||||
|
if entity["_"] == "MessageEntityMention"
|
||||||
|
]:
|
||||||
|
offset = mention_entity["offset"]
|
||||||
|
length = mention_entity["length"]
|
||||||
|
|
||||||
screenname = add_surrogate(raw['message'])[offset:offset+length].strip('@').strip()
|
screenname = (
|
||||||
|
add_surrogate(raw["message"])[offset : offset + length]
|
||||||
|
.strip("@")
|
||||||
|
.strip()
|
||||||
|
)
|
||||||
|
|
||||||
# use cache rather than a DB request if possible
|
# use cache rather than a DB request if possible
|
||||||
if screenname.lower() not in self.channels_cache_by_screenname:
|
if screenname.lower() not in self.channels_cache_by_screenname:
|
||||||
channel = session.query(Channel).filter(func.lower(Channel.screenname)==func.lower(screenname)).first()
|
channel = (
|
||||||
|
session.query(Channel)
|
||||||
|
.filter(func.lower(Channel.screenname) == func.lower(screenname))
|
||||||
|
.first()
|
||||||
|
)
|
||||||
|
|
||||||
if channel is None:
|
if channel is None:
|
||||||
|
|
||||||
channel = Channel(
|
channel = Channel(
|
||||||
name=None,
|
name=None,
|
||||||
platform_id=None,
|
platform_id=None,
|
||||||
platform = 'Telegram',
|
platform="Telegram",
|
||||||
url="https://t.me/s/" + screenname,
|
url="https://t.me/s/" + screenname,
|
||||||
screenname=screenname,
|
screenname=screenname,
|
||||||
category='mentioned',
|
category="mentioned",
|
||||||
source=self.__version__,
|
source=self.__version__,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -277,15 +350,15 @@ class TelegramTelethonTransformer(Transformer):
|
|||||||
channel = self.channels_cache_by_id[int(data.channel)]
|
channel = self.channels_cache_by_id[int(data.channel)]
|
||||||
|
|
||||||
if channel is not None and channel.url:
|
if channel is not None and channel.url:
|
||||||
url = channel.url.strip('/') + f"/{raw['id']}"
|
url = channel.url.strip("/") + f"/{raw['id']}"
|
||||||
author_username = channel.screenname
|
author_username = channel.screenname
|
||||||
else:
|
else:
|
||||||
url = ""
|
url = ""
|
||||||
author_username = ""
|
author_username = ""
|
||||||
|
|
||||||
author_id = raw.get('peer_id', {}).get('channel_id')
|
author_id = raw.get("peer_id", {}).get("channel_id")
|
||||||
if raw['from_id'] and 'user_id' in raw['from_id']:
|
if raw["from_id"] and "user_id" in raw["from_id"]:
|
||||||
author_id = raw['from_id']['user_id']
|
author_id = raw["from_id"]["user_id"]
|
||||||
author_username = ""
|
author_username = ""
|
||||||
(screenname, name, notes) = self.get_screenname_from_id(author_id)
|
(screenname, name, notes) = self.get_screenname_from_id(author_id)
|
||||||
if screenname:
|
if screenname:
|
||||||
@@ -293,12 +366,12 @@ class TelegramTelethonTransformer(Transformer):
|
|||||||
|
|
||||||
transformed = Post(
|
transformed = Post(
|
||||||
raw_id=data.id,
|
raw_id=data.id,
|
||||||
platform_id = raw['id'],
|
platform_id=raw["id"],
|
||||||
scraper=data.scraper,
|
scraper=data.scraper,
|
||||||
transformer=self.__version__,
|
transformer=self.__version__,
|
||||||
platform=data.platform,
|
platform=data.platform,
|
||||||
channel=data.channel,
|
channel=data.channel,
|
||||||
date=dateutil.parser.parse(raw['date']),
|
date=dateutil.parser.parse(raw["date"]),
|
||||||
date_archived=data.date_archived,
|
date_archived=data.date_archived,
|
||||||
date_transformed=datetime.now(timezone.utc),
|
date_transformed=datetime.now(timezone.utc),
|
||||||
url=url,
|
url=url,
|
||||||
@@ -308,47 +381,56 @@ class TelegramTelethonTransformer(Transformer):
|
|||||||
forwarded_from=fwd_from,
|
forwarded_from=fwd_from,
|
||||||
reply_to=reply_to,
|
reply_to=reply_to,
|
||||||
mentions=mentions,
|
mentions=mentions,
|
||||||
forwards = raw.get('forwards'),
|
forwards=raw.get("forwards"),
|
||||||
views = raw.get('views')
|
views=raw.get("views"),
|
||||||
)
|
)
|
||||||
|
|
||||||
# insert_post
|
# insert_post
|
||||||
insert_post(transformed)
|
insert_post(transformed)
|
||||||
|
|
||||||
|
|
||||||
def stripped(s):
|
def stripped(s):
|
||||||
"""https://stackoverflow.com/a/29933716"""
|
"""https://stackoverflow.com/a/29933716"""
|
||||||
|
|
||||||
lstripped = ''.join(takewhile(str.isspace, s))
|
lstripped = "".join(takewhile(str.isspace, s))
|
||||||
rstripped = ''.join(reversed(tuple(takewhile(str.isspace, reversed(s)))))
|
rstripped = "".join(reversed(tuple(takewhile(str.isspace, reversed(s)))))
|
||||||
|
|
||||||
return lstripped + rstripped
|
return lstripped + rstripped
|
||||||
|
|
||||||
|
|
||||||
def add_markdown_links(raw_post):
|
def add_markdown_links(raw_post):
|
||||||
"""This function is necessary because Telethon's markdown.unparse doesn't
|
"""This function is necessary because Telethon's markdown.unparse doesn't
|
||||||
correctly handle trailing whitespace or multi-line links"""
|
correctly handle trailing whitespace or multi-line links"""
|
||||||
|
|
||||||
global_offset = 0
|
global_offset = 0
|
||||||
transformed_content = add_surrogate(raw_post['message'])
|
transformed_content = add_surrogate(raw_post["message"])
|
||||||
links = [entity for entity in raw_post['entities'] if entity['_'] == 'MessageEntityTextUrl']
|
links = [
|
||||||
|
entity
|
||||||
|
for entity in raw_post["entities"]
|
||||||
|
if entity["_"] == "MessageEntityTextUrl"
|
||||||
|
]
|
||||||
|
|
||||||
for link in links:
|
for link in links:
|
||||||
offset = global_offset + link['offset']
|
offset = global_offset + link["offset"]
|
||||||
length = link['length']
|
length = link["length"]
|
||||||
url = link['url']
|
url = link["url"]
|
||||||
|
|
||||||
before_link = transformed_content[:offset]
|
before_link = transformed_content[:offset]
|
||||||
inner_text = transformed_content[offset : offset + length]
|
inner_text = transformed_content[offset : offset + length]
|
||||||
|
|
||||||
# skip creation of link if inner link text is only whitespace
|
# skip creation of link if inner link text is only whitespace
|
||||||
if inner_text.replace('\u200b', '').strip():
|
if inner_text.replace("\u200b", "").strip():
|
||||||
|
processed_inner_text = inner_text.strip().replace("\n", "\\\n")
|
||||||
processed_inner_text = inner_text.strip().replace('\n', '\\\n')
|
|
||||||
link_text = f"[{processed_inner_text}]"
|
link_text = f"[{processed_inner_text}]"
|
||||||
trailing_whitespace = stripped(transformed_content[offset:offset+length])
|
trailing_whitespace = stripped(
|
||||||
|
transformed_content[offset : offset + length]
|
||||||
|
)
|
||||||
link_href = f"({url})"
|
link_href = f"({url})"
|
||||||
after_link = transformed_content[offset + length :]
|
after_link = transformed_content[offset + length :]
|
||||||
|
|
||||||
transformed_content = before_link + link_text + link_href + trailing_whitespace + after_link
|
transformed_content = (
|
||||||
global_offset += (4 + len(url) + inner_text.strip().count('\n'))
|
before_link + link_text + link_href + trailing_whitespace + after_link
|
||||||
|
)
|
||||||
|
global_offset += 4 + len(url) + inner_text.strip().count("\n")
|
||||||
|
|
||||||
return del_surrogate(transformed_content)
|
return del_surrogate(transformed_content)
|
||||||
@@ -2,8 +2,8 @@ import requests
|
|||||||
from loguru import logger
|
from loguru import logger
|
||||||
import time
|
import time
|
||||||
|
|
||||||
def make_request(url, headers = None, max_retries = 5, break_codes = None):
|
|
||||||
|
|
||||||
|
def make_request(url, headers=None, max_retries=5, break_codes=None):
|
||||||
"""Retry request `max_retries` times, while catching arbitrary exceptions.
|
"""Retry request `max_retries` times, while catching arbitrary exceptions.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
@@ -33,20 +33,17 @@ def make_request(url, headers = None, max_retries = 5, break_codes = None):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
r = request_until_200(
|
r = request_until_200(
|
||||||
url = url,
|
url=url, headers=headers, max_retries=max_retries, break_codes=break_codes
|
||||||
headers = headers,
|
)
|
||||||
max_retries = max_retries,
|
|
||||||
break_codes = break_codes)
|
|
||||||
logger.debug(f"Request for url: {url} succeeded")
|
logger.debug(f"Request for url: {url} succeeded")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Request for url: {url} raised exception: [{e}]")
|
logger.warning(f"Request for url: {url} raised exception: [{e}]")
|
||||||
|
|
||||||
return r
|
return r
|
||||||
|
|
||||||
def request_until_200(url, headers = None, max_retries = 5, break_codes = None):
|
|
||||||
|
|
||||||
"""Retry request `max_retries` times, or until the request is successful.
|
def request_until_200(url, headers=None, max_retries=5, break_codes=None):
|
||||||
"""
|
"""Retry request `max_retries` times, or until the request is successful."""
|
||||||
|
|
||||||
if break_codes is None:
|
if break_codes is None:
|
||||||
break_codes = [200]
|
break_codes = [200]
|
||||||
@@ -57,7 +54,9 @@ def request_until_200(url, headers = None, max_retries = 5, break_codes = None):
|
|||||||
r = requests.get(url, headers=headers)
|
r = requests.get(url, headers=headers)
|
||||||
|
|
||||||
while r.status_code not in break_codes and n_retries < 5:
|
while r.status_code not in break_codes and n_retries < 5:
|
||||||
logger.warning(f"Request for url: {url} returned status: {r.status_code} on attempt: {n_retries}/{max_retries}")
|
logger.warning(
|
||||||
|
f"Request for url: {url} returned status: {r.status_code} on attempt: {n_retries}/{max_retries}"
|
||||||
|
)
|
||||||
n_retries += 1
|
n_retries += 1
|
||||||
|
|
||||||
# back off subsequent requests
|
# back off subsequent requests
|
||||||
@@ -65,6 +64,8 @@ def request_until_200(url, headers = None, max_retries = 5, break_codes = None):
|
|||||||
r = requests.get(url, headers=headers)
|
r = requests.get(url, headers=headers)
|
||||||
|
|
||||||
if r.status_code not in break_codes:
|
if r.status_code not in break_codes:
|
||||||
raise ValueError(f"Request for url: {url} failed with status: {r.status_code} after {max_retries} attempts")
|
raise ValueError(
|
||||||
|
f"Request for url: {url} failed with status: {r.status_code} after {max_retries} attempts"
|
||||||
|
)
|
||||||
|
|
||||||
return r
|
return r
|
||||||
@@ -12,14 +12,15 @@
|
|||||||
#
|
#
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
sys.path.insert(0, os.path.abspath('../../'))
|
|
||||||
|
sys.path.insert(0, os.path.abspath("../../"))
|
||||||
|
|
||||||
|
|
||||||
# -- Project information -----------------------------------------------------
|
# -- Project information -----------------------------------------------------
|
||||||
|
|
||||||
project = 'Cisticola'
|
project = "Cisticola"
|
||||||
copyright = '2022, Bellingcat'
|
copyright = "2022, Bellingcat"
|
||||||
author = 'Bellingcat'
|
author = "Bellingcat"
|
||||||
|
|
||||||
|
|
||||||
# -- General configuration ---------------------------------------------------
|
# -- General configuration ---------------------------------------------------
|
||||||
@@ -27,10 +28,10 @@ author = 'Bellingcat'
|
|||||||
# Add any Sphinx extension module names here, as strings. They can be
|
# Add any Sphinx extension module names here, as strings. They can be
|
||||||
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
||||||
# ones.
|
# ones.
|
||||||
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.coverage', 'sphinx.ext.napoleon']
|
extensions = ["sphinx.ext.autodoc", "sphinx.ext.coverage", "sphinx.ext.napoleon"]
|
||||||
|
|
||||||
# Add any paths that contain templates here, relative to this directory.
|
# Add any paths that contain templates here, relative to this directory.
|
||||||
templates_path = ['_templates']
|
templates_path = ["_templates"]
|
||||||
|
|
||||||
# List of patterns, relative to source directory, that match files and
|
# List of patterns, relative to source directory, that match files and
|
||||||
# directories to ignore when looking for source files.
|
# directories to ignore when looking for source files.
|
||||||
@@ -43,7 +44,7 @@ exclude_patterns = []
|
|||||||
# The theme to use for HTML and HTML Help pages. See the documentation for
|
# The theme to use for HTML and HTML Help pages. See the documentation for
|
||||||
# a list of builtin themes.
|
# a list of builtin themes.
|
||||||
#
|
#
|
||||||
html_theme = 'sphinx_rtd_theme'
|
html_theme = "sphinx_rtd_theme"
|
||||||
|
|
||||||
# Add any paths that contain custom static files (such as style sheets) here,
|
# Add any paths that contain custom static files (such as style sheets) here,
|
||||||
# relative to this directory. They are copied after the builtin static files,
|
# relative to this directory. They are copied after the builtin static files,
|
||||||
@@ -52,9 +53,9 @@ html_static_path = []
|
|||||||
|
|
||||||
# -- Default flags for autodoc------------------------------------------------
|
# -- Default flags for autodoc------------------------------------------------
|
||||||
|
|
||||||
autodoc_default_options = {'exclude-members': '_sa_class_manager'}
|
autodoc_default_options = {"exclude-members": "_sa_class_manager"}
|
||||||
|
|
||||||
html_favicon = '../images/favicon.ico'
|
html_favicon = "../images/favicon.ico"
|
||||||
html_logo = '../images/cisticola_logo.svg'
|
html_logo = "../images/cisticola_logo.svg"
|
||||||
|
|
||||||
html_theme_options = {'style_nav_header_background': '#292a2b'}
|
html_theme_options = {"style_nav_header_background": "#292a2b"}
|
||||||
|
|||||||
@@ -20,10 +20,12 @@ expected_headers = [
|
|||||||
"chat",
|
"chat",
|
||||||
"notes",
|
"notes",
|
||||||
"normalized_url",
|
"normalized_url",
|
||||||
"to_remove"]
|
"to_remove",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def standardize_country(s):
|
def standardize_country(s):
|
||||||
_s = s.split('(')[0].split('?')[0]
|
_s = s.split("(")[0].split("?")[0]
|
||||||
return _s.strip()
|
return _s.strip()
|
||||||
|
|
||||||
|
|
||||||
@@ -33,7 +35,7 @@ def sync_channels(args, session):
|
|||||||
gc = gspread.service_account(filename="service_account.json")
|
gc = gspread.service_account(filename="service_account.json")
|
||||||
|
|
||||||
# Open a sheet from a spreadsheet in one go
|
# Open a sheet from a spreadsheet in one go
|
||||||
wks = gc.open_by_url(os.environ['GSHEET']).worksheet("channels")
|
wks = gc.open_by_url(os.environ["GSHEET"]).worksheet("channels")
|
||||||
channels = wks.get_all_records(expected_headers=expected_headers)
|
channels = wks.get_all_records(expected_headers=expected_headers)
|
||||||
row = 2
|
row = 2
|
||||||
|
|
||||||
@@ -65,22 +67,30 @@ def sync_channels(args, session):
|
|||||||
if c["platform_id"] != "":
|
if c["platform_id"] != "":
|
||||||
platform_id = c["platform_id"]
|
platform_id = c["platform_id"]
|
||||||
|
|
||||||
|
channel = (
|
||||||
|
session.query(Channel)
|
||||||
|
.filter_by(platform_id=str(platform_id), platform=str(c["platform"]))
|
||||||
|
.first()
|
||||||
|
)
|
||||||
|
|
||||||
|
if not channel:
|
||||||
|
channel = (
|
||||||
|
session.query(Channel)
|
||||||
|
.filter_by(platform=str(c["platform"]), url=str(c["url"]))
|
||||||
|
.first()
|
||||||
|
)
|
||||||
|
|
||||||
|
if not channel and c["screenname"] != "" and c["screenname"] is not None:
|
||||||
channel = (
|
channel = (
|
||||||
session.query(Channel)
|
session.query(Channel)
|
||||||
.filter_by(
|
.filter_by(
|
||||||
platform_id=str(platform_id), platform=str(c["platform"])
|
platform=str(c["platform"]), screenname=str(c["screenname"])
|
||||||
)
|
)
|
||||||
.first()
|
.first()
|
||||||
)
|
)
|
||||||
|
|
||||||
if not channel:
|
if not channel:
|
||||||
channel = session.query(Channel).filter_by(platform=str(c["platform"]), url=str(c["url"])).first()
|
if all([k in [None, True, False, ""] for k in c.values()]):
|
||||||
|
|
||||||
if not channel and c["screenname"] != '' and c["screenname"] is not None:
|
|
||||||
channel = session.query(Channel).filter_by(platform=str(c["platform"]), screenname=str(c["screenname"])).first()
|
|
||||||
|
|
||||||
if not channel:
|
|
||||||
if all([k in [None, True, False, ''] for k in c.values()]):
|
|
||||||
# end sync if completely empty row is encountered
|
# end sync if completely empty row is encountered
|
||||||
break
|
break
|
||||||
|
|
||||||
@@ -109,7 +119,11 @@ def sync_channels(args, session):
|
|||||||
if c["screenname"]:
|
if c["screenname"]:
|
||||||
channel.screenname = c["screenname"]
|
channel.screenname = c["screenname"]
|
||||||
if c["country"]:
|
if c["country"]:
|
||||||
channel.country = None if c["country"] is None else list(map(standardize_country, c["country"].split('/')))
|
channel.country = (
|
||||||
|
None
|
||||||
|
if c["country"] is None
|
||||||
|
else list(map(standardize_country, c["country"].split("/")))
|
||||||
|
)
|
||||||
if c["influencer"]:
|
if c["influencer"]:
|
||||||
channel.influencer = c["influencer"]
|
channel.influencer = c["influencer"]
|
||||||
if c["public"]:
|
if c["public"]:
|
||||||
@@ -129,23 +143,27 @@ def sync_channels(args, session):
|
|||||||
|
|
||||||
# this likely means that the channel was duplicated in the Google Sheet, so add a red highlight
|
# this likely means that the channel was duplicated in the Google Sheet, so add a red highlight
|
||||||
if was_researcher:
|
if was_researcher:
|
||||||
logger.warning(f"This channel (ID {channel.id}) is possibly a duplicate.")
|
logger.warning(
|
||||||
|
f"This channel (ID {channel.id}) is possibly a duplicate."
|
||||||
|
)
|
||||||
|
|
||||||
wks.format(f"A{str(row)}:A{str(row)}", {
|
wks.format(
|
||||||
"backgroundColor": {
|
f"A{str(row)}:A{str(row)}",
|
||||||
"red": 1.0,
|
{"backgroundColor": {"red": 1.0, "green": 0.0, "blue": 0.0}},
|
||||||
"green": 0.0,
|
)
|
||||||
"blue": 0.0
|
|
||||||
}})
|
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|
||||||
|
|
||||||
# channel has ID
|
# channel has ID
|
||||||
else:
|
else:
|
||||||
cid = int(c["id"])
|
cid = int(c["id"])
|
||||||
|
|
||||||
channel = session.query(Channel).filter_by(id=cid).first()
|
channel = session.query(Channel).filter_by(id=cid).first()
|
||||||
channel_info = session.query(ChannelInfo).filter_by(channel=cid).order_by(ChannelInfo.date_archived.desc()).first()
|
channel_info = (
|
||||||
|
session.query(ChannelInfo)
|
||||||
|
.filter_by(channel=cid)
|
||||||
|
.order_by(ChannelInfo.date_archived.desc())
|
||||||
|
.first()
|
||||||
|
)
|
||||||
|
|
||||||
logger.info(f"Updating channel {channel}")
|
logger.info(f"Updating channel {channel}")
|
||||||
logger.info(f"Found info {channel_info}")
|
logger.info(f"Found info {channel_info}")
|
||||||
@@ -155,7 +173,11 @@ def sync_channels(args, session):
|
|||||||
channel.platform = c["platform"]
|
channel.platform = c["platform"]
|
||||||
channel.url = c["url"]
|
channel.url = c["url"]
|
||||||
channel.screenname = c["screenname"]
|
channel.screenname = c["screenname"]
|
||||||
channel.country = None if c["country"] is None else list(map(standardize_country, c["country"].split('/')))
|
channel.country = (
|
||||||
|
None
|
||||||
|
if c["country"] is None
|
||||||
|
else list(map(standardize_country, c["country"].split("/")))
|
||||||
|
)
|
||||||
channel.influencer = c["influencer"]
|
channel.influencer = c["influencer"]
|
||||||
channel.public = c["public"]
|
channel.public = c["public"]
|
||||||
channel.chat = c["chat"]
|
channel.chat = c["chat"]
|
||||||
@@ -167,7 +189,9 @@ def sync_channels(args, session):
|
|||||||
wks.update_cell(row, 7, channel_info.screenname)
|
wks.update_cell(row, 7, channel_info.screenname)
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|
||||||
if channel_info and str(channel.platform_id) != str(channel_info.platform_id):
|
if channel_info and str(channel.platform_id) != str(
|
||||||
|
channel_info.platform_id
|
||||||
|
):
|
||||||
channel.platform_id = channel_info.platform_id
|
channel.platform_id = channel_info.platform_id
|
||||||
wks.update_cell(row, 3, channel_info.platform_id)
|
wks.update_cell(row, 3, channel_info.platform_id)
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|||||||
@@ -8,9 +8,9 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
api_id = os.environ['TELEGRAM_API_ID']
|
api_id = os.environ["TELEGRAM_API_ID"]
|
||||||
api_hash = os.environ['TELEGRAM_API_HASH']
|
api_hash = os.environ["TELEGRAM_API_HASH"]
|
||||||
phone = os.environ['TELEGRAM_PHONE']
|
phone = os.environ["TELEGRAM_PHONE"]
|
||||||
telethon_session_name = args.telethon_session
|
telethon_session_name = args.telethon_session
|
||||||
|
|
||||||
if telethon_session_name is None:
|
if telethon_session_name is None:
|
||||||
|
|||||||
@@ -1,49 +1,51 @@
|
|||||||
import pytest
|
import pytest
|
||||||
from sqlalchemy.sql import text
|
from sqlalchemy.sql import text
|
||||||
|
|
||||||
from cisticola.base import Post, Channel, ChannelInfo, Media, ScraperResult, RawChannelInfo
|
from cisticola.base import (
|
||||||
|
Post,
|
||||||
|
Channel,
|
||||||
|
ChannelInfo,
|
||||||
|
Media,
|
||||||
|
ScraperResult,
|
||||||
|
RawChannelInfo,
|
||||||
|
)
|
||||||
from cisticola.scraper import (
|
from cisticola.scraper import (
|
||||||
TelegramTelethonScraper,
|
TelegramTelethonScraper,
|
||||||
BitchuteScraper,
|
BitchuteScraper,
|
||||||
GettrScraper,
|
GettrScraper,
|
||||||
RumbleScraper)
|
RumbleScraper,
|
||||||
|
)
|
||||||
from cisticola.transformer import (
|
from cisticola.transformer import (
|
||||||
TelegramTelethonTransformer,
|
TelegramTelethonTransformer,
|
||||||
BitchuteTransformer,
|
BitchuteTransformer,
|
||||||
GettrTransformer,
|
GettrTransformer,
|
||||||
RumbleTransformer)
|
RumbleTransformer,
|
||||||
|
)
|
||||||
|
|
||||||
CONTROLLERS = {
|
CONTROLLERS = {
|
||||||
'telegram' : {
|
"telegram": {
|
||||||
'scraper': TelegramTelethonScraper,
|
"scraper": TelegramTelethonScraper,
|
||||||
'transformer': TelegramTelethonTransformer
|
"transformer": TelegramTelethonTransformer,
|
||||||
},
|
},
|
||||||
'bitchute': {
|
"bitchute": {"scraper": BitchuteScraper, "transformer": BitchuteTransformer},
|
||||||
'scraper': BitchuteScraper,
|
"gettr": {"scraper": GettrScraper, "transformer": GettrTransformer},
|
||||||
'transformer': BitchuteTransformer
|
"rumble": {"scraper": RumbleScraper, "transformer": RumbleTransformer},
|
||||||
},
|
|
||||||
'gettr': {
|
|
||||||
'scraper': GettrScraper,
|
|
||||||
'transformer': GettrTransformer
|
|
||||||
},
|
|
||||||
'rumble': {
|
|
||||||
'scraper': RumbleScraper,
|
|
||||||
'transformer': RumbleTransformer
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('platform', ['telegram','bitchute', 'gettr', 'rumble'])
|
@pytest.mark.parametrize("platform", ["telegram", "bitchute", "gettr", "rumble"])
|
||||||
def test_scraper_and_transformer(platform, session, controller, etl_controller, channel_kwargs):
|
def test_scraper_and_transformer(
|
||||||
|
platform, session, controller, etl_controller, channel_kwargs
|
||||||
|
):
|
||||||
controller.reset_db()
|
controller.reset_db()
|
||||||
controller.remove_all_scrapers()
|
controller.remove_all_scrapers()
|
||||||
|
|
||||||
# necessary for comments/replies to be processed correctly
|
# necessary for comments/replies to be processed correctly
|
||||||
session.execute(text('INSERT INTO posts(id) VALUES (-1)'))
|
session.execute(text("INSERT INTO posts(id) VALUES (-1)"))
|
||||||
session.commit()
|
session.commit()
|
||||||
|
|
||||||
channels = [Channel(**channel_kwargs[platform])]
|
channels = [Channel(**channel_kwargs[platform])]
|
||||||
scraper = CONTROLLERS[platform]['scraper']
|
scraper = CONTROLLERS[platform]["scraper"]
|
||||||
controller.register_scraper(scraper=scraper())
|
controller.register_scraper(scraper=scraper())
|
||||||
|
|
||||||
controller.scrape_channels(channels=channels)
|
controller.scrape_channels(channels=channels)
|
||||||
@@ -52,7 +54,11 @@ def test_scraper_and_transformer(platform, session, controller, etl_controller,
|
|||||||
|
|
||||||
raw_posts = session.query(ScraperResult).all()
|
raw_posts = session.query(ScraperResult).all()
|
||||||
raw_channel_info = session.query(RawChannelInfo).all()
|
raw_channel_info = session.query(RawChannelInfo).all()
|
||||||
archived_urls = session.query(ScraperResult.archived_urls).order_by(ScraperResult.date_archived.desc()).first()
|
archived_urls = (
|
||||||
|
session.query(ScraperResult.archived_urls)
|
||||||
|
.order_by(ScraperResult.date_archived.desc())
|
||||||
|
.first()
|
||||||
|
)
|
||||||
|
|
||||||
assert len(raw_posts) > 0
|
assert len(raw_posts) > 0
|
||||||
assert len(raw_channel_info) > 0
|
assert len(raw_channel_info) > 0
|
||||||
@@ -60,7 +66,7 @@ def test_scraper_and_transformer(platform, session, controller, etl_controller,
|
|||||||
|
|
||||||
controller.remove_all_scrapers()
|
controller.remove_all_scrapers()
|
||||||
|
|
||||||
transformer = CONTROLLERS[platform]['transformer']
|
transformer = CONTROLLERS[platform]["transformer"]
|
||||||
|
|
||||||
etl_controller.register_transformer(transformer())
|
etl_controller.register_transformer(transformer())
|
||||||
etl_controller.transform_all_untransformed()
|
etl_controller.transform_all_untransformed()
|
||||||
|
|||||||
@@ -8,161 +8,172 @@ from cisticola.scraper import ScraperController
|
|||||||
from cisticola.transformer import ETLController
|
from cisticola.transformer import ETLController
|
||||||
|
|
||||||
BITCHUTE_CHANNEL_KWARGS = {
|
BITCHUTE_CHANNEL_KWARGS = {
|
||||||
'name': 'bestonlinejewelrystoresusa@gmail.com (test)',
|
"name": "bestonlinejewelrystoresusa@gmail.com (test)",
|
||||||
'platform_id': 'bestonlinejewelrystoresusagmailcom',
|
"platform_id": "bestonlinejewelrystoresusagmailcom",
|
||||||
'category': 'test',
|
"category": "test",
|
||||||
'platform': 'Bitchute',
|
"platform": "Bitchute",
|
||||||
'url': 'https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/',
|
"url": "https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/",
|
||||||
'screenname': None,
|
"screenname": None,
|
||||||
'country': 'US',
|
"country": "US",
|
||||||
'influencer': None,
|
"influencer": None,
|
||||||
'public': True,
|
"public": True,
|
||||||
'chat': False,
|
"chat": False,
|
||||||
'notes': '',
|
"notes": "",
|
||||||
'source': 'researcher'}
|
"source": "researcher",
|
||||||
|
}
|
||||||
|
|
||||||
GAB_CHANNEL_KWARGS = {
|
GAB_CHANNEL_KWARGS = {
|
||||||
'name': 'Capt. Marc Simon (test)',
|
"name": "Capt. Marc Simon (test)",
|
||||||
'platform_id': 'marc_capt',
|
"platform_id": "marc_capt",
|
||||||
'category': 'test',
|
"category": "test",
|
||||||
'platform': 'Gab',
|
"platform": "Gab",
|
||||||
'url': 'https://gab.com/marc_capt',
|
"url": "https://gab.com/marc_capt",
|
||||||
'screenname': 'marc_capt',
|
"screenname": "marc_capt",
|
||||||
'country': 'CA',
|
"country": "CA",
|
||||||
'influencer': None,
|
"influencer": None,
|
||||||
'public': True,
|
"public": True,
|
||||||
'chat': False,
|
"chat": False,
|
||||||
'notes': '',
|
"notes": "",
|
||||||
'source': 'researcher'}
|
"source": "researcher",
|
||||||
|
}
|
||||||
|
|
||||||
GAB_GROUP_KWARGS = {
|
GAB_GROUP_KWARGS = {
|
||||||
'name': 'iran group (test)',
|
"name": "iran group (test)",
|
||||||
'platform_id': "10001",
|
"platform_id": "10001",
|
||||||
'category': 'test',
|
"category": "test",
|
||||||
'platform': 'Gab',
|
"platform": "Gab",
|
||||||
'url': 'https://gab.com/groups/10001',
|
"url": "https://gab.com/groups/10001",
|
||||||
'screenname': 'iran group',
|
"screenname": "iran group",
|
||||||
'country': 'IR',
|
"country": "IR",
|
||||||
'influencer': None,
|
"influencer": None,
|
||||||
'public': True,
|
"public": True,
|
||||||
'chat': True,
|
"chat": True,
|
||||||
'notes': '',
|
"notes": "",
|
||||||
'source': 'researcher'}
|
"source": "researcher",
|
||||||
|
}
|
||||||
|
|
||||||
GETTR_CHANNEL_KWARGS = {
|
GETTR_CHANNEL_KWARGS = {
|
||||||
'name': 'LizardRepublic (test)',
|
"name": "LizardRepublic (test)",
|
||||||
'platform_id': 'lizardrepublic',
|
"platform_id": "lizardrepublic",
|
||||||
'category': 'test',
|
"category": "test",
|
||||||
'platform': 'Gettr',
|
"platform": "Gettr",
|
||||||
'url': 'https://www.gettr.com/user/lizardrepublic',
|
"url": "https://www.gettr.com/user/lizardrepublic",
|
||||||
'screenname': 'lizardrepublic',
|
"screenname": "lizardrepublic",
|
||||||
'country': 'US',
|
"country": "US",
|
||||||
'influencer': None,
|
"influencer": None,
|
||||||
'public': True,
|
"public": True,
|
||||||
'chat': False,
|
"chat": False,
|
||||||
'notes': '',
|
"notes": "",
|
||||||
'source': 'researcher'}
|
"source": "researcher",
|
||||||
|
}
|
||||||
|
|
||||||
INSTAGRAM_CHANNEL_KWARGS = {
|
INSTAGRAM_CHANNEL_KWARGS = {
|
||||||
'name': 'borland.88 (test)',
|
"name": "borland.88 (test)",
|
||||||
'platform_id': 'borland.88',
|
"platform_id": "borland.88",
|
||||||
'category': 'test',
|
"category": "test",
|
||||||
'platform': 'Instagram',
|
"platform": "Instagram",
|
||||||
'url': 'https://www.instagram.com/borland.88/',
|
"url": "https://www.instagram.com/borland.88/",
|
||||||
'screenname': 'borland.88',
|
"screenname": "borland.88",
|
||||||
'country': 'UA',
|
"country": "UA",
|
||||||
'influencer': None,
|
"influencer": None,
|
||||||
'public': True,
|
"public": True,
|
||||||
'chat': False,
|
"chat": False,
|
||||||
'notes': '',
|
"notes": "",
|
||||||
'source': 'researcher'}
|
"source": "researcher",
|
||||||
|
}
|
||||||
|
|
||||||
ODYSEE_CHANNEL_KWARGS = {
|
ODYSEE_CHANNEL_KWARGS = {
|
||||||
'name': "Mak1n' Bacon (test)",
|
"name": "Mak1n' Bacon (test)",
|
||||||
'platform_id': 'Mak1nBacon',
|
"platform_id": "Mak1nBacon",
|
||||||
'category': 'test',
|
"category": "test",
|
||||||
'platform': 'Odysee',
|
"platform": "Odysee",
|
||||||
'url': 'https://odysee.com/@Mak1nBacon',
|
"url": "https://odysee.com/@Mak1nBacon",
|
||||||
'screenname': 'Mak1nBacon',
|
"screenname": "Mak1nBacon",
|
||||||
'country': 'US',
|
"country": "US",
|
||||||
'influencer': None,
|
"influencer": None,
|
||||||
'public': True,
|
"public": True,
|
||||||
'chat': False,
|
"chat": False,
|
||||||
'notes': '',
|
"notes": "",
|
||||||
'source': 'researcher'}
|
"source": "researcher",
|
||||||
|
}
|
||||||
|
|
||||||
RUMBLE_CHANNEL_KWARGS = {
|
RUMBLE_CHANNEL_KWARGS = {
|
||||||
'name': 'we are uploading videos wow products (test)',
|
"name": "we are uploading videos wow products (test)",
|
||||||
'platform_id': 'c-916305',
|
"platform_id": "c-916305",
|
||||||
'category': 'test',
|
"category": "test",
|
||||||
'platform': 'Rumble',
|
"platform": "Rumble",
|
||||||
'url': 'https://rumble.com/c/c-916305',
|
"url": "https://rumble.com/c/c-916305",
|
||||||
'screenname': 'we are uploading',
|
"screenname": "we are uploading",
|
||||||
'country': 'CA',
|
"country": "CA",
|
||||||
'influencer': None,
|
"influencer": None,
|
||||||
'public': True,
|
"public": True,
|
||||||
'chat': False,
|
"chat": False,
|
||||||
'notes': '',
|
"notes": "",
|
||||||
'source': 'researcher'}
|
"source": "researcher",
|
||||||
|
}
|
||||||
|
|
||||||
TELEGRAM_CHANNEL_KWARGS = {
|
TELEGRAM_CHANNEL_KWARGS = {
|
||||||
'name': 'Бутылка (test)',
|
"name": "Бутылка (test)",
|
||||||
'platform_id': "-1001760492118",
|
"platform_id": "-1001760492118",
|
||||||
'category': 'test',
|
"category": "test",
|
||||||
'platform': 'Telegram',
|
"platform": "Telegram",
|
||||||
'url': 'https://t.me/butylka1488',
|
"url": "https://t.me/butylka1488",
|
||||||
'screenname': 'butylka1488',
|
"screenname": "butylka1488",
|
||||||
'country': 'RU',
|
"country": "RU",
|
||||||
'influencer': None,
|
"influencer": None,
|
||||||
'public': True,
|
"public": True,
|
||||||
'chat': False,
|
"chat": False,
|
||||||
'notes': '',
|
"notes": "",
|
||||||
'source': 'researcher'}
|
"source": "researcher",
|
||||||
|
}
|
||||||
|
|
||||||
TWITTER_CHANNEL_KWARGS = {
|
TWITTER_CHANNEL_KWARGS = {
|
||||||
'name': 'L Weber (test)',
|
"name": "L Weber (test)",
|
||||||
'platform_id': "1424979017749442595",
|
"platform_id": "1424979017749442595",
|
||||||
'category': 'test',
|
"category": "test",
|
||||||
'platform': 'Twitter',
|
"platform": "Twitter",
|
||||||
'url': 'https://twitter.com/LWeber33662141',
|
"url": "https://twitter.com/LWeber33662141",
|
||||||
'screenname': 'LWeber33662141',
|
"screenname": "LWeber33662141",
|
||||||
'country': 'US',
|
"country": "US",
|
||||||
'influencer': None,
|
"influencer": None,
|
||||||
'public': True,
|
"public": True,
|
||||||
'chat': False,
|
"chat": False,
|
||||||
'notes': '',
|
"notes": "",
|
||||||
'source': 'researcher'}
|
"source": "researcher",
|
||||||
|
}
|
||||||
|
|
||||||
VKONTAKTE_CHANNEL_KWARGS = {
|
VKONTAKTE_CHANNEL_KWARGS = {
|
||||||
'name': 'Wwg1wgA (test)',
|
"name": "Wwg1wgA (test)",
|
||||||
'platform_id': 'club201278078',
|
"platform_id": "club201278078",
|
||||||
'category': 'test',
|
"category": "test",
|
||||||
'platform': 'Vkontakte',
|
"platform": "Vkontakte",
|
||||||
'url': 'https://vk.com/club201278078',
|
"url": "https://vk.com/club201278078",
|
||||||
'screenname': 'Wwg1wgA',
|
"screenname": "Wwg1wgA",
|
||||||
'country': 'FR',
|
"country": "FR",
|
||||||
'influencer': None,
|
"influencer": None,
|
||||||
'public': True,
|
"public": True,
|
||||||
'chat': False,
|
"chat": False,
|
||||||
'notes': '',
|
"notes": "",
|
||||||
'source': 'researcher'}
|
"source": "researcher",
|
||||||
|
}
|
||||||
|
|
||||||
YOUTUBE_CHANNEL_KWARGS = {
|
YOUTUBE_CHANNEL_KWARGS = {
|
||||||
'name': 'AnEs87 (test)',
|
"name": "AnEs87 (test)",
|
||||||
'platform_id': 'UCP6exBqGoxGLv_pM9Dxk2pA',
|
"platform_id": "UCP6exBqGoxGLv_pM9Dxk2pA",
|
||||||
'category': 'test',
|
"category": "test",
|
||||||
'platform': 'Youtube',
|
"platform": "Youtube",
|
||||||
'url': 'https://www.youtube.com/channel/UCP6exBqGoxGLv_pM9Dxk2pA',
|
"url": "https://www.youtube.com/channel/UCP6exBqGoxGLv_pM9Dxk2pA",
|
||||||
'screenname': 'AnEs87',
|
"screenname": "AnEs87",
|
||||||
'country': 'SV',
|
"country": "SV",
|
||||||
'influencer': None,
|
"influencer": None,
|
||||||
'public': True,
|
"public": True,
|
||||||
'chat': False,
|
"chat": False,
|
||||||
'notes': '',
|
"notes": "",
|
||||||
'source': 'researcher'}
|
"source": "researcher",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope='package')
|
@pytest.fixture(scope="package")
|
||||||
def engine(tmpdir_factory):
|
def engine(tmpdir_factory):
|
||||||
"""Initialize a SQLite database and SQLAlchemy engine to be used for all
|
"""Initialize a SQLite database and SQLAlchemy engine to be used for all
|
||||||
tests in the package"""
|
tests in the package"""
|
||||||
@@ -171,7 +182,8 @@ def engine(tmpdir_factory):
|
|||||||
|
|
||||||
return engine
|
return engine
|
||||||
|
|
||||||
@pytest.fixture(scope='package')
|
|
||||||
|
@pytest.fixture(scope="package")
|
||||||
def session(engine):
|
def session(engine):
|
||||||
"""Initialize a SQLAlchemy session to be used for all tests in the package"""
|
"""Initialize a SQLAlchemy session to be used for all tests in the package"""
|
||||||
|
|
||||||
@@ -179,7 +191,8 @@ def session(engine):
|
|||||||
sessionfactory.configure(bind=engine)
|
sessionfactory.configure(bind=engine)
|
||||||
return sessionfactory()
|
return sessionfactory()
|
||||||
|
|
||||||
@pytest.fixture(scope='package')
|
|
||||||
|
@pytest.fixture(scope="package")
|
||||||
def controller(engine):
|
def controller(engine):
|
||||||
"""Initialize ScraperController to be used for all tests in the package."""
|
"""Initialize ScraperController to be used for all tests in the package."""
|
||||||
|
|
||||||
@@ -188,7 +201,8 @@ def controller(engine):
|
|||||||
|
|
||||||
return scraper_controller
|
return scraper_controller
|
||||||
|
|
||||||
@pytest.fixture(scope='package')
|
|
||||||
|
@pytest.fixture(scope="package")
|
||||||
def etl_controller(engine):
|
def etl_controller(engine):
|
||||||
"""Initialize ETLController to be used for all tests in the package."""
|
"""Initialize ETLController to be used for all tests in the package."""
|
||||||
|
|
||||||
@@ -197,21 +211,23 @@ def etl_controller(engine):
|
|||||||
|
|
||||||
return etl_controller
|
return etl_controller
|
||||||
|
|
||||||
@pytest.fixture(scope='package')
|
|
||||||
|
@pytest.fixture(scope="package")
|
||||||
def channel_kwargs():
|
def channel_kwargs():
|
||||||
"""Define keyword arguments to use for defining test channels for each
|
"""Define keyword arguments to use for defining test channels for each
|
||||||
platform to be scraped.
|
platform to be scraped.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'bitchute' : BITCHUTE_CHANNEL_KWARGS,
|
"bitchute": BITCHUTE_CHANNEL_KWARGS,
|
||||||
'gab' : GAB_CHANNEL_KWARGS,
|
"gab": GAB_CHANNEL_KWARGS,
|
||||||
'gab_group' : GAB_GROUP_KWARGS,
|
"gab_group": GAB_GROUP_KWARGS,
|
||||||
'gettr' : GETTR_CHANNEL_KWARGS,
|
"gettr": GETTR_CHANNEL_KWARGS,
|
||||||
'instagram' : INSTAGRAM_CHANNEL_KWARGS,
|
"instagram": INSTAGRAM_CHANNEL_KWARGS,
|
||||||
'odysee' : ODYSEE_CHANNEL_KWARGS,
|
"odysee": ODYSEE_CHANNEL_KWARGS,
|
||||||
'rumble' : RUMBLE_CHANNEL_KWARGS,
|
"rumble": RUMBLE_CHANNEL_KWARGS,
|
||||||
'telegram' : TELEGRAM_CHANNEL_KWARGS,
|
"telegram": TELEGRAM_CHANNEL_KWARGS,
|
||||||
'twitter' : TWITTER_CHANNEL_KWARGS,
|
"twitter": TWITTER_CHANNEL_KWARGS,
|
||||||
'vkontakte' : VKONTAKTE_CHANNEL_KWARGS,
|
"vkontakte": VKONTAKTE_CHANNEL_KWARGS,
|
||||||
'youtube' : YOUTUBE_CHANNEL_KWARGS}
|
"youtube": YOUTUBE_CHANNEL_KWARGS,
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user