This commit is contained in:
Logan Williams
2022-06-08 08:11:34 +00:00
11 changed files with 1833 additions and 419 deletions

16
Pipfile
View File

@@ -24,6 +24,10 @@ ratelimit = "*"
pytz = "*"
langdetect = "*"
spacy = "==3.2.4"
ocrd-pyexiftool = "*"
gabber = {git = "https://github.com/stanfordio/gabber.git"}
snscrape = {git = "https://github.com/bellingcat/snscrape"}
polyphemus = {git = "https://github.com/bellingcat/polyphemus"}
[dev-packages]
pytest = "*"
@@ -39,15 +43,3 @@ python_version = "3.9"
[pipenv]
allow_prereleases = true
[packages.polyphemus]
git = "https://github.com/bellingcat/polyphemus.git"
[packages.PyExifTool]
git = "https://github.com/smarnach/pyexiftool.git"
[packages.gabber]
git = "https://github.com/stanfordio/gabber.git"
[packages.snscrape]
git = "https://github.com/bellingcat/snscrape"

1452
Pipfile.lock generated

File diff suppressed because it is too large Load Diff

96
app.py
View File

@@ -1,13 +1,11 @@
import argparse
from loguru import logger
import gspread
from sqlalchemy import create_engine, func
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import os
import time
import sys
from cisticola.base import Channel, mapper_registry
from cisticola.base import mapper_registry
from cisticola.scraper import (
ScraperController,
VkontakteScraper,
@@ -17,85 +15,7 @@ from cisticola.scraper import (
RumbleScraper,
)
from cisticola.transformer import (ETLController, TelegramTelethonTransformer)
def sync_channels(args):
logger.info("Synchronizing channels")
session = get_db_session()
gc = gspread.service_account(filename="service_account.json")
# Open a sheet from a spreadsheet in one go
wks = gc.open_by_url(args.gsheet).worksheet("channels")
channels = wks.get_all_records()
row = 2
for c in channels:
if c["public"] == "":
c["public"] = False
if c["chat"] == "":
c["chat"] = False
for k in c.keys():
if c[k] == "TRUE" or c[k] == "yes":
c[k] = True
if c[k] == "FALSE" or c[k] == "no":
c[k] = False
if c[k] == "":
c[k] = None
del c["followers"]
# add new channel
if c["id"] == "" or c["id"] is None:
del c["id"]
# check to see if this already exists,
platform_id = None
if c["platform_id"] != "":
platform_id = c["platform_id"]
channel = (
session.query(Channel)
.filter_by(
platform_id=str(platform_id), platform=c["platform"], url=c["url"]
)
.first()
)
if not channel:
channel = Channel(**c, source="researcher")
logger.debug(f"{channel} does not exist, adding")
session.add(channel)
session.flush()
session.commit()
wks.update_cell(row, 1, channel.id)
time.sleep(1)
else:
channel = session.query(Channel).filter_by(id=int(c["id"])).first()
logger.info(f"Updating channel {channel}")
channel.name = c["name"]
channel.category = c["category"]
channel.platform = c["platform"]
channel.url = c["url"]
channel.screenname = c["screenname"]
channel.country = c["country"]
channel.influencer = c["influencer"]
channel.public = c["public"]
channel.chat = c["chat"]
channel.notes = c["notes"]
session.flush()
session.commit()
row += 1
session.commit()
from sync_with_gsheet import sync_channels
def get_db_session():
engine = create_engine(os.environ["DB"])
@@ -162,6 +82,11 @@ def transform(args):
controller = get_transformer_controller()
controller.transform_all_untransformed()
def transform_info(args):
logger.info(f"Transforming untransformed channel info")
controller = get_transformer_controller()
controller.transform_all_untransformed_info()
def init_db():
engine = create_engine(os.environ["DB"])
@@ -191,7 +116,7 @@ if __name__ == "__main__":
init_db()
elif args.command == "sync-channels":
logger.add("logs/sync-channels.log", level="TRACE", rotation="100 MB")
sync_channels(args)
sync_channels(args, get_db_session())
elif args.command == "scrape-channels":
logger.add("logs/scrape-channels.log", level="TRACE", rotation="100 MB")
scrape_channels(args)
@@ -204,5 +129,8 @@ if __name__ == "__main__":
elif args.command == "transform":
logger.add("logs/transform.log", level="TRACE", rotation="100 MB")
transform(args)
elif args.command == "transform-info":
logger.add("logs/transform-info.log", level="TRACE", rotation="100 MB")
transform_info(args)
else:
logger.error(f"Unrecognized command {args.command}")

View File

@@ -114,6 +114,49 @@ class RawChannelInfo:
#: Datetime (relative to UTC) that the scraped post was archived at.
date_archived: datetime
@dataclass
class ChannelInfo:
"""A processed set of information about a channel.
"""
# Foreign key from the raw_channel_info table
raw_channel_info_id: int
# Foreign ckey from the channels table
channel: int
# platform specific ID of the channel
platform_id: str
#: Name of platform from which result was scraped, e.g. ``"Twitter"``.
platform: str
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
scraper: str
#: String specifying name and version of transformer used to tranform result, e.g. ``"TwitterTransformer 0.0.1"``.
transformer: str
#: attributes extracted from the raw channel info object
screenname: str
name: str
description: str
description_url: str
description_location: str
followers: int
following: int
verified: bool
date_created: datetime
#: Datetime (relative to UTC) that the scraped channel info was archived at.
date_archived: datetime
#: Datetime (UTC) that the scraped channel info was transformed at.
date_transformed: datetime
def hydrate(self):
pass
nlp_en = spacy.load('en_core_web_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
nlp_de = spacy.load('de_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
nlp_it = spacy.load('it_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
@@ -149,6 +192,9 @@ class Post:
#: Datetime (relative to UTC) that the scraped post was archived at.
date_archived: datetime
#: Datetime (UTC) that the scraped post was transformed at.
date_transformed: datetime
#: URL of the original post
url: str
@@ -189,20 +235,21 @@ class Post:
def hydrate(self):
URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""
urls = re.findall(URL_REGEX, self.content)
# replace is here in order to prevent catastrophic backtracking
urls = re.findall(URL_REGEX, self.content.replace("::::::::", ""))
self.outlinks = urls
HASHTAG_REGEX = r"(?:^|\s)[#]{1}(\w+)"
hashtags = re.findall(HASHTAG_REGEX, self.content)
self.hashtags = hashtags
# regex patterns for finding crypto addresses
BTC_REGEX = r'\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b'
ETHER_REGEX = r'(0x[a-fA-F0-9]{40})'
self.cryptocurrency_addresses = [m[0] for m in re.findall(BTC_REGEX, self.content)] + re.findall(ETHER_REGEX, self.content)
try:
self.detected_language = detect(self.content)
except LangDetectException:
@@ -330,7 +377,7 @@ raw_posts_table = Table('raw_posts', mapper_registry.metadata,
Column('scraper', String),
Column('platform', String),
Column('channel', Integer, ForeignKey('channels.id'), index=True),
Column('platform_id', String),
Column('platform_id', String, index=True),
Column('date', DateTime, index=True),
Column('raw_data', String),
Column('date_archived', DateTime, index=True),
@@ -345,6 +392,28 @@ raw_channel_info_table = Table('raw_channel_info', mapper_registry.metadata,
Column('raw_data', String),
Column('date_archived', DateTime, index=True))
channel_info_table = Table('channel_info', mapper_registry.metadata,
Column('id', Integer, primary_key=True, autoincrement=True),
Column('raw_channel_info_id', Integer, ForeignKey('raw_channel_info.id'), index=True),
Column('channel', Integer, ForeignKey('channels.id'), index=True),
Column('platform_id', String),
Column('scraper', String),
Column('platform', String),
Column('transformer', String),
Column('platform', String),
Column('screenname', String),
Column('name', String),
Column('description', String),
Column('description_url', String),
Column('description_location', String),
Column('followers', Integer),
Column('following', Integer),
Column('verified', Boolean),
Column('date_created', DateTime),
Column('date_archived', DateTime, index=True),
Column('date_transformed', DateTime, index=True),
)
channel_table = Table('channels', mapper_registry.metadata,
Column('id', Integer, primary_key=True, autoincrement=True),
Column('name', String),
@@ -365,19 +434,20 @@ post_table = Table('posts', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('raw_id', Integer, ForeignKey('raw_posts.id'), index=True),
Column('platform_id', Integer),
Column('platform_id', Integer, index=True),
Column('scraper', String),
Column('transformer', String),
Column('platform', String),
Column('channel', Integer, ForeignKey('channels.id'), index=True),
Column('date', DateTime, index=True),
Column('date_archived', DateTime, index=True),
Column('date_transformed', DateTime, index=True),
Column('url', String),
Column('author_id', String),
Column('author_username', String),
Column('content', String),
Column('forwarded_from', Integer, ForeignKey('channels.id'), index=True),
Column('reply_to', Integer, ForeignKey('posts.id'), index=True),
Column('reply_to', Integer, ForeignKey('posts.id', ondelete="CASCADE"), index=True),
Column('named_entities', JSON),
Column('cryptocurrency_addresses', JSON),
Column('hashtags', JSON),
@@ -401,6 +471,7 @@ mapper_registry.map_imperatively(Post, post_table)
mapper_registry.map_imperatively(Channel, channel_table)
mapper_registry.map_imperatively(ScraperResult, raw_posts_table)
mapper_registry.map_imperatively(RawChannelInfo, raw_channel_info_table)
mapper_registry.map_imperatively(ChannelInfo, channel_info_table)
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')

View File

@@ -6,8 +6,8 @@ from urllib.parse import urlparse
import requests
from loguru import logger
from polyphemus.base import OdyseeChannel
from polyphemus.api import get_auth_token
from polyphemus.base import OdyseeChannelScraper, process_raw_comment_info
from polyphemus.api import get_auth_token, get_all_comments
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
@@ -29,40 +29,43 @@ class OdyseeScraper(Scraper):
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
username = self.get_username_from_url(channel.url)
odysee_channel = OdyseeChannel(channel_name = username, auth_token = self.auth_token)
scraper = OdyseeChannelScraper(channel_name = username, auth_token = self.auth_token)
all_videos = odysee_channel.get_all_videos()
all_videos = scraper.get_all_videos()
for video in all_videos:
if since is not None and datetime.fromtimestamp(video.info['created']) <= since.date:
if since is not None and video.created.replace(tzinfo=timezone.utc) <= since.date:
break
url = video.info['streaming_url']
url = video.streaming_url
if url is None:
archived_urls = {}
else:
archived_urls = {url: None}
archived_urls = {url: None}
if archive_media:
if archive_media:
# Check if file is a video file or an m3u8 file
r = requests.head(url)
if r.headers['Content-Type'] == 'text/html; charset=utf-8':
media_blob, content_type, key = self.m3u8_url_to_blob(url)
else:
media_blob, content_type, key = self.url_to_blob(url)
# Check if file is a video file or an m3u8 file
r = requests.head(url)
if r.headers['Content-Type'] == 'text/html; charset=utf-8':
media_blob, content_type, key = self.m3u8_url_to_blob(url)
else:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url
all_comments = video.get_all_comments()
raw_comment_info_list = get_all_comments(video_id=video.claim_id)
all_comments = (process_raw_comment_info(raw_comment_info) for raw_comment_info in raw_comment_info_list)
yield ScraperResult(
scraper=self.__version__,
platform="Odysee",
channel=channel.id,
platform_id=video.info['claim_id'],
date=datetime.fromtimestamp(video.info['created']),
platform_id=video.claim_id,
date=video.created.replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(video.info),
raw_data=json.dumps(video.__dict__, default = str),
archived_urls=archived_urls,
media_archived=datetime.now(timezone.utc) if archive_media else None)
@@ -72,10 +75,10 @@ class OdyseeScraper(Scraper):
scraper=self.__version__,
platform="Odysee",
channel=channel.id,
platform_id=comment.info['claim_id'],
date=datetime.fromtimestamp(comment.info['created']),
platform_id=comment.claim_id,
date=comment.created.replace(tzinfo=timezone.utc),
date_archived=datetime.now(),
raw_data=json.dumps(comment.info),
raw_data=json.dumps(comment.__dict__, default = str),
archived_urls={},
media_archived=datetime.now(timezone.utc))
@@ -109,11 +112,11 @@ class OdyseeScraper(Scraper):
def get_profile(self, channel: Channel) -> RawChannelInfo:
username = self.get_username_from_url(channel.url)
odysee_channel = OdyseeChannel(channel_name = username, auth_token = self.auth_token)
profile = odysee_channel.info
scraper = OdyseeChannelScraper(channel_name = username, auth_token = self.auth_token)
profile = scraper.get_entity().__dict__
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
raw_data=json.dumps(profile, default = str),
date_archived=datetime.now(timezone.utc))

View File

@@ -20,12 +20,24 @@ class TelegramTelethonScraper(Scraper):
"""An implementation of a Scraper for Telegram, using Telethon library"""
__version__ = "TelegramTelethonScraper 0.0.2"
def get_username_from_url(self, url):
def get_username_from_url(url):
username = url.split('https://t.me/')[1]
if username.startswith('s/'):
username = username.split('s/')[1]
return username
def get_channel_identifier(channel: Channel):
identifier = channel.platform_id
if identifier is None:
identifier = channel.screenname
if identifier is None:
identifier = TelegramTelethonScraper.get_username_from_url(channel.url)
else:
identifier = int(identifier)
return identifier
@logger.catch
def archive_files(self, result: ScraperResult, client : TelegramClient = None) -> ScraperResult:
if len(result.archived_urls.keys()) == 0:
@@ -108,14 +120,12 @@ class TelegramTelethonScraper(Scraper):
return (blob, output_file_with_ext)
def can_handle(self, channel):
if channel.platform == "Telegram" and channel.public:
if channel.platform == "Telegram":
return True
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
username = channel.screenname
if username is None:
username = self.get_username_from_url(channel.url)
username = TelegramTelethonScraper.get_channel_identifier(channel)
api_id = os.environ['TELEGRAM_API_ID']
api_hash = os.environ['TELEGRAM_API_HASH']
@@ -156,9 +166,7 @@ class TelegramTelethonScraper(Scraper):
@logger.catch
def get_profile(self, channel: Channel) -> RawChannelInfo:
username = channel.screenname
if username is None:
username = self.get_username_from_url(channel.url)
username = TelegramTelethonScraper.get_channel_identifier(channel)
api_id = os.environ['TELEGRAM_API_ID']
api_hash = os.environ['TELEGRAM_API_HASH']

View File

@@ -2,9 +2,11 @@ from typing import List, Generator, Union, Callable
from loguru import logger
from sqlalchemy.orm import sessionmaker, make_transient
from sqlalchemy.engine.base import Engine
from sqlalchemy.sql.expression import func
from collections import defaultdict
from datetime import datetime
from cisticola.base import ScraperResult, Post, Media, Channel, mapper_registry
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Media, Channel, mapper_registry
class Transformer:
@@ -94,7 +96,7 @@ class ETLController:
# This is using some adhoc unique constraints that might be worth formalizing at some point
if type(obj) == Channel:
instance = session.query(Channel).filter_by(url=obj.url, platform_id=obj.platform_id, platform=obj.platform).first()
instance = session.query(Channel).filter_by(url=obj.url, platform_id=str(obj.platform_id), platform=obj.platform).first()
elif type(obj) == Post:
instance = None
@@ -130,9 +132,9 @@ class ETLController:
if hydrate:
obj.hydrate()
logger.info(f"Inserting new object {obj}")
session.add(obj)
session.flush()
logger.trace(f"Inserted new object {obj}")
return obj
@logger.catch(reraise=True)
@@ -151,21 +153,24 @@ class ETLController:
logger.error("No DB session")
return
session = self.session()
for result in results:
for transformer in self.transformers:
handled = False
if result.scraper is not None and result.platform is not None:
for transformer in self.transformers:
handled = False
if transformer.can_handle(result):
logger.trace(f"{transformer} is handling result {result}")
handled = True
session = self.session()
if transformer.can_handle(result):
logger.trace(f"{transformer} is handling result {result.id} ({result.date})")
handled = True
transformer.transform(result, lambda obj: self.insert_or_select(obj, session, hydrate), session)
session.commit()
break
transformer.transform(result, lambda obj: self.insert_or_select(obj, session, hydrate), session)
if handled == False:
logger.warning(f"No Transformer could handle {result}")
session.commit()
break
if handled == False:
logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})")
@logger.catch(reraise=True)
def transform_all_untransformed(self, hydrate: bool = True):
@@ -183,13 +188,77 @@ class ETLController:
return
session = self.session()
untransformed = (
session.query(ScraperResult)
BATCH_SIZE = 50000
offset = 0
batch = []
query = (session.query(ScraperResult)
.join(Post, isouter=True)
.where(Post.raw_id == None)
.order_by(ScraperResult.date.asc())
.all()
)
logger.info(f"Found {len(untransformed)} items to ETL")
self.transform_results(untransformed, hydrate=hydrate)
while len(batch) > 0 or offset == 0:
logger.info(f"Fetching untransformed posts batch of {BATCH_SIZE}, offset {offset}")
batch = query.slice(offset, offset + BATCH_SIZE).all()
offset += BATCH_SIZE
logger.info(f"Found {len(batch)} items to ETL ({offset} already processed)")
self.transform_results(batch, hydrate=hydrate)
@logger.catch(reraise=True)
def transform_info(self, results: List[ChannelInfo]):
if self.session is None:
logger.error("No DB session")
return
session = self.session()
for result in results:
if result.scraper is not None and result.platform is not None:
for transformer in self.transformers:
handled = False
if transformer.can_handle(result):
logger.trace(f"{transformer} is handling raw info result {result.id} ({result.date_archived})")
handled = True
transformer.transform_info(result, lambda obj: self.insert_or_select(obj, session, False), session)
session.commit()
break
if handled == False:
logger.warning(f"No Transformer could handle raw channel info ID {result.id} with platform {result.platform} ({result.date_archived})")
@logger.catch(reraise=True)
def transform_all_untransformed_info(self):
if self.session is None:
logger.error("No DB session")
return
session = self.session()
BATCH_SIZE = 50000
offset = 0
batch = []
query = (session.query(RawChannelInfo)
.join(ChannelInfo, isouter=True)
.where(ChannelInfo.raw_channel_info_id == None)
.order_by(RawChannelInfo.date_archived.asc())
)
while len(batch) > 0 or offset == 0:
logger.info(f"Fetching untransformed info batch of {BATCH_SIZE}, offset {offset}")
batch = query.slice(offset, offset + BATCH_SIZE).all()
offset += BATCH_SIZE
logger.info(f"Found {len(batch)} info items to ETL ({offset} already processed)")
self.transform_info(batch)

View File

@@ -3,15 +3,20 @@ from loguru import logger
from typing import Generator, Union, Callable
import dateutil.parser
from bs4 import BeautifulSoup
from psycopg2 import DatabaseError
import requests
import time
from telethon.sync import TelegramClient
from telethon.errors.rpcerrorlist import ChannelPrivateError, ChannelInvalidError
import os
from datetime import datetime, timezone
from cisticola.transformer.base import Transformer
from cisticola.base import ScraperResult, Post, Image, Video, Media, Channel
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
class TelegramTelethonTransformer(Transformer):
__version__ = 'TelegramTelethonTransformer 0.0.1'
__version__ = 'TelegramTelethonTransformer 0.0.2'
bad_channels = {}
@@ -22,39 +27,93 @@ class TelegramTelethonTransformer(Transformer):
return False
def get_screenname_from_id(self, orig_screenname, id):
def get_screenname_from_id(self, channel_id):
api_id = os.environ['TELEGRAM_API_ID']
api_hash = os.environ['TELEGRAM_API_HASH']
try:
with TelegramClient("transform.session", api_id, api_hash) as client:
data = client.get_entity(channel_id)
return (data.username, data.title, "")
except ChannelPrivateError:
logger.info("ChannelPrivateError")
return ("", "", "ChannelPrivateError")
except ChannelInvalidError:
logger.info("ChannelInvalidError")
return ("", "", "ChannelInvalidError")
except ValueError:
logger.info("ValueError")
return ("", "", "ValueError")
def get_name_from_web_interface(self, orig_screenname, id):
url = "https://t.me/s/" + orig_screenname + "/" + str(id)
# this doesn't work for chat channels
if orig_screenname in self.bad_channels:
logger.debug(f"Skipping screenname because it is not accessible for channel {orig_screenname}")
return ("", "")
url = "https://t.me/s/" + orig_screenname + "/" + str(id)
return ""
logger.info(f"Finding channel from URL {url}")
r = requests.get(url)
if r.url != url:
self.bad_channels[orig_screenname] = True
return ("", "")
return ""
soup = BeautifulSoup(r.content)
post = soup.findAll("div", {"data-post" : orig_screenname + "/" + str(id)})
name = ""
# multiple posts can be combined into one result in the web interface
decrement = 0
while len(post) == 0:
decrement += 1
if decrement > 8:
break
logger.info(f"Could not find post from {url}, looking for id {id - decrement}")
post = soup.findAll("div", {"data-post" : orig_screenname + "/" + str(id - decrement)})
if len(post) == 0:
logger.warning(f"Could not find post from {url}")
screenname = ""
name = ""
else:
fwd_tag = post[0].findAll("a", {"class", "tgme_widget_message_forwarded_from_name"})
if len(fwd_tag) > 0:
fwd_tag = fwd_tag[0]
name = fwd_tag.text
screenname = fwd_tag['href'].split('/')[-2]
else:
if len(fwd_tag) == 0:
fwd_tag = post[0].findAll("span", {"class", "tgme_widget_message_forwarded_from_name"})
if len(fwd_tag) >= 1:
name = fwd_tag[0].text
screenname = ""
return (screenname, name)
return name
def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
chat_raw = raw['chats'][0]
transformed = ChannelInfo(
raw_channel_info_id=data.id,
channel=data.channel,
platform_id=raw['full_chat']['id'],
platform=data.platform,
scraper=data.scraper,
transformer=self.__version__,
screenname=chat_raw['username'],
name=chat_raw['title'],
description=raw['full_chat']['about'],
description_url='', # does not exist for Telegram
description_location='', # does not exist for Telegram
followers=raw['full_chat']['participants_count'],
following=-1, # does not exist for Telegram
verified=False, #does not exist for Telegram
date_created=dateutil.parser.parse(chat_raw['date']),
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc)
)
transformed = insert(transformed)
def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
@@ -66,32 +125,30 @@ class TelegramTelethonTransformer(Transformer):
fwd_from = None
if raw['fwd_from'] and raw['fwd_from']['from_id'] and 'channel_id' in raw['fwd_from']['from_id']:
channel = session.query(Channel).filter_by(platform_id=raw['fwd_from']['from_id']['channel_id']).first()
channel = session.query(Channel).filter_by(platform_id=str(raw['fwd_from']['from_id']['channel_id'])).first()
if channel is None:
orig_channel = session.query(Channel).filter_by(id=data.channel).first()
(screenname, name) = self.get_screenname_from_id(orig_channel.screenname, raw['id'])
(screenname, name, notes) = self.get_screenname_from_id(raw['fwd_from']['from_id']['channel_id'])
if name == "":
logger.info("Trying fallback web interface")
orig_channel = session.query(Channel).filter_by(id=data.channel).first()
if orig_channel.screenname is not None:
name = self.get_name_from_web_interface(orig_channel.screenname, raw['id'])
channel = Channel(
name=name,
platform_id=raw['fwd_from']['from_id']['channel_id'],
platform=data.platform,
url="https://t.me/s/" + screenname,
url="https://t.me/s/" + screenname if screenname is not None else "",
screenname=screenname,
category='forwarded',
source=self.__version__
source=self.__version__,
notes=notes
)
channel = insert(channel)
elif channel.screenname == "":
# if the screenname is empty, we can fill it in
orig_channel = session.query(Channel).filter_by(id=data.channel).first()
(screenname, name) = self.get_screenname_from_id(orig_channel.screenname, raw['id'])
channel.screenname = screenname
channel.name = name
channel.url = "https://t.me/s/" + screenname
session.flush()
logger.info(f"Added {channel}")
fwd_from = channel.id
@@ -113,6 +170,7 @@ class TelegramTelethonTransformer(Transformer):
channel=data.channel,
date=dateutil.parser.parse(raw['date']),
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc),
url="",
content=raw['message'],
author_id=raw['post_author'],
@@ -123,14 +181,14 @@ class TelegramTelethonTransformer(Transformer):
transformed = insert(transformed)
for k in data.archived_urls:
if data.archived_urls[k]:
archived_url = data.archived_urls[k]
ext = archived_url.split('.')[-1]
# for k in data.archived_urls:
# if data.archived_urls[k]:
# archived_url = data.archived_urls[k]
# ext = archived_url.split('.')[-1]
if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv':
insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
else:
insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
# if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv':
# insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
# else:
# insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))

View File

@@ -1,138 +0,0 @@
import sys
from sqlalchemy import create_engine
from loguru import logger
from cisticola.base import Channel
from cisticola.scraper import (
ScraperController,
TelegramSnscrapeScraper)
logger.remove()
logger.add(sys.stderr, level="INFO")
logger.add("../russian_telegram_ingest.log")
test_channels = [
Channel(
id=0,
name="QAnon Россия",
platform_id=-1001319637748,
category="Qanon",
followers=94048,
platform="Telegram",
url="https://t.me/qanonrus",
screenname="qanonrus",
country="RU",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=1,
name="The Great Awakening | Q",
platform_id=-1001325597521,
category="Qanon",
followers=5715,
platform="Telegram",
url="https://t.me/greatawakin",
screenname="greatawakin",
country="RU",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=2,
name="Великое Пробуждение",
platform_id=-1001285898079,
category="Qanon",
followers=5861,
platform="Telegram",
url="https://t.me/greatawakeningrus",
screenname="greatawakeningrus",
country="RU",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=3,
name="T🕊Редакция Президент Гордон🕊",
platform_id=-1001101170442,
category="Qanon",
followers=5743,
platform="Telegram",
url="https://t.me/prezidentgordonteam",
screenname="prezidentgordonteam",
country="RU",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=4,
name="ПРОЕКТ АВРОРА",
platform_id=-1001279171101,
category="Qanon",
followers=5930,
platform="Telegram",
url="https://t.me/project_aurora",
screenname="project_aurora",
country="RU",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=5,
name="Сон Разума",
platform_id=-1001202338312,
category="Qanon",
followers=27099,
platform="Telegram",
url="https://t.me/error_288",
screenname="error_288",
country="RU",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=6,
name="Пробуждающий Мир - официальный канал",
platform_id=-1001492521207,
category="Qanon",
followers=19097,
platform="Telegram",
url="https://t.me/promirru",
screenname="promirru",
country="RU",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=7,
name="ЦЕЛЬНОЗОР",
platform_id=-1001642737506,
category="Qanon",
followers=13654,
platform="Telegram",
url="https://t.me/tselnozor",
screenname="tselnozor",
country="RU",
influencer=None,
public=True,
chat=False,
notes=""),]
controller = ScraperController()
telegram = TelegramSnscrapeScraper()
controller.register_scraper(telegram)
engine = create_engine('sqlite:///russian_telegram.db')
controller.connect_to_db(engine)
controller.scrape_channels(test_channels, archive_media = False)

9
spacy_setup.sh Normal file
View File

@@ -0,0 +1,9 @@
#!/bin/bash
pipenv run python -m spacy download xx_ent_wiki_sm
pipenv run python -m spacy download fr_core_news_sm
pipenv run python -m spacy download de_core_news_sm
pipenv run python -m spacy download nl_core_news_sm
pipenv run python -m spacy download it_core_news_sm
pipenv run python -m spacy download ru_core_news_sm
pipenv run python -m spacy download en_core_web_sm

134
sync_with_gsheet.py Normal file
View File

@@ -0,0 +1,134 @@
import gspread
import time
from loguru import logger
from cisticola.base import Channel, ChannelInfo
def sync_channels(args, session):
logger.info("Synchronizing channels")
gc = gspread.service_account(filename="service_account.json")
# Open a sheet from a spreadsheet in one go
wks = gc.open_by_url(args.gsheet).worksheet("channels")
channels = wks.get_all_records()
row = 2
for c in channels:
# defaults for unset values
if c["public"] == "":
c["public"] = True
if c["chat"] == "":
c["chat"] = False
# normalize the values slightly from the Google Sheet
for k in c.keys():
if c[k] == "TRUE" or c[k] == "yes":
c[k] = True
if c[k] == "FALSE" or c[k] == "no":
c[k] = False
if c[k] == "":
c[k] = None
# add new channel
if c["id"] == "" or c["id"] is None:
del c["id"]
# check to see if this already exists,
platform_id = None
if c["platform_id"] != "":
platform_id = c["platform_id"]
channel = (
session.query(Channel)
.filter_by(
platform_id=str(platform_id), platform=str(c["platform"])
)
.first()
)
if not channel:
channel = session.query(Channel).filter_by(platform=str(c["platform"]), url=str(c["url"])).first()
if not channel and c["screenname"] != '' and c["screenname"] is not None:
channel = session.query(Channel).filter_by(platform=str(c["platform"]), screenname=str(c["screenname"])).first()
if not channel:
channel = Channel(**c, source="researcher")
logger.debug(f"{channel} does not exist, adding")
session.add(channel)
session.flush()
session.commit()
wks.update_cell(row, 1, channel.id)
time.sleep(1)
else:
logger.info(f"Channel found, updating channel {channel}")
was_researcher = channel.source == "researcher"
channel.name = c["name"]
channel.category = c["category"]
channel.platform = c["platform"]
channel.url = c["url"]
channel.screenname = c["screenname"]
channel.country = c["country"]
channel.influencer = c["influencer"]
channel.public = c["public"]
channel.chat = c["chat"]
channel.notes = c["notes"]
channel.source = "researcher"
session.flush()
session.commit()
wks.update_cell(row, 1, channel.id)
# this likely means that the channel was duplicated in the Google Sheet, so add a red highlight
if was_researcher:
logger.warning(f"This channel (ID {channel.id}) is possibly a duplicate.")
wks.format(f"A{str(row)}:A{str(row)}", {
"backgroundColor": {
"red": 1.0,
"green": 0.0,
"blue": 0.0
}})
time.sleep(1)
# channel has ID
else:
cid = int(c["id"])
channel = session.query(Channel).filter_by(id=cid).first()
channel_info = session.query(ChannelInfo).filter_by(channel=cid).order_by(ChannelInfo.date_archived.desc()).first()
logger.info(f"Updating channel {channel}")
logger.info(channel_info)
channel.name = c["name"]
channel.category = c["category"]
channel.platform = c["platform"]
channel.url = c["url"]
channel.screenname = c["screenname"]
channel.country = c["country"]
channel.influencer = c["influencer"]
channel.public = c["public"]
channel.chat = c["chat"]
channel.notes = c["notes"]
channel.source = "researcher"
if channel_info:
channel.screenname = channel_info.screenname
wks.update_cell(row, 7, channel_info.screenname)
channel.platform_id = channel_info.platform_id
wks.update_cell(row, 3, channel_info.platform_id)
session.flush()
session.commit()
row += 1
session.commit()