mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
Merge branch 'main' of https://github.com/bellingcat/cisticola into main
This commit is contained in:
16
Pipfile
16
Pipfile
@@ -24,6 +24,10 @@ ratelimit = "*"
|
||||
pytz = "*"
|
||||
langdetect = "*"
|
||||
spacy = "==3.2.4"
|
||||
ocrd-pyexiftool = "*"
|
||||
gabber = {git = "https://github.com/stanfordio/gabber.git"}
|
||||
snscrape = {git = "https://github.com/bellingcat/snscrape"}
|
||||
polyphemus = {git = "https://github.com/bellingcat/polyphemus"}
|
||||
|
||||
[dev-packages]
|
||||
pytest = "*"
|
||||
@@ -39,15 +43,3 @@ python_version = "3.9"
|
||||
|
||||
[pipenv]
|
||||
allow_prereleases = true
|
||||
|
||||
[packages.polyphemus]
|
||||
git = "https://github.com/bellingcat/polyphemus.git"
|
||||
|
||||
[packages.PyExifTool]
|
||||
git = "https://github.com/smarnach/pyexiftool.git"
|
||||
|
||||
[packages.gabber]
|
||||
git = "https://github.com/stanfordio/gabber.git"
|
||||
|
||||
[packages.snscrape]
|
||||
git = "https://github.com/bellingcat/snscrape"
|
||||
|
||||
1452
Pipfile.lock
generated
1452
Pipfile.lock
generated
File diff suppressed because it is too large
Load Diff
96
app.py
96
app.py
@@ -1,13 +1,11 @@
|
||||
import argparse
|
||||
from loguru import logger
|
||||
import gspread
|
||||
from sqlalchemy import create_engine, func
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
import os
|
||||
import time
|
||||
import sys
|
||||
|
||||
from cisticola.base import Channel, mapper_registry
|
||||
from cisticola.base import mapper_registry
|
||||
from cisticola.scraper import (
|
||||
ScraperController,
|
||||
VkontakteScraper,
|
||||
@@ -17,85 +15,7 @@ from cisticola.scraper import (
|
||||
RumbleScraper,
|
||||
)
|
||||
from cisticola.transformer import (ETLController, TelegramTelethonTransformer)
|
||||
|
||||
|
||||
def sync_channels(args):
|
||||
logger.info("Synchronizing channels")
|
||||
|
||||
session = get_db_session()
|
||||
|
||||
gc = gspread.service_account(filename="service_account.json")
|
||||
|
||||
# Open a sheet from a spreadsheet in one go
|
||||
wks = gc.open_by_url(args.gsheet).worksheet("channels")
|
||||
channels = wks.get_all_records()
|
||||
row = 2
|
||||
|
||||
for c in channels:
|
||||
if c["public"] == "":
|
||||
c["public"] = False
|
||||
if c["chat"] == "":
|
||||
c["chat"] = False
|
||||
|
||||
for k in c.keys():
|
||||
if c[k] == "TRUE" or c[k] == "yes":
|
||||
c[k] = True
|
||||
if c[k] == "FALSE" or c[k] == "no":
|
||||
c[k] = False
|
||||
|
||||
if c[k] == "":
|
||||
c[k] = None
|
||||
|
||||
del c["followers"]
|
||||
|
||||
# add new channel
|
||||
if c["id"] == "" or c["id"] is None:
|
||||
del c["id"]
|
||||
|
||||
# check to see if this already exists,
|
||||
platform_id = None
|
||||
if c["platform_id"] != "":
|
||||
platform_id = c["platform_id"]
|
||||
|
||||
channel = (
|
||||
session.query(Channel)
|
||||
.filter_by(
|
||||
platform_id=str(platform_id), platform=c["platform"], url=c["url"]
|
||||
)
|
||||
.first()
|
||||
)
|
||||
|
||||
if not channel:
|
||||
channel = Channel(**c, source="researcher")
|
||||
logger.debug(f"{channel} does not exist, adding")
|
||||
session.add(channel)
|
||||
session.flush()
|
||||
session.commit()
|
||||
|
||||
wks.update_cell(row, 1, channel.id)
|
||||
time.sleep(1)
|
||||
else:
|
||||
channel = session.query(Channel).filter_by(id=int(c["id"])).first()
|
||||
|
||||
logger.info(f"Updating channel {channel}")
|
||||
channel.name = c["name"]
|
||||
channel.category = c["category"]
|
||||
channel.platform = c["platform"]
|
||||
channel.url = c["url"]
|
||||
channel.screenname = c["screenname"]
|
||||
channel.country = c["country"]
|
||||
channel.influencer = c["influencer"]
|
||||
channel.public = c["public"]
|
||||
channel.chat = c["chat"]
|
||||
channel.notes = c["notes"]
|
||||
|
||||
session.flush()
|
||||
session.commit()
|
||||
|
||||
row += 1
|
||||
|
||||
session.commit()
|
||||
|
||||
from sync_with_gsheet import sync_channels
|
||||
|
||||
def get_db_session():
|
||||
engine = create_engine(os.environ["DB"])
|
||||
@@ -162,6 +82,11 @@ def transform(args):
|
||||
controller = get_transformer_controller()
|
||||
controller.transform_all_untransformed()
|
||||
|
||||
def transform_info(args):
|
||||
logger.info(f"Transforming untransformed channel info")
|
||||
|
||||
controller = get_transformer_controller()
|
||||
controller.transform_all_untransformed_info()
|
||||
|
||||
def init_db():
|
||||
engine = create_engine(os.environ["DB"])
|
||||
@@ -191,7 +116,7 @@ if __name__ == "__main__":
|
||||
init_db()
|
||||
elif args.command == "sync-channels":
|
||||
logger.add("logs/sync-channels.log", level="TRACE", rotation="100 MB")
|
||||
sync_channels(args)
|
||||
sync_channels(args, get_db_session())
|
||||
elif args.command == "scrape-channels":
|
||||
logger.add("logs/scrape-channels.log", level="TRACE", rotation="100 MB")
|
||||
scrape_channels(args)
|
||||
@@ -204,5 +129,8 @@ if __name__ == "__main__":
|
||||
elif args.command == "transform":
|
||||
logger.add("logs/transform.log", level="TRACE", rotation="100 MB")
|
||||
transform(args)
|
||||
elif args.command == "transform-info":
|
||||
logger.add("logs/transform-info.log", level="TRACE", rotation="100 MB")
|
||||
transform_info(args)
|
||||
else:
|
||||
logger.error(f"Unrecognized command {args.command}")
|
||||
|
||||
@@ -114,6 +114,49 @@ class RawChannelInfo:
|
||||
#: Datetime (relative to UTC) that the scraped post was archived at.
|
||||
date_archived: datetime
|
||||
|
||||
@dataclass
|
||||
class ChannelInfo:
|
||||
"""A processed set of information about a channel.
|
||||
"""
|
||||
|
||||
# Foreign key from the raw_channel_info table
|
||||
raw_channel_info_id: int
|
||||
|
||||
# Foreign ckey from the channels table
|
||||
channel: int
|
||||
|
||||
# platform specific ID of the channel
|
||||
platform_id: str
|
||||
|
||||
#: Name of platform from which result was scraped, e.g. ``"Twitter"``.
|
||||
platform: str
|
||||
|
||||
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
|
||||
scraper: str
|
||||
|
||||
#: String specifying name and version of transformer used to tranform result, e.g. ``"TwitterTransformer 0.0.1"``.
|
||||
transformer: str
|
||||
|
||||
#: attributes extracted from the raw channel info object
|
||||
screenname: str
|
||||
name: str
|
||||
description: str
|
||||
description_url: str
|
||||
description_location: str
|
||||
followers: int
|
||||
following: int
|
||||
verified: bool
|
||||
date_created: datetime
|
||||
|
||||
#: Datetime (relative to UTC) that the scraped channel info was archived at.
|
||||
date_archived: datetime
|
||||
|
||||
#: Datetime (UTC) that the scraped channel info was transformed at.
|
||||
date_transformed: datetime
|
||||
|
||||
def hydrate(self):
|
||||
pass
|
||||
|
||||
nlp_en = spacy.load('en_core_web_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
|
||||
nlp_de = spacy.load('de_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
|
||||
nlp_it = spacy.load('it_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
|
||||
@@ -149,6 +192,9 @@ class Post:
|
||||
|
||||
#: Datetime (relative to UTC) that the scraped post was archived at.
|
||||
date_archived: datetime
|
||||
|
||||
#: Datetime (UTC) that the scraped post was transformed at.
|
||||
date_transformed: datetime
|
||||
|
||||
#: URL of the original post
|
||||
url: str
|
||||
@@ -189,20 +235,21 @@ class Post:
|
||||
def hydrate(self):
|
||||
URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""
|
||||
|
||||
urls = re.findall(URL_REGEX, self.content)
|
||||
# replace is here in order to prevent catastrophic backtracking
|
||||
urls = re.findall(URL_REGEX, self.content.replace("::::::::", ""))
|
||||
self.outlinks = urls
|
||||
|
||||
HASHTAG_REGEX = r"(?:^|\s)[##]{1}(\w+)"
|
||||
|
||||
hashtags = re.findall(HASHTAG_REGEX, self.content)
|
||||
self.hashtags = hashtags
|
||||
|
||||
|
||||
# regex patterns for finding crypto addresses
|
||||
BTC_REGEX = r'\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b'
|
||||
ETHER_REGEX = r'(0x[a-fA-F0-9]{40})'
|
||||
|
||||
self.cryptocurrency_addresses = [m[0] for m in re.findall(BTC_REGEX, self.content)] + re.findall(ETHER_REGEX, self.content)
|
||||
|
||||
|
||||
try:
|
||||
self.detected_language = detect(self.content)
|
||||
except LangDetectException:
|
||||
@@ -330,7 +377,7 @@ raw_posts_table = Table('raw_posts', mapper_registry.metadata,
|
||||
Column('scraper', String),
|
||||
Column('platform', String),
|
||||
Column('channel', Integer, ForeignKey('channels.id'), index=True),
|
||||
Column('platform_id', String),
|
||||
Column('platform_id', String, index=True),
|
||||
Column('date', DateTime, index=True),
|
||||
Column('raw_data', String),
|
||||
Column('date_archived', DateTime, index=True),
|
||||
@@ -345,6 +392,28 @@ raw_channel_info_table = Table('raw_channel_info', mapper_registry.metadata,
|
||||
Column('raw_data', String),
|
||||
Column('date_archived', DateTime, index=True))
|
||||
|
||||
channel_info_table = Table('channel_info', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True, autoincrement=True),
|
||||
Column('raw_channel_info_id', Integer, ForeignKey('raw_channel_info.id'), index=True),
|
||||
Column('channel', Integer, ForeignKey('channels.id'), index=True),
|
||||
Column('platform_id', String),
|
||||
Column('scraper', String),
|
||||
Column('platform', String),
|
||||
Column('transformer', String),
|
||||
Column('platform', String),
|
||||
Column('screenname', String),
|
||||
Column('name', String),
|
||||
Column('description', String),
|
||||
Column('description_url', String),
|
||||
Column('description_location', String),
|
||||
Column('followers', Integer),
|
||||
Column('following', Integer),
|
||||
Column('verified', Boolean),
|
||||
Column('date_created', DateTime),
|
||||
Column('date_archived', DateTime, index=True),
|
||||
Column('date_transformed', DateTime, index=True),
|
||||
)
|
||||
|
||||
channel_table = Table('channels', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True, autoincrement=True),
|
||||
Column('name', String),
|
||||
@@ -365,19 +434,20 @@ post_table = Table('posts', mapper_registry.metadata,
|
||||
Column('id', Integer, primary_key=True,
|
||||
autoincrement=True),
|
||||
Column('raw_id', Integer, ForeignKey('raw_posts.id'), index=True),
|
||||
Column('platform_id', Integer),
|
||||
Column('platform_id', Integer, index=True),
|
||||
Column('scraper', String),
|
||||
Column('transformer', String),
|
||||
Column('platform', String),
|
||||
Column('channel', Integer, ForeignKey('channels.id'), index=True),
|
||||
Column('date', DateTime, index=True),
|
||||
Column('date_archived', DateTime, index=True),
|
||||
Column('date_transformed', DateTime, index=True),
|
||||
Column('url', String),
|
||||
Column('author_id', String),
|
||||
Column('author_username', String),
|
||||
Column('content', String),
|
||||
Column('forwarded_from', Integer, ForeignKey('channels.id'), index=True),
|
||||
Column('reply_to', Integer, ForeignKey('posts.id'), index=True),
|
||||
Column('reply_to', Integer, ForeignKey('posts.id', ondelete="CASCADE"), index=True),
|
||||
Column('named_entities', JSON),
|
||||
Column('cryptocurrency_addresses', JSON),
|
||||
Column('hashtags', JSON),
|
||||
@@ -401,6 +471,7 @@ mapper_registry.map_imperatively(Post, post_table)
|
||||
mapper_registry.map_imperatively(Channel, channel_table)
|
||||
mapper_registry.map_imperatively(ScraperResult, raw_posts_table)
|
||||
mapper_registry.map_imperatively(RawChannelInfo, raw_channel_info_table)
|
||||
mapper_registry.map_imperatively(ChannelInfo, channel_info_table)
|
||||
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
|
||||
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
|
||||
mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')
|
||||
@@ -6,8 +6,8 @@ from urllib.parse import urlparse
|
||||
import requests
|
||||
from loguru import logger
|
||||
|
||||
from polyphemus.base import OdyseeChannel
|
||||
from polyphemus.api import get_auth_token
|
||||
from polyphemus.base import OdyseeChannelScraper, process_raw_comment_info
|
||||
from polyphemus.api import get_auth_token, get_all_comments
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
@@ -29,40 +29,43 @@ class OdyseeScraper(Scraper):
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
|
||||
username = self.get_username_from_url(channel.url)
|
||||
odysee_channel = OdyseeChannel(channel_name = username, auth_token = self.auth_token)
|
||||
scraper = OdyseeChannelScraper(channel_name = username, auth_token = self.auth_token)
|
||||
|
||||
all_videos = odysee_channel.get_all_videos()
|
||||
all_videos = scraper.get_all_videos()
|
||||
|
||||
for video in all_videos:
|
||||
if since is not None and datetime.fromtimestamp(video.info['created']) <= since.date:
|
||||
if since is not None and video.created.replace(tzinfo=timezone.utc) <= since.date:
|
||||
break
|
||||
|
||||
url = video.info['streaming_url']
|
||||
url = video.streaming_url
|
||||
if url is None:
|
||||
archived_urls = {}
|
||||
else:
|
||||
archived_urls = {url: None}
|
||||
|
||||
archived_urls = {url: None}
|
||||
if archive_media:
|
||||
|
||||
if archive_media:
|
||||
# Check if file is a video file or an m3u8 file
|
||||
r = requests.head(url)
|
||||
if r.headers['Content-Type'] == 'text/html; charset=utf-8':
|
||||
media_blob, content_type, key = self.m3u8_url_to_blob(url)
|
||||
else:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
|
||||
# Check if file is a video file or an m3u8 file
|
||||
r = requests.head(url)
|
||||
if r.headers['Content-Type'] == 'text/html; charset=utf-8':
|
||||
media_blob, content_type, key = self.m3u8_url_to_blob(url)
|
||||
else:
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
all_comments = video.get_all_comments()
|
||||
raw_comment_info_list = get_all_comments(video_id=video.claim_id)
|
||||
all_comments = (process_raw_comment_info(raw_comment_info) for raw_comment_info in raw_comment_info_list)
|
||||
|
||||
yield ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="Odysee",
|
||||
channel=channel.id,
|
||||
platform_id=video.info['claim_id'],
|
||||
date=datetime.fromtimestamp(video.info['created']),
|
||||
platform_id=video.claim_id,
|
||||
date=video.created.replace(tzinfo=timezone.utc),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(video.info),
|
||||
raw_data=json.dumps(video.__dict__, default = str),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=datetime.now(timezone.utc) if archive_media else None)
|
||||
|
||||
@@ -72,10 +75,10 @@ class OdyseeScraper(Scraper):
|
||||
scraper=self.__version__,
|
||||
platform="Odysee",
|
||||
channel=channel.id,
|
||||
platform_id=comment.info['claim_id'],
|
||||
date=datetime.fromtimestamp(comment.info['created']),
|
||||
platform_id=comment.claim_id,
|
||||
date=comment.created.replace(tzinfo=timezone.utc),
|
||||
date_archived=datetime.now(),
|
||||
raw_data=json.dumps(comment.info),
|
||||
raw_data=json.dumps(comment.__dict__, default = str),
|
||||
archived_urls={},
|
||||
media_archived=datetime.now(timezone.utc))
|
||||
|
||||
@@ -109,11 +112,11 @@ class OdyseeScraper(Scraper):
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||
|
||||
username = self.get_username_from_url(channel.url)
|
||||
odysee_channel = OdyseeChannel(channel_name = username, auth_token = self.auth_token)
|
||||
profile = odysee_channel.info
|
||||
scraper = OdyseeChannelScraper(channel_name = username, auth_token = self.auth_token)
|
||||
profile = scraper.get_entity().__dict__
|
||||
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
raw_data=json.dumps(profile, default = str),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
@@ -20,12 +20,24 @@ class TelegramTelethonScraper(Scraper):
|
||||
"""An implementation of a Scraper for Telegram, using Telethon library"""
|
||||
__version__ = "TelegramTelethonScraper 0.0.2"
|
||||
|
||||
def get_username_from_url(self, url):
|
||||
def get_username_from_url(url):
|
||||
username = url.split('https://t.me/')[1]
|
||||
if username.startswith('s/'):
|
||||
username = username.split('s/')[1]
|
||||
return username
|
||||
|
||||
def get_channel_identifier(channel: Channel):
|
||||
identifier = channel.platform_id
|
||||
|
||||
if identifier is None:
|
||||
identifier = channel.screenname
|
||||
if identifier is None:
|
||||
identifier = TelegramTelethonScraper.get_username_from_url(channel.url)
|
||||
else:
|
||||
identifier = int(identifier)
|
||||
|
||||
return identifier
|
||||
|
||||
@logger.catch
|
||||
def archive_files(self, result: ScraperResult, client : TelegramClient = None) -> ScraperResult:
|
||||
if len(result.archived_urls.keys()) == 0:
|
||||
@@ -108,14 +120,12 @@ class TelegramTelethonScraper(Scraper):
|
||||
return (blob, output_file_with_ext)
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Telegram" and channel.public:
|
||||
if channel.platform == "Telegram":
|
||||
return True
|
||||
|
||||
@logger.catch
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
username = channel.screenname
|
||||
if username is None:
|
||||
username = self.get_username_from_url(channel.url)
|
||||
username = TelegramTelethonScraper.get_channel_identifier(channel)
|
||||
|
||||
api_id = os.environ['TELEGRAM_API_ID']
|
||||
api_hash = os.environ['TELEGRAM_API_HASH']
|
||||
@@ -156,9 +166,7 @@ class TelegramTelethonScraper(Scraper):
|
||||
|
||||
@logger.catch
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||
username = channel.screenname
|
||||
if username is None:
|
||||
username = self.get_username_from_url(channel.url)
|
||||
username = TelegramTelethonScraper.get_channel_identifier(channel)
|
||||
|
||||
api_id = os.environ['TELEGRAM_API_ID']
|
||||
api_hash = os.environ['TELEGRAM_API_HASH']
|
||||
|
||||
@@ -2,9 +2,11 @@ from typing import List, Generator, Union, Callable
|
||||
from loguru import logger
|
||||
from sqlalchemy.orm import sessionmaker, make_transient
|
||||
from sqlalchemy.engine.base import Engine
|
||||
from sqlalchemy.sql.expression import func
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
|
||||
from cisticola.base import ScraperResult, Post, Media, Channel, mapper_registry
|
||||
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Media, Channel, mapper_registry
|
||||
|
||||
|
||||
class Transformer:
|
||||
@@ -94,7 +96,7 @@ class ETLController:
|
||||
|
||||
# This is using some adhoc unique constraints that might be worth formalizing at some point
|
||||
if type(obj) == Channel:
|
||||
instance = session.query(Channel).filter_by(url=obj.url, platform_id=obj.platform_id, platform=obj.platform).first()
|
||||
instance = session.query(Channel).filter_by(url=obj.url, platform_id=str(obj.platform_id), platform=obj.platform).first()
|
||||
|
||||
elif type(obj) == Post:
|
||||
instance = None
|
||||
@@ -130,9 +132,9 @@ class ETLController:
|
||||
if hydrate:
|
||||
obj.hydrate()
|
||||
|
||||
logger.info(f"Inserting new object {obj}")
|
||||
session.add(obj)
|
||||
session.flush()
|
||||
logger.trace(f"Inserted new object {obj}")
|
||||
|
||||
return obj
|
||||
|
||||
@logger.catch(reraise=True)
|
||||
@@ -151,21 +153,24 @@ class ETLController:
|
||||
logger.error("No DB session")
|
||||
return
|
||||
|
||||
session = self.session()
|
||||
|
||||
for result in results:
|
||||
for transformer in self.transformers:
|
||||
handled = False
|
||||
if result.scraper is not None and result.platform is not None:
|
||||
for transformer in self.transformers:
|
||||
handled = False
|
||||
|
||||
if transformer.can_handle(result):
|
||||
logger.trace(f"{transformer} is handling result {result}")
|
||||
handled = True
|
||||
session = self.session()
|
||||
if transformer.can_handle(result):
|
||||
logger.trace(f"{transformer} is handling result {result.id} ({result.date})")
|
||||
handled = True
|
||||
|
||||
transformer.transform(result, lambda obj: self.insert_or_select(obj, session, hydrate), session)
|
||||
session.commit()
|
||||
break
|
||||
transformer.transform(result, lambda obj: self.insert_or_select(obj, session, hydrate), session)
|
||||
|
||||
if handled == False:
|
||||
logger.warning(f"No Transformer could handle {result}")
|
||||
session.commit()
|
||||
break
|
||||
|
||||
if handled == False:
|
||||
logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})")
|
||||
|
||||
@logger.catch(reraise=True)
|
||||
def transform_all_untransformed(self, hydrate: bool = True):
|
||||
@@ -183,13 +188,77 @@ class ETLController:
|
||||
return
|
||||
|
||||
session = self.session()
|
||||
untransformed = (
|
||||
session.query(ScraperResult)
|
||||
|
||||
BATCH_SIZE = 50000
|
||||
offset = 0
|
||||
batch = []
|
||||
|
||||
query = (session.query(ScraperResult)
|
||||
.join(Post, isouter=True)
|
||||
.where(Post.raw_id == None)
|
||||
.order_by(ScraperResult.date.asc())
|
||||
.all()
|
||||
)
|
||||
logger.info(f"Found {len(untransformed)} items to ETL")
|
||||
|
||||
self.transform_results(untransformed, hydrate=hydrate)
|
||||
while len(batch) > 0 or offset == 0:
|
||||
logger.info(f"Fetching untransformed posts batch of {BATCH_SIZE}, offset {offset}")
|
||||
|
||||
batch = query.slice(offset, offset + BATCH_SIZE).all()
|
||||
offset += BATCH_SIZE
|
||||
|
||||
logger.info(f"Found {len(batch)} items to ETL ({offset} already processed)")
|
||||
|
||||
self.transform_results(batch, hydrate=hydrate)
|
||||
|
||||
@logger.catch(reraise=True)
|
||||
def transform_info(self, results: List[ChannelInfo]):
|
||||
if self.session is None:
|
||||
logger.error("No DB session")
|
||||
return
|
||||
|
||||
session = self.session()
|
||||
|
||||
for result in results:
|
||||
if result.scraper is not None and result.platform is not None:
|
||||
for transformer in self.transformers:
|
||||
handled = False
|
||||
|
||||
if transformer.can_handle(result):
|
||||
logger.trace(f"{transformer} is handling raw info result {result.id} ({result.date_archived})")
|
||||
handled = True
|
||||
|
||||
transformer.transform_info(result, lambda obj: self.insert_or_select(obj, session, False), session)
|
||||
|
||||
session.commit()
|
||||
break
|
||||
|
||||
if handled == False:
|
||||
logger.warning(f"No Transformer could handle raw channel info ID {result.id} with platform {result.platform} ({result.date_archived})")
|
||||
|
||||
|
||||
@logger.catch(reraise=True)
|
||||
def transform_all_untransformed_info(self):
|
||||
if self.session is None:
|
||||
logger.error("No DB session")
|
||||
return
|
||||
|
||||
session = self.session()
|
||||
|
||||
BATCH_SIZE = 50000
|
||||
offset = 0
|
||||
batch = []
|
||||
|
||||
query = (session.query(RawChannelInfo)
|
||||
.join(ChannelInfo, isouter=True)
|
||||
.where(ChannelInfo.raw_channel_info_id == None)
|
||||
.order_by(RawChannelInfo.date_archived.asc())
|
||||
)
|
||||
|
||||
while len(batch) > 0 or offset == 0:
|
||||
logger.info(f"Fetching untransformed info batch of {BATCH_SIZE}, offset {offset}")
|
||||
|
||||
batch = query.slice(offset, offset + BATCH_SIZE).all()
|
||||
offset += BATCH_SIZE
|
||||
|
||||
logger.info(f"Found {len(batch)} info items to ETL ({offset} already processed)")
|
||||
|
||||
self.transform_info(batch)
|
||||
|
||||
@@ -3,15 +3,20 @@ from loguru import logger
|
||||
from typing import Generator, Union, Callable
|
||||
import dateutil.parser
|
||||
from bs4 import BeautifulSoup
|
||||
from psycopg2 import DatabaseError
|
||||
import requests
|
||||
import time
|
||||
from telethon.sync import TelegramClient
|
||||
from telethon.errors.rpcerrorlist import ChannelPrivateError, ChannelInvalidError
|
||||
import os
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from cisticola.transformer.base import Transformer
|
||||
from cisticola.base import ScraperResult, Post, Image, Video, Media, Channel
|
||||
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
|
||||
|
||||
|
||||
class TelegramTelethonTransformer(Transformer):
|
||||
__version__ = 'TelegramTelethonTransformer 0.0.1'
|
||||
__version__ = 'TelegramTelethonTransformer 0.0.2'
|
||||
|
||||
bad_channels = {}
|
||||
|
||||
@@ -22,39 +27,93 @@ class TelegramTelethonTransformer(Transformer):
|
||||
|
||||
return False
|
||||
|
||||
def get_screenname_from_id(self, orig_screenname, id):
|
||||
def get_screenname_from_id(self, channel_id):
|
||||
api_id = os.environ['TELEGRAM_API_ID']
|
||||
api_hash = os.environ['TELEGRAM_API_HASH']
|
||||
|
||||
try:
|
||||
with TelegramClient("transform.session", api_id, api_hash) as client:
|
||||
data = client.get_entity(channel_id)
|
||||
|
||||
return (data.username, data.title, "")
|
||||
except ChannelPrivateError:
|
||||
logger.info("ChannelPrivateError")
|
||||
return ("", "", "ChannelPrivateError")
|
||||
except ChannelInvalidError:
|
||||
logger.info("ChannelInvalidError")
|
||||
return ("", "", "ChannelInvalidError")
|
||||
except ValueError:
|
||||
logger.info("ValueError")
|
||||
return ("", "", "ValueError")
|
||||
|
||||
def get_name_from_web_interface(self, orig_screenname, id):
|
||||
url = "https://t.me/s/" + orig_screenname + "/" + str(id)
|
||||
|
||||
# this doesn't work for chat channels
|
||||
if orig_screenname in self.bad_channels:
|
||||
logger.debug(f"Skipping screenname because it is not accessible for channel {orig_screenname}")
|
||||
return ("", "")
|
||||
|
||||
url = "https://t.me/s/" + orig_screenname + "/" + str(id)
|
||||
return ""
|
||||
|
||||
logger.info(f"Finding channel from URL {url}")
|
||||
r = requests.get(url)
|
||||
|
||||
if r.url != url:
|
||||
self.bad_channels[orig_screenname] = True
|
||||
return ("", "")
|
||||
return ""
|
||||
|
||||
soup = BeautifulSoup(r.content)
|
||||
post = soup.findAll("div", {"data-post" : orig_screenname + "/" + str(id)})
|
||||
name = ""
|
||||
|
||||
# multiple posts can be combined into one result in the web interface
|
||||
decrement = 0
|
||||
while len(post) == 0:
|
||||
decrement += 1
|
||||
if decrement > 8:
|
||||
break
|
||||
|
||||
logger.info(f"Could not find post from {url}, looking for id {id - decrement}")
|
||||
post = soup.findAll("div", {"data-post" : orig_screenname + "/" + str(id - decrement)})
|
||||
|
||||
if len(post) == 0:
|
||||
logger.warning(f"Could not find post from {url}")
|
||||
screenname = ""
|
||||
name = ""
|
||||
else:
|
||||
fwd_tag = post[0].findAll("a", {"class", "tgme_widget_message_forwarded_from_name"})
|
||||
|
||||
if len(fwd_tag) > 0:
|
||||
fwd_tag = fwd_tag[0]
|
||||
name = fwd_tag.text
|
||||
screenname = fwd_tag['href'].split('/')[-2]
|
||||
else:
|
||||
if len(fwd_tag) == 0:
|
||||
fwd_tag = post[0].findAll("span", {"class", "tgme_widget_message_forwarded_from_name"})
|
||||
|
||||
if len(fwd_tag) >= 1:
|
||||
name = fwd_tag[0].text
|
||||
screenname = ""
|
||||
|
||||
return (screenname, name)
|
||||
return name
|
||||
|
||||
def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||
raw = json.loads(data.raw_data)
|
||||
|
||||
chat_raw = raw['chats'][0]
|
||||
|
||||
transformed = ChannelInfo(
|
||||
raw_channel_info_id=data.id,
|
||||
channel=data.channel,
|
||||
platform_id=raw['full_chat']['id'],
|
||||
platform=data.platform,
|
||||
scraper=data.scraper,
|
||||
transformer=self.__version__,
|
||||
screenname=chat_raw['username'],
|
||||
name=chat_raw['title'],
|
||||
description=raw['full_chat']['about'],
|
||||
description_url='', # does not exist for Telegram
|
||||
description_location='', # does not exist for Telegram
|
||||
followers=raw['full_chat']['participants_count'],
|
||||
following=-1, # does not exist for Telegram
|
||||
verified=False, #does not exist for Telegram
|
||||
date_created=dateutil.parser.parse(chat_raw['date']),
|
||||
date_archived=data.date_archived,
|
||||
date_transformed=datetime.now(timezone.utc)
|
||||
)
|
||||
|
||||
transformed = insert(transformed)
|
||||
|
||||
def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||
raw = json.loads(data.raw_data)
|
||||
@@ -66,32 +125,30 @@ class TelegramTelethonTransformer(Transformer):
|
||||
fwd_from = None
|
||||
|
||||
if raw['fwd_from'] and raw['fwd_from']['from_id'] and 'channel_id' in raw['fwd_from']['from_id']:
|
||||
channel = session.query(Channel).filter_by(platform_id=raw['fwd_from']['from_id']['channel_id']).first()
|
||||
channel = session.query(Channel).filter_by(platform_id=str(raw['fwd_from']['from_id']['channel_id'])).first()
|
||||
|
||||
if channel is None:
|
||||
orig_channel = session.query(Channel).filter_by(id=data.channel).first()
|
||||
(screenname, name) = self.get_screenname_from_id(orig_channel.screenname, raw['id'])
|
||||
(screenname, name, notes) = self.get_screenname_from_id(raw['fwd_from']['from_id']['channel_id'])
|
||||
|
||||
if name == "":
|
||||
logger.info("Trying fallback web interface")
|
||||
orig_channel = session.query(Channel).filter_by(id=data.channel).first()
|
||||
if orig_channel.screenname is not None:
|
||||
name = self.get_name_from_web_interface(orig_channel.screenname, raw['id'])
|
||||
|
||||
channel = Channel(
|
||||
name=name,
|
||||
platform_id=raw['fwd_from']['from_id']['channel_id'],
|
||||
platform=data.platform,
|
||||
url="https://t.me/s/" + screenname,
|
||||
url="https://t.me/s/" + screenname if screenname is not None else "",
|
||||
screenname=screenname,
|
||||
category='forwarded',
|
||||
source=self.__version__
|
||||
source=self.__version__,
|
||||
notes=notes
|
||||
)
|
||||
|
||||
channel = insert(channel)
|
||||
elif channel.screenname == "":
|
||||
# if the screenname is empty, we can fill it in
|
||||
orig_channel = session.query(Channel).filter_by(id=data.channel).first()
|
||||
(screenname, name) = self.get_screenname_from_id(orig_channel.screenname, raw['id'])
|
||||
|
||||
channel.screenname = screenname
|
||||
channel.name = name
|
||||
channel.url = "https://t.me/s/" + screenname
|
||||
session.flush()
|
||||
logger.info(f"Added {channel}")
|
||||
|
||||
fwd_from = channel.id
|
||||
|
||||
@@ -113,6 +170,7 @@ class TelegramTelethonTransformer(Transformer):
|
||||
channel=data.channel,
|
||||
date=dateutil.parser.parse(raw['date']),
|
||||
date_archived=data.date_archived,
|
||||
date_transformed=datetime.now(timezone.utc),
|
||||
url="",
|
||||
content=raw['message'],
|
||||
author_id=raw['post_author'],
|
||||
@@ -123,14 +181,14 @@ class TelegramTelethonTransformer(Transformer):
|
||||
|
||||
transformed = insert(transformed)
|
||||
|
||||
for k in data.archived_urls:
|
||||
if data.archived_urls[k]:
|
||||
archived_url = data.archived_urls[k]
|
||||
ext = archived_url.split('.')[-1]
|
||||
# for k in data.archived_urls:
|
||||
# if data.archived_urls[k]:
|
||||
# archived_url = data.archived_urls[k]
|
||||
# ext = archived_url.split('.')[-1]
|
||||
|
||||
if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv':
|
||||
insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
|
||||
else:
|
||||
insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
|
||||
# if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv':
|
||||
# insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
|
||||
# else:
|
||||
# insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
|
||||
|
||||
|
||||
@@ -1,138 +0,0 @@
|
||||
import sys
|
||||
|
||||
from sqlalchemy import create_engine
|
||||
from loguru import logger
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import (
|
||||
ScraperController,
|
||||
TelegramSnscrapeScraper)
|
||||
|
||||
logger.remove()
|
||||
logger.add(sys.stderr, level="INFO")
|
||||
logger.add("../russian_telegram_ingest.log")
|
||||
|
||||
test_channels = [
|
||||
Channel(
|
||||
id=0,
|
||||
name="QAnon Россия",
|
||||
platform_id=-1001319637748,
|
||||
category="Qanon",
|
||||
followers=94048,
|
||||
platform="Telegram",
|
||||
url="https://t.me/qanonrus",
|
||||
screenname="qanonrus",
|
||||
country="RU",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=1,
|
||||
name="The Great Awakening | Q",
|
||||
platform_id=-1001325597521,
|
||||
category="Qanon",
|
||||
followers=5715,
|
||||
platform="Telegram",
|
||||
url="https://t.me/greatawakin",
|
||||
screenname="greatawakin",
|
||||
country="RU",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=2,
|
||||
name="Великое Пробуждение",
|
||||
platform_id=-1001285898079,
|
||||
category="Qanon",
|
||||
followers=5861,
|
||||
platform="Telegram",
|
||||
url="https://t.me/greatawakeningrus",
|
||||
screenname="greatawakeningrus",
|
||||
country="RU",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=3,
|
||||
name="T🕊Редакция Президент Гордон🕊",
|
||||
platform_id=-1001101170442,
|
||||
category="Qanon",
|
||||
followers=5743,
|
||||
platform="Telegram",
|
||||
url="https://t.me/prezidentgordonteam",
|
||||
screenname="prezidentgordonteam",
|
||||
country="RU",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=4,
|
||||
name="ПРОЕКТ АВРОРА",
|
||||
platform_id=-1001279171101,
|
||||
category="Qanon",
|
||||
followers=5930,
|
||||
platform="Telegram",
|
||||
url="https://t.me/project_aurora",
|
||||
screenname="project_aurora",
|
||||
country="RU",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=5,
|
||||
name="Сон Разума",
|
||||
platform_id=-1001202338312,
|
||||
category="Qanon",
|
||||
followers=27099,
|
||||
platform="Telegram",
|
||||
url="https://t.me/error_288",
|
||||
screenname="error_288",
|
||||
country="RU",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=6,
|
||||
name="Пробуждающий Мир - официальный канал",
|
||||
platform_id=-1001492521207,
|
||||
category="Qanon",
|
||||
followers=19097,
|
||||
platform="Telegram",
|
||||
url="https://t.me/promirru",
|
||||
screenname="promirru",
|
||||
country="RU",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=7,
|
||||
name="ЦЕЛЬНОЗОР",
|
||||
platform_id=-1001642737506,
|
||||
category="Qanon",
|
||||
followers=13654,
|
||||
platform="Telegram",
|
||||
url="https://t.me/tselnozor",
|
||||
screenname="tselnozor",
|
||||
country="RU",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),]
|
||||
|
||||
controller = ScraperController()
|
||||
|
||||
telegram = TelegramSnscrapeScraper()
|
||||
controller.register_scraper(telegram)
|
||||
|
||||
engine = create_engine('sqlite:///russian_telegram.db')
|
||||
controller.connect_to_db(engine)
|
||||
|
||||
controller.scrape_channels(test_channels, archive_media = False)
|
||||
|
||||
9
spacy_setup.sh
Normal file
9
spacy_setup.sh
Normal file
@@ -0,0 +1,9 @@
|
||||
#!/bin/bash
|
||||
|
||||
pipenv run python -m spacy download xx_ent_wiki_sm
|
||||
pipenv run python -m spacy download fr_core_news_sm
|
||||
pipenv run python -m spacy download de_core_news_sm
|
||||
pipenv run python -m spacy download nl_core_news_sm
|
||||
pipenv run python -m spacy download it_core_news_sm
|
||||
pipenv run python -m spacy download ru_core_news_sm
|
||||
pipenv run python -m spacy download en_core_web_sm
|
||||
134
sync_with_gsheet.py
Normal file
134
sync_with_gsheet.py
Normal file
@@ -0,0 +1,134 @@
|
||||
import gspread
|
||||
import time
|
||||
from loguru import logger
|
||||
|
||||
from cisticola.base import Channel, ChannelInfo
|
||||
|
||||
def sync_channels(args, session):
|
||||
logger.info("Synchronizing channels")
|
||||
|
||||
gc = gspread.service_account(filename="service_account.json")
|
||||
|
||||
# Open a sheet from a spreadsheet in one go
|
||||
wks = gc.open_by_url(args.gsheet).worksheet("channels")
|
||||
channels = wks.get_all_records()
|
||||
row = 2
|
||||
|
||||
for c in channels:
|
||||
# defaults for unset values
|
||||
if c["public"] == "":
|
||||
c["public"] = True
|
||||
if c["chat"] == "":
|
||||
c["chat"] = False
|
||||
|
||||
# normalize the values slightly from the Google Sheet
|
||||
for k in c.keys():
|
||||
if c[k] == "TRUE" or c[k] == "yes":
|
||||
c[k] = True
|
||||
if c[k] == "FALSE" or c[k] == "no":
|
||||
c[k] = False
|
||||
|
||||
if c[k] == "":
|
||||
c[k] = None
|
||||
|
||||
# add new channel
|
||||
if c["id"] == "" or c["id"] is None:
|
||||
del c["id"]
|
||||
|
||||
# check to see if this already exists,
|
||||
platform_id = None
|
||||
if c["platform_id"] != "":
|
||||
platform_id = c["platform_id"]
|
||||
|
||||
channel = (
|
||||
session.query(Channel)
|
||||
.filter_by(
|
||||
platform_id=str(platform_id), platform=str(c["platform"])
|
||||
)
|
||||
.first()
|
||||
)
|
||||
|
||||
if not channel:
|
||||
channel = session.query(Channel).filter_by(platform=str(c["platform"]), url=str(c["url"])).first()
|
||||
|
||||
if not channel and c["screenname"] != '' and c["screenname"] is not None:
|
||||
channel = session.query(Channel).filter_by(platform=str(c["platform"]), screenname=str(c["screenname"])).first()
|
||||
|
||||
if not channel:
|
||||
channel = Channel(**c, source="researcher")
|
||||
logger.debug(f"{channel} does not exist, adding")
|
||||
session.add(channel)
|
||||
session.flush()
|
||||
session.commit()
|
||||
|
||||
wks.update_cell(row, 1, channel.id)
|
||||
time.sleep(1)
|
||||
else:
|
||||
logger.info(f"Channel found, updating channel {channel}")
|
||||
was_researcher = channel.source == "researcher"
|
||||
|
||||
channel.name = c["name"]
|
||||
channel.category = c["category"]
|
||||
channel.platform = c["platform"]
|
||||
channel.url = c["url"]
|
||||
channel.screenname = c["screenname"]
|
||||
channel.country = c["country"]
|
||||
channel.influencer = c["influencer"]
|
||||
channel.public = c["public"]
|
||||
channel.chat = c["chat"]
|
||||
channel.notes = c["notes"]
|
||||
channel.source = "researcher"
|
||||
|
||||
session.flush()
|
||||
session.commit()
|
||||
|
||||
wks.update_cell(row, 1, channel.id)
|
||||
|
||||
# this likely means that the channel was duplicated in the Google Sheet, so add a red highlight
|
||||
if was_researcher:
|
||||
logger.warning(f"This channel (ID {channel.id}) is possibly a duplicate.")
|
||||
|
||||
wks.format(f"A{str(row)}:A{str(row)}", {
|
||||
"backgroundColor": {
|
||||
"red": 1.0,
|
||||
"green": 0.0,
|
||||
"blue": 0.0
|
||||
}})
|
||||
|
||||
time.sleep(1)
|
||||
|
||||
# channel has ID
|
||||
else:
|
||||
cid = int(c["id"])
|
||||
|
||||
channel = session.query(Channel).filter_by(id=cid).first()
|
||||
channel_info = session.query(ChannelInfo).filter_by(channel=cid).order_by(ChannelInfo.date_archived.desc()).first()
|
||||
|
||||
logger.info(f"Updating channel {channel}")
|
||||
logger.info(channel_info)
|
||||
|
||||
channel.name = c["name"]
|
||||
channel.category = c["category"]
|
||||
channel.platform = c["platform"]
|
||||
channel.url = c["url"]
|
||||
channel.screenname = c["screenname"]
|
||||
channel.country = c["country"]
|
||||
channel.influencer = c["influencer"]
|
||||
channel.public = c["public"]
|
||||
channel.chat = c["chat"]
|
||||
channel.notes = c["notes"]
|
||||
channel.source = "researcher"
|
||||
|
||||
if channel_info:
|
||||
channel.screenname = channel_info.screenname
|
||||
wks.update_cell(row, 7, channel_info.screenname)
|
||||
|
||||
channel.platform_id = channel_info.platform_id
|
||||
wks.update_cell(row, 3, channel_info.platform_id)
|
||||
|
||||
session.flush()
|
||||
session.commit()
|
||||
|
||||
row += 1
|
||||
|
||||
session.commit()
|
||||
Reference in New Issue
Block a user