mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-12 05:18:33 +03:00
Bug fixes in transformers
This commit is contained in:
@@ -192,20 +192,21 @@ class Post:
|
||||
def hydrate(self):
|
||||
URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""
|
||||
|
||||
urls = re.findall(URL_REGEX, self.content)
|
||||
# replace is here in order to prevent catastrophic backtracking
|
||||
urls = re.findall(URL_REGEX, self.content.replace("::::::::", ""))
|
||||
self.outlinks = urls
|
||||
|
||||
HASHTAG_REGEX = r"(?:^|\s)[##]{1}(\w+)"
|
||||
|
||||
hashtags = re.findall(HASHTAG_REGEX, self.content)
|
||||
self.hashtags = hashtags
|
||||
|
||||
|
||||
# regex patterns for finding crypto addresses
|
||||
BTC_REGEX = r'\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b'
|
||||
ETHER_REGEX = r'(0x[a-fA-F0-9]{40})'
|
||||
|
||||
self.cryptocurrency_addresses = [m[0] for m in re.findall(BTC_REGEX, self.content)] + re.findall(ETHER_REGEX, self.content)
|
||||
|
||||
|
||||
try:
|
||||
self.detected_language = detect(self.content)
|
||||
except LangDetectException:
|
||||
|
||||
@@ -156,20 +156,21 @@ class ETLController:
|
||||
session = self.session()
|
||||
|
||||
for result in results:
|
||||
for transformer in self.transformers:
|
||||
handled = False
|
||||
if result.scraper is not None and result.platform is not None:
|
||||
for transformer in self.transformers:
|
||||
handled = False
|
||||
|
||||
if transformer.can_handle(result):
|
||||
logger.trace(f"{transformer} is handling result {result}")
|
||||
handled = True
|
||||
if transformer.can_handle(result):
|
||||
logger.trace(f"{transformer} is handling result {result.id} ({result.date})")
|
||||
handled = True
|
||||
|
||||
transformer.transform(result, lambda obj: self.insert_or_select(obj, session, hydrate), session)
|
||||
transformer.transform(result, lambda obj: self.insert_or_select(obj, session, hydrate), session)
|
||||
|
||||
session.commit()
|
||||
break
|
||||
session.commit()
|
||||
break
|
||||
|
||||
if handled == False:
|
||||
logger.warning(f"No Transformer could handle {result}")
|
||||
if handled == False:
|
||||
logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})")
|
||||
|
||||
@logger.catch(reraise=True)
|
||||
def transform_all_untransformed(self, hydrate: bool = True):
|
||||
@@ -187,17 +188,33 @@ class ETLController:
|
||||
return
|
||||
|
||||
session = self.session()
|
||||
untransformed = (
|
||||
session.query(ScraperResult)
|
||||
.filter_by(platform="Telegram")
|
||||
.filter(ScraperResult.raw_data.notlike("%MessageService%"))
|
||||
|
||||
BATCH_SIZE = 50000
|
||||
offset = 0
|
||||
batch = []
|
||||
|
||||
query = (session.query(ScraperResult)
|
||||
# .filter_by(platform="Telegram")
|
||||
# .filter(ScraperResult.raw_data.notlike("%MessageService%"))
|
||||
.join(Post, isouter=True)
|
||||
.where(Post.raw_id == None)
|
||||
# .order_by(func.random())
|
||||
.order_by(ScraperResult.date.asc())
|
||||
.limit(100000)
|
||||
.all()
|
||||
)
|
||||
logger.info(f"Found {len(untransformed)} items to ETL")
|
||||
|
||||
self.transform_results(untransformed, hydrate=hydrate)
|
||||
while len(batch) > 0 or offset == 0:
|
||||
logger.info(f"Fetching untransformed batch of {BATCH_SIZE}, offset {offset}")
|
||||
|
||||
batch = query.slice(offset, offset + BATCH_SIZE).all()
|
||||
# untransformed = (
|
||||
|
||||
# .limit(BATCH_SIZE)
|
||||
# .offset(offset)
|
||||
# .all()
|
||||
# )
|
||||
|
||||
offset += BATCH_SIZE
|
||||
|
||||
logger.info(f"Found {len(batch)} items to ETL ({offset} already processed)")
|
||||
|
||||
self.transform_results(batch, hydrate=hydrate)
|
||||
|
||||
@@ -63,6 +63,7 @@ class TelegramTelethonTransformer(Transformer):
|
||||
|
||||
soup = BeautifulSoup(r.content)
|
||||
post = soup.findAll("div", {"data-post" : orig_screenname + "/" + str(id)})
|
||||
name = ""
|
||||
|
||||
# multiple posts can be combined into one result in the web interface
|
||||
decrement = 0
|
||||
@@ -76,7 +77,6 @@ class TelegramTelethonTransformer(Transformer):
|
||||
|
||||
if len(post) == 0:
|
||||
logger.warning(f"Could not find post from {url}")
|
||||
name = ""
|
||||
else:
|
||||
fwd_tag = post[0].findAll("a", {"class", "tgme_widget_message_forwarded_from_name"})
|
||||
|
||||
|
||||
@@ -1,138 +0,0 @@
|
||||
import sys
|
||||
|
||||
from sqlalchemy import create_engine
|
||||
from loguru import logger
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import (
|
||||
ScraperController,
|
||||
TelegramSnscrapeScraper)
|
||||
|
||||
logger.remove()
|
||||
logger.add(sys.stderr, level="INFO")
|
||||
logger.add("../russian_telegram_ingest.log")
|
||||
|
||||
test_channels = [
|
||||
Channel(
|
||||
id=0,
|
||||
name="QAnon Россия",
|
||||
platform_id=-1001319637748,
|
||||
category="Qanon",
|
||||
followers=94048,
|
||||
platform="Telegram",
|
||||
url="https://t.me/qanonrus",
|
||||
screenname="qanonrus",
|
||||
country="RU",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=1,
|
||||
name="The Great Awakening | Q",
|
||||
platform_id=-1001325597521,
|
||||
category="Qanon",
|
||||
followers=5715,
|
||||
platform="Telegram",
|
||||
url="https://t.me/greatawakin",
|
||||
screenname="greatawakin",
|
||||
country="RU",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=2,
|
||||
name="Великое Пробуждение",
|
||||
platform_id=-1001285898079,
|
||||
category="Qanon",
|
||||
followers=5861,
|
||||
platform="Telegram",
|
||||
url="https://t.me/greatawakeningrus",
|
||||
screenname="greatawakeningrus",
|
||||
country="RU",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=3,
|
||||
name="T🕊Редакция Президент Гордон🕊",
|
||||
platform_id=-1001101170442,
|
||||
category="Qanon",
|
||||
followers=5743,
|
||||
platform="Telegram",
|
||||
url="https://t.me/prezidentgordonteam",
|
||||
screenname="prezidentgordonteam",
|
||||
country="RU",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=4,
|
||||
name="ПРОЕКТ АВРОРА",
|
||||
platform_id=-1001279171101,
|
||||
category="Qanon",
|
||||
followers=5930,
|
||||
platform="Telegram",
|
||||
url="https://t.me/project_aurora",
|
||||
screenname="project_aurora",
|
||||
country="RU",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=5,
|
||||
name="Сон Разума",
|
||||
platform_id=-1001202338312,
|
||||
category="Qanon",
|
||||
followers=27099,
|
||||
platform="Telegram",
|
||||
url="https://t.me/error_288",
|
||||
screenname="error_288",
|
||||
country="RU",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=6,
|
||||
name="Пробуждающий Мир - официальный канал",
|
||||
platform_id=-1001492521207,
|
||||
category="Qanon",
|
||||
followers=19097,
|
||||
platform="Telegram",
|
||||
url="https://t.me/promirru",
|
||||
screenname="promirru",
|
||||
country="RU",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=7,
|
||||
name="ЦЕЛЬНОЗОР",
|
||||
platform_id=-1001642737506,
|
||||
category="Qanon",
|
||||
followers=13654,
|
||||
platform="Telegram",
|
||||
url="https://t.me/tselnozor",
|
||||
screenname="tselnozor",
|
||||
country="RU",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),]
|
||||
|
||||
controller = ScraperController()
|
||||
|
||||
telegram = TelegramSnscrapeScraper()
|
||||
controller.register_scraper(telegram)
|
||||
|
||||
engine = create_engine('sqlite:///russian_telegram.db')
|
||||
controller.connect_to_db(engine)
|
||||
|
||||
controller.scrape_channels(test_channels, archive_media = False)
|
||||
|
||||
Reference in New Issue
Block a user