Bug fixes in transformers

This commit is contained in:
Logan Williams
2022-05-13 15:39:01 +00:00
parent 34da733e7c
commit 7f55b721dd
4 changed files with 40 additions and 160 deletions

View File

@@ -192,20 +192,21 @@ class Post:
def hydrate(self):
URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/
?(?!@)))"""
urls = re.findall(URL_REGEX, self.content)
# replace is here in order to prevent catastrophic backtracking
urls = re.findall(URL_REGEX, self.content.replace("::::::::", ""))
self.outlinks = urls
HASHTAG_REGEX = r"(?:^|\s)[#]{1}(\w+)"
hashtags = re.findall(HASHTAG_REGEX, self.content)
self.hashtags = hashtags
# regex patterns for finding crypto addresses
BTC_REGEX = r'\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b'
ETHER_REGEX = r'(0x[a-fA-F0-9]{40})'
self.cryptocurrency_addresses = [m[0] for m in re.findall(BTC_REGEX, self.content)] + re.findall(ETHER_REGEX, self.content)
try:
self.detected_language = detect(self.content)
except LangDetectException:

View File

@@ -156,20 +156,21 @@ class ETLController:
session = self.session()
for result in results:
for transformer in self.transformers:
handled = False
if result.scraper is not None and result.platform is not None:
for transformer in self.transformers:
handled = False
if transformer.can_handle(result):
logger.trace(f"{transformer} is handling result {result}")
handled = True
if transformer.can_handle(result):
logger.trace(f"{transformer} is handling result {result.id} ({result.date})")
handled = True
transformer.transform(result, lambda obj: self.insert_or_select(obj, session, hydrate), session)
transformer.transform(result, lambda obj: self.insert_or_select(obj, session, hydrate), session)
session.commit()
break
session.commit()
break
if handled == False:
logger.warning(f"No Transformer could handle {result}")
if handled == False:
logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})")
@logger.catch(reraise=True)
def transform_all_untransformed(self, hydrate: bool = True):
@@ -187,17 +188,33 @@ class ETLController:
return
session = self.session()
untransformed = (
session.query(ScraperResult)
.filter_by(platform="Telegram")
.filter(ScraperResult.raw_data.notlike("%MessageService%"))
BATCH_SIZE = 50000
offset = 0
batch = []
query = (session.query(ScraperResult)
# .filter_by(platform="Telegram")
# .filter(ScraperResult.raw_data.notlike("%MessageService%"))
.join(Post, isouter=True)
.where(Post.raw_id == None)
# .order_by(func.random())
.order_by(ScraperResult.date.asc())
.limit(100000)
.all()
)
logger.info(f"Found {len(untransformed)} items to ETL")
self.transform_results(untransformed, hydrate=hydrate)
while len(batch) > 0 or offset == 0:
logger.info(f"Fetching untransformed batch of {BATCH_SIZE}, offset {offset}")
batch = query.slice(offset, offset + BATCH_SIZE).all()
# untransformed = (
# .limit(BATCH_SIZE)
# .offset(offset)
# .all()
# )
offset += BATCH_SIZE
logger.info(f"Found {len(batch)} items to ETL ({offset} already processed)")
self.transform_results(batch, hydrate=hydrate)

View File

@@ -63,6 +63,7 @@ class TelegramTelethonTransformer(Transformer):
soup = BeautifulSoup(r.content)
post = soup.findAll("div", {"data-post" : orig_screenname + "/" + str(id)})
name = ""
# multiple posts can be combined into one result in the web interface
decrement = 0
@@ -76,7 +77,6 @@ class TelegramTelethonTransformer(Transformer):
if len(post) == 0:
logger.warning(f"Could not find post from {url}")
name = ""
else:
fwd_tag = post[0].findAll("a", {"class", "tgme_widget_message_forwarded_from_name"})

View File

@@ -1,138 +0,0 @@
import sys
from sqlalchemy import create_engine
from loguru import logger
from cisticola.base import Channel
from cisticola.scraper import (
ScraperController,
TelegramSnscrapeScraper)
# Ingest script: scrape a fixed set of Russian-language Telegram channels into
# a local SQLite database (russian_telegram.db) without archiving media.

# Reset loguru's sinks, then log INFO and above to stderr and add a second
# sink writing to a log file one directory above the working directory.
logger.remove()
logger.add(sys.stderr, level="INFO")
logger.add("../russian_telegram_ingest.log")

# One row per channel: (name, platform_id, followers, screenname).
# Every other Channel field is identical across the set; the channel id is
# the row's position and the URL is always https://t.me/<screenname>.
_CHANNEL_ROWS = [
    ("QAnon Россия", -1001319637748, 94048, "qanonrus"),
    ("The Great Awakening | Q", -1001325597521, 5715, "greatawakin"),
    ("Великое Пробуждение", -1001285898079, 5861, "greatawakeningrus"),
    ("T🕊Редакция Президент Гордон🕊", -1001101170442, 5743, "prezidentgordonteam"),
    ("ПРОЕКТ АВРОРА", -1001279171101, 5930, "project_aurora"),
    ("Сон Разума", -1001202338312, 27099, "error_288"),
    ("Пробуждающий Мир - официальный канал", -1001492521207, 19097, "promirru"),
    ("ЦЕЛЬНОЗОР", -1001642737506, 13654, "tselnozor"),
]

test_channels = [
    Channel(
        id=index,
        name=name,
        platform_id=platform_id,
        category="Qanon",
        followers=followers,
        platform="Telegram",
        url=f"https://t.me/{screenname}",
        screenname=screenname,
        country="RU",
        influencer=None,
        public=True,
        chat=False,
        notes="",
    )
    for index, (name, platform_id, followers, screenname) in enumerate(_CHANNEL_ROWS)
]

# Wire up the scraper controller with the snscrape-based Telegram scraper.
controller = ScraperController()
telegram = TelegramSnscrapeScraper()
controller.register_scraper(telegram)

# Results are stored in a SQLite file relative to the working directory.
engine = create_engine('sqlite:///russian_telegram.db')
controller.connect_to_db(engine)

# Run the scrape over every channel above; media archiving is disabled.
controller.scrape_channels(test_channels, archive_media=False)