Files
cisticola/cisticola/base.py
2022-05-18 09:17:49 +01:00

472 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from typing import List
from dataclasses import dataclass, field
from datetime import datetime
import tempfile
import json
import io
from sqlalchemy.orm import registry
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean
import pytesseract
import PIL
import exiftool
import re
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
from loguru import logger
import spacy
from .utils import make_request
@dataclass
class ScraperResult:
    """Raw output of a scraper for a single post, with minimal processing applied."""

    #: Name and version of the scraper that produced the result, e.g. ``"TwitterScraper 0.0.1"``.
    scraper: str

    #: Platform the result was scraped from, e.g. ``"Twitter"``.
    platform: str

    #: Foreign key (channel ID) of the channel the result was scraped from.
    channel: int

    #: Platform-specific string uniquely identifying the post, e.g. ``"1503397267675533313"``.
    platform_id: str

    #: Creation time of the scraped post (relative to UTC).
    date: datetime

    #: JSON dump of the dict holding everything that was scraped for the post.
    raw_data: str

    #: Time (relative to UTC) at which the scraped post was archived.
    date_archived: datetime

    #: Mapping from each original media URL in the post to the URL of its archived copy.
    archived_urls: dict

    #: Time at which the media was archived (``None`` if it has not been archived).
    media_archived: datetime
@dataclass
class Channel:
    """Description of one channel that is to be scraped."""

    #: Display name of the channel. Unlike the username it need not be unique and may
    #: contain emojis, e.g. ``"T🕊Редакция Президент Гордон🕊"``.
    name: str

    #: Platform-specific string uniquely identifying the channel, e.g. ``"-1001101170442"``.
    platform_id: str

    #: User-defined category of the channel, e.g. ``"explicit_qanon"``.
    category: str

    #: Platform the channel lives on, e.g. ``"Telegram"``.
    platform: str

    #: URL of the channel on its platform, e.g. ``"https://t.me/prezidentgordonteam"``.
    url: str

    #: Screen name / username of the channel.
    screenname: str

    #: Two-letter country code of the channel's country of origin, e.g. ``"RU"``.
    country: str = None

    #: Influencer name, when the channel belongs to an influencer active on several platforms.
    influencer: str = None

    #: Whether the channel is publicly accessible.
    public: bool = None

    #: Whether the channel is a chat, i.e. users other than its creator can post/message.
    chat: bool = None

    #: Free-form additional notes about the channel.
    notes: str = ""

    #: Origin of the channel entry: a researcher or a scraping process.
    source: str = None

    def hydrate(self):
        """Placeholder hook; currently does nothing and returns ``None``."""
        return None
@dataclass
class RawChannelInfo:
    """Raw channel information returned by a scraper, with minimal processing applied."""

    #: Name and version of the scraper that produced the result, e.g. ``"TwitterScraper 0.0.1"``.
    scraper: str

    #: Platform the result was scraped from, e.g. ``"Twitter"``.
    platform: str

    #: Foreign key (channel ID) of the channel the result was scraped from.
    channel: int

    #: JSON dump of the dict holding everything that was scraped.
    raw_data: str

    #: Time (relative to UTC) at which the scraped data was archived.
    date_archived: datetime
@dataclass
class ChannelInfo:
    """Processed (transformed) information about a channel."""

    #: Foreign key into the ``raw_channel_info`` table.
    raw_channel_info_id: int

    #: Foreign key into the ``channels`` table.
    channel: int

    #: Platform-specific ID of the channel.
    platform_id: str

    #: Platform the result was scraped from, e.g. ``"Twitter"``.
    platform: str

    #: Name and version of the scraper that produced the result, e.g. ``"TwitterScraper 0.0.1"``.
    scraper: str

    #: Name and version of the transformer that transformed the result, e.g. ``"TwitterTransformer 0.0.1"``.
    transformer: str

    #: Attributes extracted from the raw channel info object.
    screenname: str
    name: str
    description: str
    description_url: str
    description_location: str
    followers: int
    following: int
    verified: bool
    date_created: datetime

    #: Time (relative to UTC) at which the scraped channel info was archived.
    date_archived: datetime

    #: Time (UTC) at which the scraped channel info was transformed.
    date_transformed: datetime

    def hydrate(self):
        """Placeholder hook; currently does nothing and returns ``None``."""
        return None
def _load_language_model(model_name):
    """Load a language-specific spaCy model with the parser, tok2vec and attribute_ruler disabled."""
    return spacy.load(model_name, disable=['parser', 'tok2vec', 'attribute_ruler'])


# Language-specific pipelines, loaded once at import time and used by ``Post.hydrate_spacy``.
nlp_en = _load_language_model('en_core_web_sm')
nlp_de = _load_language_model('de_core_news_sm')
nlp_it = _load_language_model('it_core_news_sm')
nlp_fr = _load_language_model('fr_core_news_sm')
nlp_ru = _load_language_model('ru_core_news_sm')
nlp_nl = _load_language_model('nl_core_news_sm')

# Multi-language model, loaded with all of its components; used when no
# language-specific model is available.
nlp_xx = spacy.load('xx_ent_wiki_sm')
@dataclass
class Post:
    """A transformed post, with one field per column of the analysis (``posts``) table.

    The fields without defaults are supplied by the caller. The remaining
    fields are derived from ``content`` by :meth:`hydrate`.
    """

    #: ID number of the scraped post in the ``raw_posts`` table
    raw_id: int

    #: Platform specific post ID
    platform_id: str

    #: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
    scraper: str

    #: String specifying name and version of transformer used to transform result, e.g. ``"TwitterTransformer 0.0.1"``.
    transformer: str

    #: Name of platform from which result was scraped, e.g. ``"Twitter"``.
    platform: str

    #: User-specified integer that uniquely identifies a channel, e.g. ``15``.
    channel: int

    #: Datetime (relative to UTC) that the scraped post was created at.
    date: datetime

    #: Datetime (relative to UTC) that the scraped post was archived at.
    date_archived: datetime

    #: Datetime (UTC) that the scraped post was transformed at.
    date_transformed: datetime

    #: URL of the original post
    url: str

    #: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``.
    author_id: str

    #: Username of author who made post.
    author_username: str

    #: Text of the original post
    content: str

    #: Named entities detected in post, as ``{'text': ..., 'type': ...}`` dicts
    named_entities: list = field(default_factory=list)

    #: Any cryptocurrency addresses in post
    cryptocurrency_addresses: list = field(default_factory=list)

    #: Hashtags in post (without the leading ``#``)
    hashtags: list = field(default_factory=list)

    #: Links to any other websites
    outlinks: list = field(default_factory=list)

    #: Detected language of post (empty string if detection failed)
    detected_language: str = ""

    #: Normalized post content
    normalized_content: str = ""

    #: The ID of the Channel that the post was forwarded or quoted from
    forwarded_from: int = None

    #: The ID of the Post that this Post is a reply to or reblog of
    reply_to: int = None

    # NOTE: the constants below are deliberately left un-annotated so that
    # ``@dataclass`` does not turn them into fields.

    # URL matcher. It must stay on a single line: the pattern is compiled
    # without ``re.VERBOSE``, so a line break inside the literal would be
    # matched literally and would turn the final optional ``/?`` into a
    # mandatory ``/``, which stops bare domains such as ``example.com`` from
    # being recognised.
    _URL_PATTERN = re.compile(r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))""")

    # A ``#`` at the start of the text or after whitespace, followed by word characters.
    _HASHTAG_PATTERN = re.compile(r"(?:^|\s)[#]{1}(\w+)")

    # Patterns for finding cryptocurrency addresses.
    _BTC_PATTERN = re.compile(r'\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b')
    _ETHER_PATTERN = re.compile(r'(0x[a-fA-F0-9]{40})')

    # Lemmas that are dropped from the normalized content.
    _PUNCTUATION = frozenset(['?', ':', '!', ',', '.', ';', '|', '(', ')', '--', '#', '=', '+'])

    @classmethod
    def _find_outlinks(cls, text):
        """Return every URL-like substring of ``text``, in order of appearance."""
        # Long runs of colons are removed to prevent catastrophic backtracking.
        return cls._URL_PATTERN.findall(text.replace("::::::::", ""))

    @classmethod
    def _find_cryptocurrency_addresses(cls, text):
        """Return Bitcoin-style addresses followed by Ethereum-style addresses found in ``text``."""
        # The BTC pattern has nested groups, so ``findall`` yields tuples;
        # element 0 is the outermost group, i.e. the whole address.
        bitcoin = [groups[0] for groups in cls._BTC_PATTERN.findall(text)]
        return bitcoin + cls._ETHER_PATTERN.findall(text)

    def hydrate(self):
        """Populate all derived fields from ``content``.

        Sets ``outlinks``, ``hashtags``, ``cryptocurrency_addresses`` and
        ``detected_language``, then calls :meth:`hydrate_spacy` to set
        ``normalized_content`` and ``named_entities``. A ``content`` of
        ``None`` is treated like an empty string.
        """
        text = self.content or ""

        self.outlinks = self._find_outlinks(text)
        self.hashtags = self._HASHTAG_PATTERN.findall(text)
        self.cryptocurrency_addresses = self._find_cryptocurrency_addresses(text)

        # NOTE(review): ``DetectorFactory`` is imported by this module but never
        # seeded; langdetect documents its results as non-deterministic unless
        # ``DetectorFactory.seed`` is set - confirm whether that is intended.
        try:
            self.detected_language = detect(text)
        except LangDetectException:
            # Raised when langdetect cannot determine a language for the text.
            self.detected_language = ""

        self.hydrate_spacy()

    def hydrate_spacy(self):
        """Set ``normalized_content`` and ``named_entities`` using spaCy.

        When a model exists for ``detected_language``, the normalized content
        is the space-joined lemmas of all tokens that are neither stop words
        nor listed punctuation. Otherwise the multi-language model is used for
        named entities only and the normalized content is left empty.
        """
        language_models = {
            'en': nlp_en,
            'de': nlp_de,
            'it': nlp_it,
            'fr': nlp_fr,
            'ru': nlp_ru,
            'nl': nlp_nl,
        }

        nlp = language_models.get(self.detected_language)
        ner_only = nlp is None
        if ner_only:
            logger.info(f"No language model for {self.detected_language}")
            nlp = nlp_xx

        doc = nlp(self.content or "")

        if ner_only:
            self.normalized_content = ''
        else:
            lemmas = [
                token.lemma_
                for token in doc
                if not token.is_stop and token.lemma_ not in self._PUNCTUATION
            ]
            self.normalized_content = ' '.join(lemmas)

        self.named_entities = [{'text': ent.text, 'type': ent.label_} for ent in doc.ents]
@dataclass
class Media:
    """Base class for organizing information about a media file."""

    #: ID number of the media's corresponding scraped post in the ``raw_posts`` table.
    raw_id: int

    #: ID number of the media's corresponding transformed post.
    post: int

    #: URL the media file is downloaded from by :meth:`get_blob`
    #: (presumably the archived copy - TODO confirm).
    url: str

    #: Original URL of the media from the original post.
    original_url: str

    #: JSON dump of the dict containing metadata information for the media file.
    exif: str = None

    def get_blob(self):
        """Download the media file from ``url`` and return its content.

        Returns:
            The ``content`` attribute of the response returned by
            ``make_request`` (presumably the raw bytes - TODO confirm).
        """
        response = make_request(self.url)
        return response.content

    def hydrate(self, blob=None):
        """Extract data from the media file's content.

        Args:
            blob: Content of the media file. When ``None`` the file is
                downloaded first via :meth:`get_blob`.
        """
        if blob is None:
            blob = self.get_blob()

        self.hydrate_exif(blob)

    def hydrate_exif(self, blob):
        """Extract Exif metadata from a bytes blob and store it as JSON in ``exif``.

        Args:
            blob: Content of the media file.
        """
        with tempfile.NamedTemporaryFile() as temp_file:
            temp_file.write(blob)
            # The temporary file is buffered and exiftool reads it by name, so
            # the data has to be flushed to disk first; otherwise exiftool may
            # see an empty or truncated file.
            temp_file.flush()

            with exiftool.ExifTool() as et:
                exif = et.get_metadata(temp_file.name)

        self.exif = json.dumps(exif)
@dataclass
class Image(Media):
    """Class for organizing information about an image file."""

    #: Extracted OCR content from image
    ocr: str = None

    def hydrate(self, blob=None):
        """Extract Exif and OCR content from the image.

        Args:
            blob: Content of the image file. When ``None`` the file is
                downloaded first via ``get_blob``.
        """
        if blob is None:
            blob = self.get_blob()

        super().hydrate(blob)
        self.hydrate_ocr(blob)

    def hydrate_ocr(self, blob):
        """Extract OCR (optical character recognition) text from an image bytes blob.

        Args:
            blob: Content of the image file.
        """
        # ``import PIL`` on its own does not load the ``PIL.Image`` submodule;
        # import it explicitly instead of relying on another package having
        # imported it as a side effect.
        import PIL.Image

        # The context manager closes the image once OCR has finished.
        with PIL.Image.open(io.BytesIO(blob)) as image:
            self.ocr = pytesseract.image_to_string(image)
@dataclass
class Video(Media):
    """Class for organizing information about a video file."""
# Registry whose metadata collects every table defined in this module and
# through which the classes are mapped imperatively.
mapper_registry = registry()

# ``raw_posts``: one row per scraped post; mapped to ``ScraperResult``.
raw_posts_table = Table('raw_posts', mapper_registry.metadata,
                        Column('id', Integer, primary_key=True,
                               autoincrement=True),
                        Column('scraper', String),
                        Column('platform', String),
                        # Channel the post was scraped from.
                        Column('channel', Integer, ForeignKey('channels.id'), index=True),
                        Column('platform_id', String, index=True),
                        Column('date', DateTime, index=True),
                        Column('raw_data', String),
                        Column('date_archived', DateTime, index=True),
                        Column('archived_urls', JSON),
                        Column('media_archived', DateTime, index=True))
# ``raw_channel_info``: one row per scraped channel-info record; mapped to
# ``RawChannelInfo``.
raw_channel_info_table = Table('raw_channel_info', mapper_registry.metadata,
                               Column('id', Integer, primary_key=True),
                               Column('scraper', String),
                               Column('platform', String),
                               # Channel the record was scraped from.
                               Column('channel', Integer, ForeignKey('channels.id'), index=True),
                               Column('raw_data', String),
                               Column('date_archived', DateTime, index=True))
# ``channel_info``: transformed channel information; mapped to ``ChannelInfo``.
# NOTE(review): ``ChannelInfo`` also declares ``platform_id``, ``platform``,
# ``scraper`` and ``transformer``, which have no column here - confirm whether
# those fields are meant to be persisted.
channel_info_table = Table('channel_info', mapper_registry.metadata,
                           Column('id', Integer, primary_key=True, autoincrement=True),
                           # Raw record this row was derived from.
                           Column('raw_channel_info_id', Integer, ForeignKey('raw_channel_info.id'), index=True),
                           Column('channel', Integer, ForeignKey('channels.id'), index=True),
                           Column('screenname', String),
                           Column('name', String),
                           Column('description', String),
                           Column('description_url', String),
                           Column('description_location', String),
                           Column('followers', Integer),
                           Column('following', Integer),
                           Column('verified', Boolean),
                           Column('date_created', DateTime),
                           Column('date_archived', DateTime, index=True),
                           Column('date_transformed', DateTime, index=True)
                           )
# ``channels``: channels to be scraped; mapped to ``Channel``. Its ``id`` is
# the target of the ``channel`` foreign keys in the other tables.
channel_table = Table('channels', mapper_registry.metadata,
                      Column('id', Integer, primary_key=True, autoincrement=True),
                      Column('name', String),
                      Column('platform_id', String),
                      Column('category', String),
                      Column('platform', String),
                      Column('url', String),
                      Column('screenname', String),
                      Column('country', String),
                      Column('influencer', String),
                      Column('public', Boolean),
                      Column('chat', Boolean),
                      Column('notes', String),
                      Column('source', String)
                      )
# ``posts``: transformed posts; mapped to ``Post``.
post_table = Table('posts', mapper_registry.metadata,
                   Column('id', Integer, primary_key=True,
                          autoincrement=True),
                   # Raw scraped post this row was derived from.
                   Column('raw_id', Integer, ForeignKey('raw_posts.id'), index=True),
                   # NOTE(review): declared as Integer here, while ``Post.platform_id``
                   # is annotated ``str`` and ``raw_posts.platform_id`` is a String
                   # column - confirm the intended type (changing it needs a migration).
                   Column('platform_id', Integer, index=True),
                   Column('scraper', String),
                   Column('transformer', String),
                   Column('platform', String),
                   Column('channel', Integer, ForeignKey('channels.id'), index=True),
                   Column('date', DateTime, index=True),
                   Column('date_archived', DateTime, index=True),
                   Column('date_transformed', DateTime, index=True),
                   Column('url', String),
                   Column('author_id', String),
                   Column('author_username', String),
                   Column('content', String),
                   # Channel the post was forwarded or quoted from.
                   Column('forwarded_from', Integer, ForeignKey('channels.id'), index=True),
                   # Self-reference to the post being replied to; the foreign key
                   # is declared with ON DELETE CASCADE.
                   Column('reply_to', Integer, ForeignKey('posts.id', ondelete="CASCADE"), index=True),
                   # Derived fields, stored as JSON or text.
                   Column('named_entities', JSON),
                   Column('cryptocurrency_addresses', JSON),
                   Column('hashtags', JSON),
                   Column('outlinks', JSON),
                   Column('detected_language', String),
                   Column('normalized_content', String)
                   )
# ``media``: a single table shared by ``Media``, ``Image`` and ``Video``.
media_table = Table('media', mapper_registry.metadata,
                    Column('id', Integer, primary_key=True,
                           autoincrement=True),
                    # Discriminator column used as ``polymorphic_on`` by the mappers.
                    Column('type', String),
                    Column('raw_id', Integer, ForeignKey('raw_posts.id')),
                    Column('post', Integer, ForeignKey('posts.id')),
                    Column('url', String),
                    Column('original_url', String),
                    Column('exif', String),
                    # OCR text; the ``ocr`` attribute is declared on ``Image``.
                    Column('ocr', String))
# Imperatively map each dataclass onto its table.
mapper_registry.map_imperatively(Post, post_table)
mapper_registry.map_imperatively(Channel, channel_table)
mapper_registry.map_imperatively(ScraperResult, raw_posts_table)
mapper_registry.map_imperatively(RawChannelInfo, raw_channel_info_table)
mapper_registry.map_imperatively(ChannelInfo, channel_info_table)

# ``Media`` and its subclasses share ``media_table``; the ``type`` column holds
# the polymorphic identity ('media', 'image' or 'video'). The base class is
# mapped first because the subclass mappers inherit from it.
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')