cisticola/cisticola/base.py

from typing import List
from dataclasses import dataclass, field
from datetime import datetime
import tempfile
import json
import io

from sqlalchemy.orm import registry
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean
import pytesseract
import PIL
import exiftool
import re
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
from loguru import logger
import spacy

from .utils import make_request

@dataclass
class ScraperResult:
    """A minimally processed result from a scraper
    """

    #: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
    scraper: str

    #: Name of platform from which result was scraped, e.g. ``"Twitter"``.
    platform: str

    #: Foreign key of channel ID that this was scraped from
    channel: int

    #: String that uniquely identifies the scraped post on the given platform, e.g. ``"1503397267675533313"``
    platform_id: str

    #: Datetime (relative to UTC) that the scraped post was created at.
    date: datetime

    #: JSON dump of dict that contains all data scraped for the post.
    raw_data: str

    #: Datetime (relative to UTC) that the scraped post was archived at.
    date_archived: datetime

    #: Dict in which the keys are the original media URLs from the post, and the corresponding values are the URLs of the archived media files.
    archived_urls: dict

    #: What date was the media archived? (None if not archived)
    media_archived: datetime

@dataclass
class Channel:
    """Information about a specific channel to be scraped.
    """

    #: Name of channel (different from username because it can be non-unique and contain emojis), e.g. ``T🕊Редакция Президент Гордон🕊"``.
    name: str

    #: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``.
    platform_id: str

    #: User-specified category for the channel, e.g. ``"explicit_qanon"``.
    category: str

    #: Name of platform the given channel is on, e.g. ``"Telegram"``.
    platform: str

    #: URL for the given channel on the platform, e.g. ``"https://t.me/prezidentgordonteam"``
    url: str

    #: Screen name/username of channel.
    screenname: str

    #: 2 digit country code for the country of origin for the channel, e.g. ``"RU"``.
    country: str = None

    #: Name of influencer, if channel belongs to an influencer that operates on multiple platforms.
    influencer: str = None

    #: Whether or not the channel is publicly-accessible.
    public: bool = None

    #: Whether or not the channel is a chat (i.e. allows users who are not the channel creator to post/message)
    chat: bool = None

    #: Any other additional notes about the channel.
    notes: str = ""

    #: Did the channel come from a researcher or a scraping process?
    source: str = None

    def hydrate(self):
        pass

@dataclass
class RawChannelInfo:
    """A minimally processed result from a scraper
    """

    #: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
    scraper: str

    #: Name of platform from which result was scraped, e.g. ``"Twitter"``.
    platform: str

    #: Foreign key of channel ID that this was scraped from
    channel: int

    #: JSON dump of dict that contains all data scraped for the post.
    raw_data: str

    #: Datetime (relative to UTC) that the scraped post was archived at.
    date_archived: datetime

@dataclass
class ChannelInfo:
    """A processed set of information about a channel.
    """

    # Foreign key from the raw_channel_info table
    raw_channel_info_id: int

    # Foreign ckey from the channels table
    channel: int

    # platform specific ID of the channel
    platform_id: str

    #: Name of platform from which result was scraped, e.g. ``"Twitter"``.
    platform: str

    #: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
    scraper: str

    #: String specifying name and version of transformer used to tranform result, e.g. ``"TwitterTransformer 0.0.1"``.
    transformer: str

    #: attributes extracted from the raw channel info object
    screenname: str
    name: str
    description: str
    description_url: str
    description_location: str
    followers: int
    following: int
    verified: bool
    date_created: datetime

    #: Datetime (relative to UTC) that the scraped channel info was archived at.
    date_archived: datetime

    #: Datetime (UTC) that the scraped channel info was transformed at.
    date_transformed: datetime

    def hydrate(self):
        pass

nlp_en = spacy.load('en_core_web_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
nlp_de = spacy.load('de_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
nlp_it = spacy.load('it_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
nlp_fr = spacy.load('fr_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
nlp_ru = spacy.load('ru_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
nlp_nl = spacy.load('nl_core_news_sm', disable=['parser', 'tok2vec', 'attribute_ruler'])
nlp_xx = spacy.load('xx_ent_wiki_sm')

@dataclass
class Post:
    """An object with fields for columns in the analysis table"""

    #: ID number of the scraped post in the ``raw_posts`` table
    raw_id: int

    #: Platform specific post ID
    platform_id: str

    #: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
    scraper: str

    #: String specifying name and version of transformer used to tranform result, e.g. ``"TwitterTransformer 0.0.1"``.
    transformer: str

    #: Name of platform from which result was scraped, e.g. ``"Twitter"``.
    platform: str

    #: User-specified integer that uniquely identifies a channel, e.g. ``15``.
    channel: int

    #: Datetime (relative to UTC) that the scraped post was created at.
    date: datetime

    #: Datetime (relative to UTC) that the scraped post was archived at.
    date_archived: datetime

    #: Datetime (UTC) that the scraped post was transformed at.
    date_transformed: datetime

    #: URL of the original post
    url: str

    #: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``.
    author_id: str

    #: Username of author who made post.
    author_username: str

    #: Text of the original post
    content: str

    #: Named entities detected in post
    named_entities: list = field(default_factory=list)

    #: Any cryptocurrency addresses in post
    cryptocurrency_addresses: list = field(default_factory=list)

    #: Hashtags in post
    hashtags: list = field(default_factory=list)

    #: Links to any other websites
    outlinks: list = field(default_factory=list)

    #: Detected language of post
    detected_language: str = ""

    #: Normalized post content
    normalized_content: str = ""

    #: The ID of the Channel that the post was forwarded or quoted from
    forwarded_from: int = None

    #: The ID of the Post that this Post is a reply to or reblog of
    reply_to: int = None

    def hydrate(self):
        URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""

        # replace is here in order to prevent catastrophic backtracking
        urls = re.findall(URL_REGEX, self.content.replace("::::::::", ""))
        self.outlinks = urls

        HASHTAG_REGEX = r"(?:^|\s)[＃#]{1}(\w+)"

        hashtags = re.findall(HASHTAG_REGEX, self.content)
        self.hashtags = hashtags

        # regex patterns for finding crypto addresses
        BTC_REGEX = r'\b(bc(0([ac-hj-np-z02-9]{39}|[ac-hj-np-z02-9]{59})|1[ac-hj-np-z02-9]{8,87})|[13][a-km-zA-HJ-NP-Z1-9]{25,35})\b'
        ETHER_REGEX = r'(0x[a-fA-F0-9]{40})'

        self.cryptocurrency_addresses = [m[0] for m in re.findall(BTC_REGEX, self.content)] + re.findall(ETHER_REGEX, self.content)

        try:
            self.detected_language = detect(self.content)
        except LangDetectException:
            self.detected_language = ""

        self.hydrate_spacy()

    def hydrate_spacy(self):
        ner_only = False

        if self.detected_language == 'en':
            nlp = nlp_en
        elif self.detected_language == 'de':
            nlp = nlp_de
        elif self.detected_language == 'it':
            nlp = nlp_it
        elif self.detected_language == 'fr':
            nlp = nlp_fr
        elif self.detected_language == 'ru':
            nlp = nlp_ru
        elif self.detected_language == 'nl':
            nlp = nlp_nl
        else:
            logger.info(f"No language model for {self.detected_language}")
            nlp = nlp_xx
            ner_only = True

        doc = nlp(self.content)

        if not ner_only:
            punctuation = ['?',':','!',',','.',';','|','(',')','--','#','=','+']
            tokens = [t.lemma_ for t in doc if not t.is_stop and t.lemma_ not in punctuation]
            self.normalized_content = ' '.join(tokens)
        else:
            self.normalized_content = ''

        self.named_entities = [{'text': ent.text, 'type': ent.label_} for ent in doc.ents]


@dataclass
class Media:
    """Base class for organizing information about a media file.
    """

    #: ID number of the media's corresponding scraped post in the ``raw_posts`` table.
    raw_id: int

    #: ID number of the media's corresponging scraped post in the ``analysis`` table.
    post: int

    #: URL of the original post.
    url: str

    #: Original URL of the media from the the original post.
    original_url: str

    #: JSON dump of the dict containing metadata information for the media file.
    exif: str = None

    def get_blob(self):
        """Download media file as bytes blob.
        """

        blob = make_request(self.url)
        return blob.content

    def hydrate(self, blob = None):
        """Download media file as bytes blob and extract data from content.
        """

        if blob is None:
            blob = self.get_blob()

        self.hydrate_exif(blob)

    def hydrate_exif(self, blob):
        """Extract Exif metadata from bytes blob.
        """

        with tempfile.NamedTemporaryFile() as temp_file:
            temp_file.write(blob)

            with exiftool.ExifTool() as et:
                exif = et.get_metadata(temp_file.name)
                self.exif = json.dumps(exif)

@dataclass
class Image(Media):
    """Class for organizing information about an image file.
    """

    #: Extracted OCR content from image
    ocr: str = None

    def hydrate(self, blob=None):
        """Download image file as bytes blob and extract Exif and OCR content
        from the image.
        """

        if blob is None:
            blob = self.get_blob()

        super().hydrate(blob)
        self.hydrate_ocr(blob)

    def hydrate_ocr(self, blob):
        """Extract OCR (optical character recognition) data from image bytes blob.
        """

        image = PIL.Image.open(io.BytesIO(blob))
        self.ocr = pytesseract.image_to_string(image)

@dataclass
class Video(Media):
    """Class for organizing information about an image file.
    """

    pass

mapper_registry = registry()

raw_posts_table = Table('raw_posts', mapper_registry.metadata,
                       Column('id', Integer, primary_key=True,
                              autoincrement=True),
                       Column('scraper', String),
                       Column('platform', String),
                       Column('channel', Integer, ForeignKey('channels.id'), index=True),
                       Column('platform_id', String, index=True),
                       Column('date', DateTime, index=True),
                       Column('raw_data', String),
                       Column('date_archived', DateTime, index=True),
                       Column('archived_urls', JSON),
                       Column('media_archived', DateTime, index=True))

raw_channel_info_table = Table('raw_channel_info', mapper_registry.metadata,
                    Column('id', Integer, primary_key=True),
                    Column('scraper', String),
                    Column('platform', String),
                    Column('channel', Integer, ForeignKey('channels.id'), index=True),
                    Column('raw_data', String),
                    Column('date_archived', DateTime, index=True))

channel_info_table = Table('channel_info', mapper_registry.metadata,
                    Column('id', Integer, primary_key=True, autoincrement=True),
                    Column('raw_channel_info_id', Integer, ForeignKey('raw_channel_info.id'), index=True),
                    Column('channel', Integer, ForeignKey('channels.id'), index=True),
                    Column('screenname', String),
                    Column('name', String),
                    Column('description', String),
                    Column('description_url', String),
                    Column('description_location', String),
                    Column('followers', Integer),
                    Column('following', Integer),
                    Column('verified', Boolean),
                    Column('date_created', DateTime),
                    Column('date_archived', DateTime, index=True),
                    Column('date_transformed', DateTime, index=True)
                    )

channel_table = Table('channels', mapper_registry.metadata,
                    Column('id', Integer, primary_key=True, autoincrement=True),
                    Column('name', String),
                    Column('platform_id', String),
                    Column('category', String),
                    Column('platform', String),
                    Column('url', String),
                    Column('screenname', String),
                    Column('country', String),
                    Column('influencer', String),
                    Column('public', Boolean),
                    Column('chat', Boolean),
                    Column('notes', String),
                    Column('source', String)
                    )

post_table = Table('posts', mapper_registry.metadata,
                       Column('id', Integer, primary_key=True,
                              autoincrement=True),
                       Column('raw_id', Integer, ForeignKey('raw_posts.id'), index=True),
                       Column('platform_id', Integer, index=True),
                       Column('scraper', String),
                       Column('transformer', String),
                       Column('platform', String),
                       Column('channel', Integer, ForeignKey('channels.id'), index=True),
                       Column('date', DateTime, index=True),
                       Column('date_archived', DateTime, index=True),
                       Column('date_transformed', DateTime, index=True),
                       Column('url', String),
                       Column('author_id', String),
                       Column('author_username', String),
                       Column('content', String),
                       Column('forwarded_from', Integer, ForeignKey('channels.id'), index=True),
                       Column('reply_to', Integer, ForeignKey('posts.id', ondelete="CASCADE"), index=True),
                       Column('named_entities', JSON),
                       Column('cryptocurrency_addresses', JSON),
                       Column('hashtags', JSON),
                       Column('outlinks', JSON),
                       Column('detected_language', String),
                       Column('normalized_content', String)
                       )

media_table = Table('media', mapper_registry.metadata,
                       Column('id', Integer, primary_key=True,
                              autoincrement=True),
                       Column('type', String),
                       Column('raw_id', Integer, ForeignKey('raw_posts.id')),
                       Column('post', Integer, ForeignKey('posts.id')),
                       Column('url', String),
                       Column('original_url', String),
                       Column('exif', String),
                       Column('ocr', String))

mapper_registry.map_imperatively(Post, post_table)
mapper_registry.map_imperatively(Channel, channel_table)
mapper_registry.map_imperatively(ScraperResult, raw_posts_table)
mapper_registry.map_imperatively(RawChannelInfo, raw_channel_info_table)
mapper_registry.map_imperatively(ChannelInfo, channel_info_table)
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')