cisticola/cisticola/base.py

from typing import List
from dataclasses import dataclass
from datetime import datetime
import tempfile
import json
import io

from sqlalchemy.orm import registry
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey, Boolean
import pytesseract
import PIL
import exiftool

from .utils import make_request

@dataclass
class ScraperResult:
    """Raw, minimally processed output of a scraper for a single post."""

    #: Name and version of the scraper that produced this result, e.g. ``"TwitterScraper 0.0.1"``.
    scraper: str

    #: Platform the result was scraped from, e.g. ``"Twitter"``.
    platform: str

    #: Foreign key: ID of the channel this result was scraped from.
    channel: int

    #: Identifier that is unique for the post on its platform, e.g. ``"1503397267675533313"``.
    platform_id: str

    #: Creation time of the scraped post (relative to UTC).
    date: datetime

    #: JSON dump of the dict holding everything that was scraped for the post.
    raw_data: str

    #: Time at which the scraped post was archived (relative to UTC).
    date_archived: datetime

    #: Mapping from each original media URL in the post to the URL of its archived copy.
    archived_urls: dict

    #: Whether the media belonging to this post has been archived.
    media_archived: bool

@dataclass
class Channel:
    """Description of a single channel that is to be scraped."""

    #: Display name of the channel. Unlike the username it need not be unique and may contain emojis, e.g. ``"T🕊Редакция Президент Гордон🕊"``.
    name: str

    #: Identifier that is unique for the channel on its platform, e.g. ``"-1001101170442"``.
    platform_id: str

    #: Category assigned to the channel by the user, e.g. ``"explicit_qanon"``.
    category: str

    #: Platform that hosts the channel, e.g. ``"Telegram"``.
    platform: str

    #: URL of the channel on its platform, e.g. ``"https://t.me/prezidentgordonteam"``.
    url: str

    #: Screen name/username of the channel.
    screenname: str

    #: 2 digit country code of the channel's country of origin, e.g. ``"RU"``.
    country: str = None

    #: Influencer's name, when the channel belongs to an influencer active on multiple platforms.
    influencer: str = None

    #: Whether the channel is publicly accessible.
    public: bool = None

    #: Whether the channel is a chat, i.e. users other than the channel creator may post/message.
    chat: bool = None

    #: Free-form additional notes about the channel.
    notes: str = ""

    #: Where the channel came from: a researcher or a scraping process.
    source: str = None

    def hydrate(self):
        """Do nothing; placeholder that always returns ``None``."""
        return None

@dataclass
class Post:
    """Record whose fields correspond to the columns of the analysis table."""

    #: ID of the corresponding scraped post in the ``raw_data`` table.
    raw_id: int

    #: Post ID as assigned by the platform.
    platform_id: str

    #: Name and version of the scraper that produced the result, e.g. ``"TwitterScraper 0.0.1"``.
    scraper: str

    #: Name and version of the transformer used to transform the result, e.g. ``"TwitterTransformer 0.0.1"``.
    transformer: str

    #: Platform the result was scraped from, e.g. ``"Twitter"``.
    platform: str

    #: User-specified integer that uniquely identifies a channel, e.g. ``15``.
    channel: int

    #: Creation time of the scraped post (relative to UTC).
    date: datetime

    #: Time at which the scraped post was archived (relative to UTC).
    date_archived: datetime

    #: URL of the original post.
    url: str

    #: Identifier that is unique for the post's author on the given platform, e.g. ``"-1001101170442"``.
    author_id: str

    #: Username of the author who made the post.
    author_username: str

    #: Text of the original post.
    content: str

    #: ID of the Channel that the post was forwarded or quoted from.
    forwarded_from: int = None

    #: ID of the Post that this Post is a reply to or a reblog of.
    reply_to: int = None

    def hydrate(self):
        """Do nothing; placeholder that always returns ``None``."""
        return None


@dataclass
class Media:
    """Base class for organizing information about a media file.
    """

    #: ID number of the media's corresponding scraped post in the ``raw_data`` table.
    raw_id: int

    #: ID number of the media's corresponding scraped post in the ``analysis`` table.
    post: int

    #: URL of the original post. NOTE(review): ``get_blob`` downloads the media
    #: from this URL, so it presumably points at the media file itself — confirm.
    url: str

    #: Original URL of the media from the original post.
    original_url: str

    #: JSON dump of the dict containing metadata information for the media file.
    exif: str = None

    def get_blob(self):
        """Download media file as bytes blob.

        Requests ``self.url`` through ``make_request`` and returns the
        ``content`` attribute of the response.
        """

        response = make_request(self.url)
        return response.content

    def hydrate(self, blob = None):
        """Download media file as bytes blob and extract data from content.

        :param blob: Content of the media file. When ``None`` the content is
            fetched with :meth:`get_blob`.
        """

        if blob is None:
            blob = self.get_blob()

        self.hydrate_exif(blob)

    def hydrate_exif(self, blob):
        """Extract Exif metadata from bytes blob.

        The blob is written to a temporary file, the metadata is read from
        that file with exiftool, and the result is stored in ``self.exif`` as
        a JSON string.

        :param blob: Content of the media file as bytes.
        """

        with tempfile.NamedTemporaryFile() as temp_file:
            temp_file.write(blob)
            # exiftool is handed the file *name* and reads the file on its
            # own, so the buffered bytes must be pushed to disk first;
            # otherwise it can see a truncated or empty file.
            temp_file.flush()

            with exiftool.ExifTool() as et:
                exif = et.get_metadata(temp_file.name)
                self.exif = json.dumps(exif)

@dataclass
class Image(Media):
    """Class for organizing information about an image file.
    """

    #: Extracted OCR content from image
    ocr: str = None

    def hydrate(self, blob=None):
        """Download image file as bytes blob and extract Exif and OCR content
        from the image.

        :param blob: Content of the image file. When ``None`` the content is
            fetched with ``get_blob``.
        """

        if blob is None:
            blob = self.get_blob()

        super().hydrate(blob)
        self.hydrate_ocr(blob)

    def hydrate_ocr(self, blob):
        """Extract OCR (optical character recognition) data from image bytes blob.

        The recognized text is stored in ``self.ocr``.

        :param blob: Content of the image file as bytes.
        """

        # Import the submodule explicitly: a bare ``import PIL`` does not load
        # ``PIL.Image``, so ``PIL.Image.open`` only works when some other
        # module happens to have imported it already. The alias avoids
        # confusion with this class, which is also named ``Image``.
        from PIL import Image as pil_image

        with pil_image.open(io.BytesIO(blob)) as image:
            self.ocr = pytesseract.image_to_string(image)

@dataclass
class Video(Media):
    """Class for organizing information about a video file.

    Adds no fields or behavior of its own beyond what ``Media`` provides.
    """

# Registry whose ``metadata`` collects the tables defined below and which is
# used to imperatively map the dataclasses in this module onto those tables.
mapper_registry = registry()

# ``raw_data`` table: one row per scraped result (mapped to ``ScraperResult``).
raw_data_table = Table(
    'raw_data',
    mapper_registry.metadata,
    Column('id', Integer, primary_key=True, autoincrement=True),
    Column('scraper', String),
    Column('platform', String),
    Column('channel', Integer, ForeignKey('channels.id')),
    Column('platform_id', String),
    Column('date', DateTime),
    Column('raw_data', String),
    Column('date_archived', DateTime),
    Column('archived_urls', JSON),
    Column('media_archived', Boolean),
)

# ``channels`` table: one row per channel to be scraped (mapped to ``Channel``).
channel_table = Table(
    'channels',
    mapper_registry.metadata,
    Column('id', Integer, primary_key=True, autoincrement=True),
    Column('name', String),
    # Stored as a string to agree with ``Channel.platform_id: str`` and with
    # ``raw_data.platform_id``; platform IDs such as "-1001101170442" are not
    # guaranteed to be numeric or to fit an INTEGER column on every backend.
    # NOTE(review): databases created with the previous Integer column need a
    # migration.
    Column('platform_id', String),
    Column('category', String),
    Column('platform', String),
    Column('url', String),
    Column('screenname', String),
    Column('country', String),
    Column('influencer', String),
    Column('public', Boolean),
    Column('chat', Boolean),
    Column('notes', String),
    Column('source', String),
)

# ``posts`` table: one row per transformed post (mapped to ``Post``).
post_table = Table(
    'posts',
    mapper_registry.metadata,
    Column('id', Integer, primary_key=True, autoincrement=True),
    Column('raw_id', Integer, ForeignKey('raw_data.id')),
    # Stored as a string to agree with ``Post.platform_id: str`` and with
    # ``raw_data.platform_id``; platform IDs such as "1503397267675533313"
    # are not guaranteed to be numeric or to fit an INTEGER column on every
    # backend. NOTE(review): databases created with the previous Integer
    # column need a migration.
    Column('platform_id', String),
    Column('scraper', String),
    Column('transformer', String),
    Column('platform', String),
    Column('channel', Integer, ForeignKey('channels.id')),
    Column('date', DateTime),
    Column('date_archived', DateTime),
    Column('url', String),
    Column('author_id', String),
    Column('author_username', String),
    Column('content', String),
    Column('forwarded_from', Integer, ForeignKey('channels.id')),
    # Self-referential: points at another row of this same table.
    Column('reply_to', Integer, ForeignKey('posts.id')),
)

# ``media`` table: shared by ``Media``, ``Image`` and ``Video``; the ``type``
# column is the discriminator used by the mappings at the end of this module.
media_table = Table(
    'media',
    mapper_registry.metadata,
    Column('id', Integer, primary_key=True, autoincrement=True),
    Column('type', String),
    Column('raw_id', Integer, ForeignKey('raw_data.id')),
    Column('post', Integer, ForeignKey('posts.id')),
    Column('url', String),
    Column('original_url', String),
    Column('exif', String),
    Column('ocr', String),
)

# Imperatively map each dataclass onto its table.
mapper_registry.map_imperatively(Post, post_table)
mapper_registry.map_imperatively(Channel, channel_table)
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
# ``Media`` is mapped first because ``Image`` and ``Video`` inherit from its
# mapping; all three use ``media_table`` and are told apart by the ``type``
# column ('media' / 'image' / 'video').
# NOTE(review): the subclasses pass ``media_table`` again together with
# ``inherits=Media`` — presumably single-table inheritance is intended;
# confirm against the SQLAlchemy docs that this is the right way to declare it.
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')