Files
cisticola/cisticola/base.py
2022-03-14 18:04:27 -05:00

151 lines
5.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from typing import List
from dataclasses import dataclass
from datetime import datetime
from sqlalchemy.orm import registry
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey
@dataclass
class ScraperResult:
"""A minimally processed result from a scraper
"""
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
scraper: str
#: Name of platform from which result was scraped, e.g. ``"Twitter"``.
platform: str
#TODO there is probably a way of making this a Channel object foreign key
#: User-specified integer that uniquely identifies a channel, e.g. ``15``.
channel: int
#: String that uniquely identifies the scraped post on the given platform, e.g. ``"1503397267675533313"``
platform_id: str
#: Datetime (relative to UTC) that the scraped post was created at.
date: datetime
#: JSON dump of dict that contains all data scraped for the post.
raw_data: str
#: Datetime (relative to UTC) that the scraped post was archived at.
date_archived: datetime
#: Dict in which the keys are the original media URLs from the post, and the corresponding values are the URLs of the archived media files.
archived_urls: dict
@dataclass
class Channel:
"""Information about a specific channel to be scraped.
"""
#: User-specified integer that uniquely identifies a channel, e.g. ``15``.
id: int
#: Name of channel (different from username because it can be non-unique and contain emojis), e.g. ``T🕊Редакция Президент Гордон🕊"``.
name: str
#: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``.
platform_id: str
#: User-specified category for the channel, e.g. ``"qanon-adjacent"``.
category: str
#: Number of followers the channel has on the given platform, e.e. ``"1465"``.
followers: int
#: Name of platform the given channel is on, e.g. ``"Telegram"``.
platform: str
#: URL for the given channel on the platform, e.g. ``"https://t.me/prezidentgordonteam"``
url: str
#: Screen name/username of channel.
screenname: str
#: 2 digit country code for the country of origin for the channel, e.g. ``"RU"``.
country: str
#: Name of influencer, if channel belongs to an influencer that operates on multiple platforms.
influencer: str
#: Whether or not the channel is publicly-accessible.
public: bool
#: Whether or not the channel is a chat (i.e. allows users who are not the channel creator to post/message)
chat: bool
#: Any other additional notes about the channel.
notes: str
@dataclass
class TransformedResult:
"""An object with fields for columns in the analysis table"""
#: ID number of the scraped post in the ``raw_data`` table
raw_id: int
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
scraper: str
#: String specifying name and version of transformer used to tranform result, e.g. ``"TwitterTransformer 0.0.1"``.
transformer: str
#: Name of platform from which result was scraped, e.g. ``"Twitter"``.
platform: str
#: User-specified integer that uniquely identifies a channel, e.g. ``15``.
channel: str
#: Datetime (relative to UTC) that the scraped post was created at.
date: datetime
#: Datetime (relative to UTC) that the scraped post was archived at.
date_archived: datetime
#: URL of the original post
url: str
#: Text of the original post
content: str
#: String that uniquely identifies the channel on the given platform, e.g. ``"-1001101170442"``.
author_id: str
#: Username of author who made post.
author_username: str
mapper_registry = registry()
raw_data_table = Table('raw_data', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('scraper', String),
Column('platform', String),
Column('channel', Integer),
Column('platform_id', String),
Column('date', DateTime),
Column('raw_data', String),
Column('date_archived', DateTime),
Column('archived_urls', JSON))
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
analysis_table = Table('analysis', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('raw_id', Integer, ForeignKey('raw_data.id')),
Column('scraper', String),
Column('transformer', String),
Column('platform', String),
Column('channel', String),
Column('date', DateTime),
Column('date_archived', DateTime),
Column('url', String),
Column('content', String),
Column('author_id', String),
Column('author_username', String)
)
mapper_registry.map_imperatively(TransformedResult, analysis_table)