commit 82ad210b8e3e10eb97fdd03ba0d44799a2a94051
Author: Logan Williams
Date:   Fri Feb 18 14:01:49 2022 +0100

    Initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..35ddc38
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.DS_Store
+*.pyc
+*.ipynb
+*.db
diff --git a/cisticola/__init__.py b/cisticola/__init__.py
new file mode 100644
index 0000000..e462aee
--- /dev/null
+++ b/cisticola/__init__.py
@@ -0,0 +1,76 @@
+from typing import List
+from datetime import datetime
+from dataclasses import dataclass
+import cisticola.scraper
+import cisticola.base
+from sqlalchemy.orm import sessionmaker
+
+
+class ScraperController:
+    """Registers scrapers, uses them to generate ScraperResults. Synchronizes
+    everything with database via ORM."""
+
+    def __init__(self):
+        self.scrapers = []
+        self.session = None
+        self.mapper_registry = None
+
+    def register_scraper(self, scraper: cisticola.base.Scraper):
+        """Add a scraper to the list consulted by scrape_channels()."""
+        self.scrapers.append(scraper)
+
+    def scrape_channels(self, channels: List[cisticola.base.Channel]):
+        """Scrape each channel with the first registered scraper that can
+        handle it and store the new posts in the database.
+
+        Channels without a matching scraper are logged and skipped. Posts are
+        committed one channel at a time."""
+        if self.session is None:
+            cisticola.base.logger.error("No DB session")
+            return
+
+        for channel in channels:
+            scraper = next(
+                (s for s in self.scrapers if s.can_handle(channel)), None)
+
+            if scraper is None:
+                cisticola.base.logger.warning(
+                    f"No handler found for Channel {channel}")
+                continue
+
+            session = self.session()
+            try:
+                # Newest stored post of this channel only. The channel column
+                # is a String, so compare against the id as text.
+                since = session.query(cisticola.base.ScraperResult).filter(
+                    cisticola.base.ScraperResult.channel == str(channel.id)
+                ).order_by(
+                    cisticola.base.ScraperResult.date.desc()
+                ).first()
+
+                posts = scraper.get_posts(channel, since=since)
+                cisticola.base.logger.info(
+                    f"{scraper} found {len(posts)} new posts from {channel}")
+
+                session.bulk_save_objects(posts)
+                session.commit()
+            finally:
+                session.close()
+
+            cisticola.base.logger.info(
+                f"Added {len(posts)} entries to database")
+
+    def connect_to_db(self, engine):
+        """Create the tables on `engine` and bind the session factory to
+        it."""
+        cisticola.base.mapper_registry.metadata.create_all(bind=engine)
+
+        self.session = sessionmaker()
+        self.session.configure(bind=engine)
+
+
+class ETLController:
+    """This class will transform the raw_data tables into a format more conducive to analysis."""
+
+    def __init__(self):
+        pass
diff --git a/cisticola/base.py b/cisticola/base.py
new file mode 100644
index 0000000..6641eee
--- /dev/null
+++ b/cisticola/base.py
@@ -0,0 +1,70 @@
+from dataclasses import dataclass
+from datetime import datetime
+from typing import List
+from sqlalchemy.orm import registry
+from sqlalchemy import Table, Column, Integer, String, DateTime
+from loguru import logger
+
+mapper_registry = registry()
+
+
+@dataclass
+class ScraperResult:
+    """A minimally processed result from a scraper"""
+    scraper: str
+    platform: str
+    channel: str
+    platform_id: str
+    date: datetime
+    raw_data: str
+    date_archived: datetime
+
+
+raw_data_table = Table('raw_data', mapper_registry.metadata,
+                       Column('id', Integer, primary_key=True),
+                       Column('scraper', String),
+                       Column('platform', String),
+                       Column('channel', String),
+                       Column('platform_id', String),
+                       Column('date', DateTime),
+                       Column('raw_data', String),
+                       Column('date_archived', DateTime))
+
+mapper_registry.map_imperatively(ScraperResult, raw_data_table)
+
+
+class Scraper:
+    """Base class for platform scrapers; subclasses override can_handle()
+    and get_posts()."""
+    __version__ = "Scraper 0.0.1"
+
+    def __init__(self):
+        pass
+
+    def __str__(self):
+        return self.__version__
+
+    def can_handle(self, channel) -> bool:
+        """Return True if this scraper can scrape `channel`."""
+        return False
+
+    def get_posts(self, channel, since=None) -> List[ScraperResult]:
+        """Return the posts of `channel` that are newer than `since`."""
+        raise NotImplementedError
+
+
+@dataclass
+class Channel:
+    """A channel on a platform that can be handed to a Scraper"""
+    id: int
+    name: str
+    platform_id: str
+    category: str
+    followers: int
+    platform: str
+    url: str
+    country: str
+    influencer: str
+    public: bool
+    chat: bool
+    notes: str
diff --git a/cisticola/scraper/__init__.py b/cisticola/scraper/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py
new file mode 100644
index 0000000..0025f73
--- /dev/null
+++ b/cisticola/scraper/twitter.py
@@ -0,0 +1,50 @@
+import cisticola.base
+from datetime import datetime
+from typing import List
+import snscrape.modules
+
+class TwitterScraper(cisticola.base.Scraper):
+    """An implementation of a Scraper for Twitter, using snscrape library"""
+    __version__ = "TwitterScraper 0.0.1"
+
+    # TODO remove this, should be able to scrape from user ID alone
+    @staticmethod
+    def get_username_from_url(url):
+        """Return the username from a profile URL of the form
+        https://twitter.com/USERNAME, or None if `url` is not one."""
+        _, separator, path = url.partition("twitter.com/")
+        # Tolerate a trailing slash and a query string or fragment.
+        username = path.split("?")[0].split("#")[0].rstrip("/")
+        if not separator or not username or "/" in username:
+            return None
+
+        return username
+
+    def get_posts(self, channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]:
+        """Return tweets of `channel` as ScraperResults, stopping at the first
+        tweet whose id is not greater than that of `since` (this relies on
+        snscrape yielding the newest tweets first)."""
+        posts = []
+        scraper = snscrape.modules.twitter.TwitterUserScraper(
+            TwitterScraper.get_username_from_url(channel.url))
+
+        for tweet in scraper.get_items():
+            if since is not None and tweet.id <= int(since.platform_id):
+                break
+
+            # channel and platform_id are declared as str and stored in
+            # String columns, so convert the integer ids explicitly.
+            posts.append(cisticola.base.ScraperResult(scraper=self.__version__,
+                                                      platform="Twitter",
+                                                      channel=str(channel.id),
+                                                      platform_id=str(tweet.id),
+                                                      date=tweet.date,
+                                                      date_archived=datetime.now(),
+                                                      raw_data=tweet.json()))
+
+        return posts
+
+    def can_handle(self, channel):
+        """Return True for Twitter channels whose URL yields a username."""
+        return (channel.platform == "Twitter" and
+                TwitterScraper.get_username_from_url(channel.url) is not None)
diff --git a/cisticola/transformer/__init__.py b/cisticola/transformer/__init__.py
new file mode 100644
index 0000000..82fc879
--- /dev/null
+++ b/cisticola/transformer/__init__.py
@@ -0,0 +1,44 @@
+import json
+from dataclasses import dataclass
+from datetime import datetime
+
+from cisticola.base import ScraperResult
+
+
+@dataclass
+class TransformedResult:
+    """A ScraperResult whose raw_data has been parsed into common fields"""
+    scraper: str
+    transformer: str
+    platform: str
+    channel: str
+    date: datetime
+    date_archived: datetime
+    url: str
+    content: str
+    # NOTE(review): presumably numeric in snscrape's JSON -- confirm
+    author_id: int
+    author_username: str
+
+
+class TwitterResult(ScraperResult):
+    """A Twitter specific ScraperResult, with a method ETL/transforming"""
+    __version__ = "TwitterResult 0.0.1"
+
+    def transform(self) -> TransformedResult:
+        """Parse the raw tweet JSON into a TransformedResult."""
+        data = json.loads(self.raw_data)
+
+        transformed = TransformedResult(
+            scraper=self.scraper,
+            transformer=self.__version__,
+            platform=self.platform,
+            channel=self.channel,
+            date=self.date,
+            date_archived=self.date_archived,
+            url=data['url'],
+            content=data['content'],
+            author_id=data['user']['id'],
+            author_username=data['user']['username'])
+
+        return transformed
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..e134f9a
--- /dev/null
+++ b/test.py
@@ -0,0 +1,29 @@
+# TODO/TODECIDE:
+# should 'username' be a part of the Channel definition somehow?
+# still need to do some planning for handling media
+
+import cisticola
+import cisticola.scraper.twitter
+from sqlalchemy import create_engine
+
+
+test_channels = [cisticola.base.Channel(id=0, name="Logan Williams (test)", platform_id="891729132",
+                                        category="test", followers=None, platform="Twitter",
+                                        url="https://twitter.com/obtusatum", country="US",
+                                        influencer=None, public=True, chat=False,
+                                        notes=""),
+                 cisticola.base.Channel(id=1, name="JQHN SPARTAN", platform_id="-1001181961026",
+                                        category="qanon", followers=None, platform="Telegram",
+                                        url="https://t.me/jqhnspartan", country="FR",
+                                        influencer="JQNH SPARTAN", public=True, chat=False, notes="")]
+
+
+controller = cisticola.ScraperController()
+
+scraper = cisticola.scraper.twitter.TwitterScraper()
+controller.register_scraper(scraper)
+
+engine = create_engine('sqlite:///test.db')
+controller.connect_to_db(engine)
+
+controller.scrape_channels(test_channels)