From c5d49ef52158702a31057596901ff6ed31f5f90c Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Fri, 18 Feb 2022 14:14:25 +0100 Subject: [PATCH] Reorganize class definitions slightly --- cisticola/__init__.py | 14 ++++++-------- cisticola/base.py | 18 ------------------ cisticola/scraper/__init__.py | 18 ++++++++++++++++++ cisticola/scraper/twitter.py | 20 +++++++++++--------- cisticola/transformer/__init__.py | 17 ++++++++++++++++- test.py | 2 +- 6 files changed, 52 insertions(+), 37 deletions(-) diff --git a/cisticola/__init__.py b/cisticola/__init__.py index e462aee..115d143 100644 --- a/cisticola/__init__.py +++ b/cisticola/__init__.py @@ -1,9 +1,8 @@ from typing import List -from datetime import datetime -from dataclasses import dataclass import cisticola.scraper import cisticola.base from sqlalchemy.orm import sessionmaker +from loguru import logger class ScraperController: @@ -15,12 +14,12 @@ class ScraperController: self.session = None self.mapper_registry = None - def register_scraper(self, scraper: cisticola.base.Scraper): + def register_scraper(self, scraper: cisticola.scraper.Scraper): self.scrapers.append(scraper) def scrape_channels(self, channels: List[cisticola.base.Channel]): if self.session is None: - cisticola.base.logger.error("No DB session") + logger.error("No DB session") return for channel in channels: @@ -41,19 +40,18 @@ class ScraperController: posts = scraper.get_posts(channel, since=since) handled = True - cisticola.base.logger.info( + logger.info( f"{scraper} found {len(posts)} new posts from {channel}") break if not handled: - cisticola.base.logger.warning( - f"No handler found for Channel {channel}") + logger.warning(f"No handler found for Channel {channel}") session = self.session() session.bulk_save_objects(posts) session.commit() - cisticola.base.logger.info(f"Added {len(posts)} entries to database") + logger.info(f"Added {len(posts)} entries to database") def connect_to_db(self, engine): # create tables diff --git a/cisticola/base.py b/cisticola/base.py index 6641eee..7d03abe 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -1,9 +1,7 @@ from dataclasses import dataclass from datetime import datetime -from typing import List from sqlalchemy.orm import registry from sqlalchemy import Table, Column, Integer, String, DateTime -from loguru import logger mapper_registry = registry() @@ -33,22 +31,6 @@ raw_data_table = Table('raw_data', mapper_registry.metadata, mapper_registry.map_imperatively(ScraperResult, raw_data_table) -class Scraper: - __version__ = "Scraper 0.0.1" - - def __init__(self): - pass - - def __str__(self): - return self.__version__ - - def can_handle(self, channel) -> bool: - pass - - def get_posts(self, channel, since=None) -> List[ScraperResult]: - pass - - @dataclass class Channel: id: int diff --git a/cisticola/scraper/__init__.py b/cisticola/scraper/__init__.py index e69de29..0053198 100644 --- a/cisticola/scraper/__init__.py +++ b/cisticola/scraper/__init__.py @@ -0,0 +1,18 @@ +from typing import List +import cisticola.base + + +class Scraper: + __version__ = "Scraper 0.0.1" + + def __init__(self): + pass + + def __str__(self): + return self.__version__ + + def can_handle(self, channel: cisticola.base.Channel) -> bool: + pass + + def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]: + pass diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index 0025f73..4873e40 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -3,11 +3,13 @@ from datetime import datetime from typing import List import snscrape.modules -class TwitterScraper(cisticola.base.Scraper): + +class TwitterScraper(cisticola.scraper.Scraper): """An implementation of a Scraper for Twitter, using snscrape library""" __version__ = "TwitterScraper 0.0.1" - # TODO remove this, should be able to scrape from user ID alone + # TODO snscrape should be able to scrape from user ID alone, but there is + # currently a bug/other issue, so it is extracting the username from URL def get_username_from_url(url): username = url.split("twitter.com/")[1] if len(username.split("/")) > 1: @@ -15,7 +17,7 @@ class TwitterScraper(cisticola.base.Scraper): return username - def get_posts(self, channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]: + def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]: posts = [] scraper = snscrape.modules.twitter.TwitterUserScraper( TwitterScraper.get_username_from_url(channel.url)) @@ -25,12 +27,12 @@ class TwitterScraper(cisticola.base.Scraper): break posts.append(cisticola.base.ScraperResult(scraper=self.__version__, - platform="Twitter", - channel=channel.id, - platform_id=tweet.id, - date=tweet.date, - date_archived=datetime.now(), - raw_data=tweet.json())) + platform="Twitter", + channel=channel.id, + platform_id=tweet.id, + date=tweet.date, + date_archived=datetime.now(), + raw_data=tweet.json())) return posts diff --git a/cisticola/transformer/__init__.py b/cisticola/transformer/__init__.py index 82fc879..fcda068 100644 --- a/cisticola/transformer/__init__.py +++ b/cisticola/transformer/__init__.py @@ -1,3 +1,18 @@ +@dataclass +class TransformedResult: + """An object with fields for columns in the analysis table""" + scraper: str + transformer: str + platform: str + channel: str + date: datetime + date_archived: datetime + url: str + content: str + author_id: str + author_username: str + + class TwitterResult(ScraperResult): """A Twitter specific ScraperResult, with a method ETL/transforming""" @@ -16,4 +31,4 @@ class TwitterResult(ScraperResult): author_id=data['user']['id'], author_username=data['user']['username']) - return transformed \ No newline at end of file + return transformed diff --git a/test.py b/test.py index e134f9a..678bcdb 100644 --- a/test.py +++ b/test.py @@ -23,7 +23,7 @@ controller = cisticola.ScraperController() scraper = cisticola.scraper.twitter.TwitterScraper() controller.register_scraper(scraper) -engine = create_engine('sqlite:///test.db') +engine = create_engine('sqlite:///test2.db') controller.connect_to_db(engine) controller.scrape_channels(test_channels)