Reorganize class definitions slightly

This commit is contained in:
Logan Williams
2022-02-18 14:14:25 +01:00
parent 82ad210b8e
commit c5d49ef521
6 changed files with 52 additions and 37 deletions

View File

@@ -1,9 +1,8 @@
from typing import List
from datetime import datetime
from dataclasses import dataclass
import cisticola.scraper
import cisticola.base
from sqlalchemy.orm import sessionmaker
from loguru import logger
class ScraperController:
@@ -15,12 +14,12 @@ class ScraperController:
self.session = None
self.mapper_registry = None
def register_scraper(self, scraper: cisticola.base.Scraper):
def register_scraper(self, scraper: cisticola.scraper.Scraper):
self.scrapers.append(scraper)
def scrape_channels(self, channels: List[cisticola.base.Channel]):
if self.session is None:
cisticola.base.logger.error("No DB session")
logger.error("No DB session")
return
for channel in channels:
@@ -41,19 +40,18 @@ class ScraperController:
posts = scraper.get_posts(channel, since=since)
handled = True
cisticola.base.logger.info(
logger.info(
f"{scraper} found {len(posts)} new posts from {channel}")
break
if not handled:
cisticola.base.logger.warning(
f"No handler found for Channel {channel}")
logger.warning(f"No handler found for Channel {channel}")
session = self.session()
session.bulk_save_objects(posts)
session.commit()
cisticola.base.logger.info(f"Added {len(posts)} entries to database")
logger.info(f"Added {len(posts)} entries to database")
def connect_to_db(self, engine):
# create tables

View File

@@ -1,9 +1,7 @@
from dataclasses import dataclass
from datetime import datetime
from typing import List
from sqlalchemy.orm import registry
from sqlalchemy import Table, Column, Integer, String, DateTime
from loguru import logger
mapper_registry = registry()
@@ -33,22 +31,6 @@ raw_data_table = Table('raw_data', mapper_registry.metadata,
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
class Scraper:
    """Common interface for platform scrapers.

    The base class only fixes the method names and signatures; both
    ``can_handle`` and ``get_posts`` are empty stubs here and return
    ``None`` when called directly.
    """

    # Identifier reported by ``__str__``.
    __version__ = "Scraper 0.0.1"

    def __init__(self):
        """Create a scraper; the base class stores no state."""

    def __str__(self):
        """Return the scraper's ``__version__`` string."""
        return self.__version__

    def can_handle(self, channel) -> bool:
        """Report whether this scraper can process ``channel``.

        Stub: the base implementation has no body and returns ``None``.
        """

    def get_posts(self, channel, since=None) -> List[ScraperResult]:
        """Fetch posts for ``channel``, optionally bounded by ``since``.

        Stub: the base implementation has no body and returns ``None``.
        """
@dataclass
class Channel:
id: int

View File

@@ -0,0 +1,18 @@
from typing import List
import cisticola.base
class Scraper:
    """Base interface that every platform scraper derives from.

    Only the method names and signatures are defined here. ``can_handle``
    and ``get_posts`` are empty stubs that return ``None`` unless a
    subclass overrides them.
    """

    # Identifier reported by ``__str__``.
    __version__ = "Scraper 0.0.1"

    def __init__(self):
        """Create a scraper; the base class stores no state."""

    def __str__(self):
        """Return the scraper's ``__version__`` string."""
        return self.__version__

    def can_handle(self, channel: cisticola.base.Channel) -> bool:
        """Report whether this scraper can process ``channel``.

        Stub: the base implementation has no body and returns ``None``.
        """

    def get_posts(
        self,
        channel: cisticola.base.Channel,
        since: cisticola.base.ScraperResult = None,
    ) -> List[cisticola.base.ScraperResult]:
        """Fetch posts for ``channel``.

        ``since`` defaults to ``None``; when given it is a previously
        stored ``ScraperResult`` (presumably marking where to stop --
        confirm against the concrete scrapers).

        Stub: the base implementation has no body and returns ``None``.
        """

View File

@@ -3,11 +3,13 @@ from datetime import datetime
from typing import List
import snscrape.modules
class TwitterScraper(cisticola.base.Scraper):
class TwitterScraper(cisticola.scraper.Scraper):
"""An implementation of a Scraper for Twitter, using snscrape library"""
__version__ = "TwitterScraper 0.0.1"
# TODO remove this, should be able to scrape from user ID alone
# TODO snscrape should be able to scrape from user ID alone, but there is
# currently a bug/other issue, so it is extracting the username from URL
def get_username_from_url(url):
username = url.split("twitter.com/")[1]
if len(username.split("/")) > 1:
@@ -15,7 +17,7 @@ class TwitterScraper(cisticola.base.Scraper):
return username
def get_posts(self, channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]:
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]:
posts = []
scraper = snscrape.modules.twitter.TwitterUserScraper(
TwitterScraper.get_username_from_url(channel.url))
@@ -25,12 +27,12 @@ class TwitterScraper(cisticola.base.Scraper):
break
posts.append(cisticola.base.ScraperResult(scraper=self.__version__,
platform="Twitter",
channel=channel.id,
platform_id=tweet.id,
date=tweet.date,
date_archived=datetime.now(),
raw_data=tweet.json()))
platform="Twitter",
channel=channel.id,
platform_id=tweet.id,
date=tweet.date,
date_archived=datetime.now(),
raw_data=tweet.json()))
return posts

View File

@@ -1,3 +1,18 @@
# The class below is the first statement of this module, so the names it
# relies on must be imported here or the module fails with NameError.
from dataclasses import dataclass
from datetime import datetime


@dataclass
class TransformedResult:
    """An object with fields for columns in the analysis table.

    Instances are plain records; ``@dataclass`` generates ``__init__``,
    ``__repr__`` and ``__eq__`` from the annotated fields below.
    """

    # Provenance of the row (presumably version strings of the scraper
    # and transformer that produced it -- confirm against callers).
    scraper: str
    transformer: str

    # Where the content came from.
    platform: str
    channel: str

    # ``date`` and ``date_archived`` are both datetimes (presumably the
    # original post time and the archive time -- confirm against callers).
    date: datetime
    date_archived: datetime

    # The content itself and its location.
    url: str
    content: str

    # Author identity as reported by the platform.
    author_id: str
    author_username: str
class TwitterResult(ScraperResult):
"""A Twitter specific ScraperResult, with a method ETL/transforming"""
@@ -16,4 +31,4 @@ class TwitterResult(ScraperResult):
author_id=data['user']['id'],
author_username=data['user']['username'])
return transformed
return transformed

View File

@@ -23,7 +23,7 @@ controller = cisticola.ScraperController()
scraper = cisticola.scraper.twitter.TwitterScraper()
controller.register_scraper(scraper)
engine = create_engine('sqlite:///test.db')
engine = create_engine('sqlite:///test2.db')
controller.connect_to_db(engine)
controller.scrape_channels(test_channels)