refactored import structure

This commit is contained in:
Tristan Lee
2022-03-04 10:55:54 -06:00
parent 75240bb060
commit c21e43ddfa
16 changed files with 418 additions and 216 deletions

View File

@@ -1,13 +1,17 @@
from typing import Generator, Tuple
import cisticola.base
import requests
from typing import Generator, Tuple, List
import os
import boto3
from io import BytesIO
from urllib.parse import urlparse
import tempfile
import requests
import boto3
from loguru import logger
import ffmpeg
import tempfile
from sqlalchemy.orm import sessionmaker
from cisticola.base import Channel, ScraperResult, mapper_registry
class Scraper:
__version__ = "Scraper 0.0.0"
@@ -89,8 +93,77 @@ class Scraper:
return archived_url
def can_handle(self, channel: cisticola.base.Channel) -> bool:
def can_handle(self, channel: Channel) -> bool:
pass
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
pass
class ScraperController:
    """Registers scrapers, uses them to generate ScraperResults. Synchronizes
    everything with database via ORM."""

    def __init__(self):
        # Registered scrapers, consulted in registration order.
        self.scrapers = []
        # Session factory; stays None until connect_to_db() has been called.
        self.session = None
        # NOTE(review): never assigned or read elsewhere in this class as
        # shown; kept for backward compatibility.
        self.mapper_registry = None

    def register_scraper(self, scraper: Scraper):
        """Add a single scraper to the end of the registry."""
        self.scrapers.append(scraper)

    def register_scrapers(self, scraper: List[Scraper]):
        """Add several scrapers at once, preserving their order.

        The parameter keeps its singular name for backward compatibility
        with keyword callers; it is expected to be a list of scrapers.
        """
        self.scrapers.extend(scraper)

    def scrape_channels(self, channels: List[Channel]):
        """Scrape each channel with the first registered scraper that can
        handle it and store the resulting posts.

        For every channel, the newest stored ScraperResult (by ``date``) is
        passed to the scraper as ``since``. All posts yielded for a channel
        are added to one session and committed together. Channels without a
        matching scraper are logged and skipped. If no database session
        factory is configured, an error is logged and nothing is scraped.
        """
        if self.session is None:
            logger.error("No DB session")
            return

        for channel in channels:
            # Only the first scraper that claims the channel is used.
            scraper = next(
                (s for s in self.scrapers if s.can_handle(channel)), None)

            if scraper is None:
                logger.warning(f"No handler found for Channel {channel}")
                continue

            # One session per channel; it is always closed, even when the
            # scraper or the commit raises, so connections are not leaked.
            session = self.session()
            try:
                # Most recent stored post for this channel, or None.
                since = session.query(ScraperResult).where(
                    ScraperResult.channel == channel.id).order_by(
                    ScraperResult.date.desc()).first()

                added = 0
                for post in scraper.get_posts(channel, since=since):
                    session.add(post)
                    added += 1

                session.commit()
            finally:
                session.close()

            logger.info(
                f"{scraper} found {added} new posts from {channel}")

    def connect_to_db(self, engine):
        """Create the mapped tables on ``engine`` and bind a session factory
        to it, which enables scrape_channels()."""
        # create tables
        mapper_registry.metadata.create_all(bind=engine)

        self.session = sessionmaker(bind=engine)
class ETLController:
    """Placeholder for the component that will transform the raw_data tables
    into a format more conducive to analysis."""

    def __init__(self):
        """Nothing to set up yet."""