mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-13 05:48:33 +03:00
refactored import structure
This commit is contained in:
@@ -1,13 +1,17 @@
|
||||
from typing import Generator, Tuple
|
||||
import cisticola.base
|
||||
import requests
|
||||
from typing import Generator, Tuple, List
|
||||
import os
|
||||
import boto3
|
||||
from io import BytesIO
|
||||
from urllib.parse import urlparse
|
||||
import tempfile
|
||||
|
||||
import requests
|
||||
import boto3
|
||||
from loguru import logger
|
||||
import ffmpeg
|
||||
import tempfile
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
from cisticola.base import Channel, ScraperResult, mapper_registry
|
||||
|
||||
class Scraper:
|
||||
__version__ = "Scraper 0.0.0"
|
||||
|
||||
@@ -89,8 +93,77 @@ class Scraper:
|
||||
|
||||
return archived_url
|
||||
|
||||
def can_handle(self, channel: cisticola.base.Channel) -> bool:
|
||||
def can_handle(self, channel: Channel) -> bool:
|
||||
pass
|
||||
|
||||
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None) -> Generator[ScraperResult, None, None]:
|
||||
pass
|
||||
|
||||
|
||||
class ScraperController:
    """Registers scrapers, uses them to generate ScraperResults. Synchronizes
    everything with database via ORM."""

    def __init__(self):
        # Scrapers are consulted in registration order by scrape_channels().
        self.scrapers = []
        # Session factory; stays None until connect_to_db() has been called.
        self.session = None
        self.mapper_registry = None

    def register_scraper(self, scraper: Scraper):
        """Append a single scraper to the list of registered scrapers."""
        self.scrapers.append(scraper)

    def register_scrapers(self, scraper: List[Scraper]):
        """Append every scraper in the given list to the registered scrapers."""
        self.scrapers.extend(scraper)

    def scrape_channels(self, channels: List[Channel]):
        """Scrape each channel with the first registered scraper that can
        handle it and store the resulting posts in the database.

        Logs an error and returns without scraping if connect_to_db() has not
        been called. Logs a warning for every channel that no registered
        scraper can handle.
        """
        if self.session is None:
            logger.error("No DB session")
            return

        for channel in channels:
            # Only the first scraper that claims the channel is used.
            scraper = next(
                (s for s in self.scrapers if s.can_handle(channel)), None)

            if scraper is None:
                logger.warning(f"No handler found for Channel {channel}")
                continue

            self._scrape_channel(scraper, channel)

    def _scrape_channel(self, scraper, channel):
        """Fetch new posts for one channel with one scraper and commit them."""
        # Exactly one session per channel; the original opened two and
        # never closed either of them.
        session = self.session()
        try:
            # Most recent stored post for this channel, or None if there is
            # none yet; handed to the scraper as its `since` argument.
            since = session.query(ScraperResult).where(
                ScraperResult.channel == channel.id).order_by(
                    ScraperResult.date.desc()).first()

            added = 0
            for post in scraper.get_posts(channel, since=since):
                session.add(post)
                added += 1

            session.commit()
            logger.info(
                f"{scraper} found {added} new posts from {channel}")
        finally:
            # Always release the session, even when the scraper or the commit
            # raises; closing also discards any uncommitted transaction.
            session.close()

    def connect_to_db(self, engine):
        """Create the mapped tables on `engine` and bind a session factory
        to it."""
        # create tables
        mapper_registry.metadata.create_all(bind=engine)

        self.session = sessionmaker()
        self.session.configure(bind=engine)
|
||||
|
||||
|
||||
class ETLController:
    """Transforms the raw_data tables into a format more conducive to analysis.

    Currently a placeholder: the constructor takes no arguments and sets no
    state on the instance.
    """

    def __init__(self):
        """Create the controller; nothing is initialised yet."""
        return None
|
||||
|
||||
Reference in New Issue
Block a user