Initial commit

This commit is contained in:
Logan Williams
2022-02-18 14:01:49 +01:00
commit 82ad210b8e
7 changed files with 226 additions and 0 deletions

4
.gitignore vendored Normal file
View File

@@ -0,0 +1,4 @@
.DS_Store
*.pyc
*.ipynb
*.db

70
cisticola/__init__.py Normal file
View File

@@ -0,0 +1,70 @@
from typing import List
from datetime import datetime
from dataclasses import dataclass
import cisticola.scraper
import cisticola.base
from sqlalchemy.orm import sessionmaker
class ScraperController:
    """Registers scrapers, uses them to generate ScraperResults. Synchronizes
    everything with database via ORM."""

    def __init__(self):
        # Scrapers are consulted in registration order; the first one whose
        # can_handle() accepts a channel is used for that channel.
        self.scrapers = []
        # sessionmaker factory, populated by connect_to_db(); None until then.
        self.session = None
        self.mapper_registry = None

    def register_scraper(self, scraper: cisticola.base.Scraper):
        """Add `scraper` to the scrapers consulted by scrape_channels()."""
        self.scrapers.append(scraper)

    def scrape_channels(self, channels: List[cisticola.base.Channel]):
        """Scrape each channel and store the newly found posts.

        For every channel the first registered scraper whose can_handle()
        accepts it is asked for the posts newer than the most recent stored
        result of that channel. Channels without a matching scraper are
        logged and skipped. Logs an error and returns immediately when
        connect_to_db() has not been called yet.
        """
        if self.session is None:
            cisticola.base.logger.error("No DB session")
            return

        for channel in channels:
            scraper = next(
                (s for s in self.scrapers if s.can_handle(channel)), None)

            if scraper is None:
                cisticola.base.logger.warning(
                    f"No handler found for Channel {channel}")
                # Nothing was scraped for this channel, so there is nothing
                # to save; move on instead of touching `posts`.
                continue

            since = self._most_recent_result(channel)
            posts = scraper.get_posts(channel, since=since)
            cisticola.base.logger.info(
                f"{scraper} found {len(posts)} new posts from {channel}")

            # The context manager closes the session (and returns its
            # connection) even if the insert or commit fails.
            with self.session() as session:
                session.bulk_save_objects(posts)
                session.commit()

            cisticola.base.logger.info(
                f"Added {len(posts)} entries to database")

    def _most_recent_result(self, channel):
        """Return the newest stored ScraperResult of `channel`, or None."""
        result = cisticola.base.ScraperResult

        with self.session() as session:
            # Restrict to this channel and sort newest-first by the post's
            # own date: scrapers use the returned row as the cut-off for
            # "already archived", so it must be the latest post of this
            # channel rather than the oldest row of any channel.
            # str(): the channel column is a String.
            return (session.query(result)
                    .filter(result.channel == str(channel.id))
                    .order_by(result.date.desc())
                    .first())

    def connect_to_db(self, engine):
        """Create the mapped tables on `engine` and bind the session factory
        to it."""
        # create tables
        cisticola.base.mapper_registry.metadata.create_all(bind=engine)

        self.session = sessionmaker()
        self.session.configure(bind=engine)
class ETLController:
    """Placeholder for the transform stage.

    Intended to turn the raw_data tables into a format that is more
    convenient for analysis; no behaviour is implemented yet.
    """

    def __init__(self):
        # Nothing to set up so far.
        return None

65
cisticola/base.py Normal file
View File

@@ -0,0 +1,65 @@
from dataclasses import dataclass
from datetime import datetime
from typing import List
from sqlalchemy.orm import registry
from sqlalchemy import Table, Column, Integer, String, DateTime
from loguru import logger
mapper_registry = registry()
@dataclass
class ScraperResult:
    """A minimally processed result from a scraper"""

    # Version string of the scraper that produced this result.
    scraper: str
    # Name of the platform the post came from (e.g. "Twitter").
    platform: str
    # Identifier of the Channel the post was scraped from.
    channel: str
    # The platform's own identifier of the post.
    platform_id: str
    # Date of the post itself, as reported by the platform.
    date: datetime
    # Unprocessed payload of the post, serialized to a string.
    raw_data: str
    # Moment at which the post was archived by the scraper.
    date_archived: datetime
# Table backing ScraperResult: one column per dataclass field plus an integer
# surrogate primary key that is not part of the dataclass.
raw_data_table = Table('raw_data', mapper_registry.metadata,
                       Column('id', Integer, primary_key=True),
                       Column('scraper', String),
                       Column('platform', String),
                       Column('channel', String),
                       Column('platform_id', String),
                       Column('date', DateTime),
                       Column('raw_data', String),
                       Column('date_archived', DateTime))

# Imperative mapping: ScraperResult stays a plain dataclass and is attached
# to the table here instead of inheriting from a declarative base.
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
class Scraper:
    """Base class for platform-specific scrapers.

    Subclasses provide their own ``__version__`` and override
    :meth:`can_handle` and :meth:`get_posts`.
    """

    __version__ = "Scraper 0.0.1"

    def __init__(self):
        pass

    def __str__(self):
        return self.__version__

    def can_handle(self, channel) -> bool:
        """Return whether this scraper is able to scrape `channel`.

        The base class handles nothing, so it answers ``False`` (a real
        bool, as annotated) rather than falling through with ``None``.
        """
        return False

    def get_posts(self, channel, since=None) -> "List[ScraperResult]":
        """Return the posts of `channel` that are newer than `since`.

        `since` is the most recent result already archived, or None to fetch
        everything.

        Raises:
            NotImplementedError: always; subclasses must override this.
        """
        # Failing loudly beats returning None, which callers would only trip
        # over later when treating the result as a list.
        raise NotImplementedError(
            f"{type(self).__name__} does not implement get_posts()")
@dataclass
class Channel:
    """Description of one channel (account/page) that can be scraped."""

    # Identifier of this channel record.
    id: int
    # Human-readable name of the channel.
    name: str
    # The platform's own identifier for the channel.
    # NOTE(review): the sample script passes integers here - confirm the
    # intended type.
    platform_id: str
    # Free-form grouping label.
    category: str
    # presumably the follower count; the sample script passes None - TODO
    # confirm whether this should be optional.
    followers: int
    # Platform name, e.g. "Twitter"; scrapers match on this value.
    platform: str
    # Address of the channel on the platform.
    url: str
    # presumably a country code (sample values are "US" and "FR") - verify.
    country: str
    # NOTE(review): meaning not evident from this file; the sample script
    # passes either a name or None.
    influencer: str
    # NOTE(review): semantics of the two flags below are not established in
    # this file - confirm with the data source.
    public: bool
    chat: bool
    # Free-form remarks.
    notes: str

View File

View File

@@ -0,0 +1,39 @@
import cisticola.base
from datetime import datetime
from typing import List
import snscrape.modules
class TwitterScraper(cisticola.base.Scraper):
    """An implementation of a Scraper for Twitter, using snscrape library"""
    __version__ = "TwitterScraper 0.0.1"

    # TODO remove this, should be able to scrape from user ID alone
    @staticmethod
    def get_username_from_url(url):
        """Extract the account name from a Twitter profile URL.

        Returns the username, or None when `url` is empty, does not contain
        ``twitter.com/``, or points below the profile level (for example a
        single status). A trailing slash, query string or fragment is
        ignored.
        """
        if not url:
            return None

        _, marker, tail = url.partition("twitter.com/")
        if not marker:
            # Not a twitter.com URL at all.
            return None

        # Keep only the path part directly after the host.
        username = tail.split("?", 1)[0].split("#", 1)[0].rstrip("/")

        if not username or "/" in username:
            return None

        return username

    def get_posts(self, channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]:
        """Return the tweets of `channel` that are newer than `since`.

        Tweets are consumed in the order snscrape yields them and collection
        stops at the first tweet whose id is not greater than the id stored
        in `since.platform_id`. With `since` set to None every tweet yielded
        is returned. Returns an empty list when no username can be derived
        from ``channel.url``.
        """
        username = TwitterScraper.get_username_from_url(channel.url)
        if username is None:
            # Without a username there is nothing snscrape could look up.
            return []

        # Parse the cut-off once instead of once per tweet.
        since_id = int(since.platform_id) if since is not None else None

        posts = []
        scraper = snscrape.modules.twitter.TwitterUserScraper(username)

        for tweet in scraper.get_items():
            if since_id is not None and tweet.id <= since_id:
                break

            # channel and platform_id are declared as str on ScraperResult,
            # so the numeric ids are converted explicitly.
            posts.append(cisticola.base.ScraperResult(
                scraper=self.__version__,
                platform="Twitter",
                channel=str(channel.id),
                platform_id=str(tweet.id),
                date=tweet.date,
                date_archived=datetime.now(),
                raw_data=tweet.json()))

        return posts

    def can_handle(self, channel):
        """Return True for Twitter channels whose URL yields a username."""
        return (channel.platform == "Twitter"
                and TwitterScraper.get_username_from_url(channel.url) is not None)

View File

@@ -0,0 +1,19 @@
class TwitterResult(ScraperResult):
    """ScraperResult specialised for Twitter that knows how to turn its raw
    payload into a TransformedResult."""

    # NOTE(review): json, ScraperResult and TransformedResult do not appear
    # to be imported in the visible part of this file, and __version__ is not
    # defined on ScraperResult - confirm before relying on this class.
    def transform(self) -> TransformedResult:
        """Decode ``raw_data`` as JSON and build a TransformedResult from it
        and from this result's own metadata."""
        payload = json.loads(self.raw_data)

        fields = {
            "scraper": self.scraper,
            "transformer": self.__version__,
            "platform": self.platform,
            "channel": self.channel,
            "date": self.date,
            "date_archived": self.date_archived,
            "url": payload['url'],
            "content": payload['content'],
            "author_id": payload['user']['id'],
            "author_username": payload['user']['username'],
        }

        return TransformedResult(**fields)

29
test.py Normal file
View File

@@ -0,0 +1,29 @@
# TODO/TODECIDE:
# should 'username' be a part of the Channel definition somehow?
# still need to do some planning for handling media
import cisticola
import cisticola.scraper.twitter
from sqlalchemy import create_engine
# One channel a registered scraper can handle (Twitter) and one it cannot
# (Telegram), so both controller paths are exercised.
test_channels = [
    cisticola.base.Channel(
        id=0,
        name="Logan Williams (test)",
        platform_id=891729132,
        category="test",
        followers=None,
        platform="Twitter",
        url="https://twitter.com/obtusatum",
        country="US",
        influencer=None,
        public=True,
        chat=False,
        notes="",
    ),
    cisticola.base.Channel(
        id=1,
        name="JQHN SPARTAN",
        platform_id=-1001181961026,
        category="qanon",
        followers=None,
        platform="Telegram",
        url="https://t.me/jqhnspartan",
        country="FR",
        influencer="JQNH SPARTAN",
        public=True,
        chat=False,
        notes="",
    ),
]

# Results are written to a local SQLite file.
engine = create_engine('sqlite:///test.db')

controller = cisticola.ScraperController()
controller.connect_to_db(engine)

scraper = cisticola.scraper.twitter.TwitterScraper()
controller.register_scraper(scraper)

controller.scrape_channels(test_channels)