mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
Initial commit
This commit is contained in:
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
.DS_Store
|
||||
*.pyc
|
||||
*.ipynb
|
||||
*.db
|
||||
70
cisticola/__init__.py
Normal file
70
cisticola/__init__.py
Normal file
@@ -0,0 +1,70 @@
|
||||
from typing import List
|
||||
from datetime import datetime
|
||||
from dataclasses import dataclass
|
||||
import cisticola.scraper
|
||||
import cisticola.base
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
|
||||
class ScraperController:
    """Registers scrapers, uses them to generate ScraperResults. Synchronizes
    everything with database via ORM.

    Usage: register one or more scrapers with ``register_scraper``, bind a
    database with ``connect_to_db``, then call ``scrape_channels``.
    """

    def __init__(self):
        # Registered scrapers, tried in registration order for every channel.
        self.scrapers = []
        # Session factory (a ``sessionmaker``); None until connect_to_db().
        self.session = None
        self.mapper_registry = None

    def register_scraper(self, scraper: cisticola.base.Scraper):
        """Add ``scraper`` to the scrapers that are tried for each channel."""
        self.scrapers.append(scraper)

    @staticmethod
    def _latest_result(session, channel):
        """Return the newest stored ScraperResult for ``channel``, or None."""
        result_cls = cisticola.base.ScraperResult

        # TwitterScraper.get_posts stops at the first post that is not newer
        # than ``since``, so ``since`` has to be the newest post already
        # stored for *this* channel: filter on the channel and sort by the
        # post date, newest first. The ``channel`` column is a String, hence
        # the str() around the integer channel id.
        return (session.query(result_cls)
                .filter(result_cls.channel == str(channel.id))
                .order_by(result_cls.date.desc())
                .first())

    def scrape_channels(self, channels: List[cisticola.base.Channel]):
        """Scrape new posts from ``channels`` and store them in the database.

        For each channel the first registered scraper whose ``can_handle``
        accepts it is used; channels without a handler are logged and
        skipped. The posts of all channels are saved in a single commit.

        Logs an error and returns without scraping when ``connect_to_db``
        has not been called yet.
        """
        if self.session is None:
            cisticola.base.logger.error("No DB session")
            return

        new_posts = []
        session = self.session()

        try:
            for channel in channels:
                scraper = next(
                    (s for s in self.scrapers if s.can_handle(channel)), None)

                if scraper is None:
                    cisticola.base.logger.warning(
                        f"No handler found for Channel {channel}")
                    continue

                since = self._latest_result(session, channel)
                posts = scraper.get_posts(channel, since=since)

                cisticola.base.logger.info(
                    f"{scraper} found {len(posts)} new posts from {channel}")

                # Accumulate instead of overwriting so that the results of
                # every handled channel reach the database.
                new_posts.extend(posts)

            session.bulk_save_objects(new_posts)
            session.commit()
        finally:
            # Always hand the connection back, also when a scraper raises.
            session.close()

        cisticola.base.logger.info(
            f"Added {len(new_posts)} entries to database")

    def connect_to_db(self, engine):
        """Create missing tables on ``engine`` and bind the session factory to it."""
        # create tables
        cisticola.base.mapper_registry.metadata.create_all(bind=engine)

        self.session = sessionmaker()
        self.session.configure(bind=engine)
|
||||
|
||||
|
||||
class ETLController:
    """Placeholder for the ETL stage.

    Intended to turn the raw_data tables into a format that is easier to
    analyse; no behaviour is implemented yet.
    """

    def __init__(self):
        # Nothing to set up so far.
        return None
|
||||
65
cisticola/base.py
Normal file
65
cisticola/base.py
Normal file
@@ -0,0 +1,65 @@
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import List
|
||||
from sqlalchemy.orm import registry
|
||||
from sqlalchemy import Table, Column, Integer, String, DateTime
|
||||
from loguru import logger
|
||||
|
||||
# Shared SQLAlchemy registry: its metadata collects the table definitions
# below and it performs the imperative mapping of the dataclasses onto them.
mapper_registry = registry()
|
||||
|
||||
|
||||
@dataclass
class ScraperResult:
    """One scraped post, kept with as little processing as possible."""

    # Version string of the scraper that produced the result
    # (TwitterScraper passes its ``__version__``).
    scraper: str
    # Name of the source platform, e.g. "Twitter".
    platform: str
    # Identifier of the Channel the post was scraped from.
    channel: str
    # Identifier of the post on the platform itself.
    platform_id: str
    # Date of the post as reported by the platform.
    date: datetime
    # Unprocessed payload (TwitterScraper stores the tweet's JSON here).
    raw_data: str
    # Moment at which the post was scraped.
    date_archived: datetime
|
||||
|
||||
|
||||
# Table backing ScraperResult: one column per dataclass field plus a
# surrogate integer primary key.
raw_data_table = Table(
    'raw_data',
    mapper_registry.metadata,
    Column('id', Integer, primary_key=True),
    Column('scraper', String),
    Column('platform', String),
    Column('channel', String),
    Column('platform_id', String),
    Column('date', DateTime),
    Column('raw_data', String),
    Column('date_archived', DateTime),
)

# Map the plain dataclass onto the table so that ScraperResult instances can
# be queried and saved through an ORM session.
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
|
||||
|
||||
|
||||
class Scraper:
    """Base class for platform scrapers.

    Subclasses set their own ``__version__`` and override ``can_handle`` and
    ``get_posts``.
    """

    # Identifies the scraper implementation; also used as its string form.
    __version__ = "Scraper 0.0.1"

    def __init__(self):
        pass

    def __str__(self):
        return self.__version__

    def can_handle(self, channel) -> bool:
        """Return whether this scraper can scrape ``channel``.

        The base class handles nothing, so it returns False (rather than
        falling through to None) to honour the declared ``bool`` return type.
        """
        return False

    def get_posts(self, channel, since=None) -> "List[ScraperResult]":
        """Return the posts of ``channel`` that are newer than ``since``.

        ``since`` is the most recent result that is already stored, or None
        to fetch everything.

        Raises:
            NotImplementedError: always; subclasses must override this.
        """
        raise NotImplementedError(
            f"{type(self).__name__} does not implement get_posts()")
|
||||
|
||||
|
||||
@dataclass
class Channel:
    """Description of one channel (account or page) that can be scraped."""

    # Identifier of the channel; TwitterScraper stores it as
    # ScraperResult.channel.
    id: int
    name: str
    platform_id: str
    category: str
    followers: int
    # Platform name; TwitterScraper.can_handle matches it against "Twitter".
    platform: str
    # Address of the channel; TwitterScraper derives the username from it.
    url: str
    country: str
    influencer: str
    public: bool
    chat: bool
    notes: str
|
||||
0
cisticola/scraper/__init__.py
Normal file
0
cisticola/scraper/__init__.py
Normal file
39
cisticola/scraper/twitter.py
Normal file
39
cisticola/scraper/twitter.py
Normal file
@@ -0,0 +1,39 @@
|
||||
import cisticola.base
|
||||
from datetime import datetime
|
||||
from typing import List
|
||||
import snscrape.modules
|
||||
|
||||
class TwitterScraper(cisticola.base.Scraper):
    """An implementation of a Scraper for Twitter, using snscrape library"""
    __version__ = "TwitterScraper 0.0.1"

    # TODO remove this, should be able to scrape from user ID alone
    @staticmethod
    def get_username_from_url(url):
        """Return the account name from a Twitter profile URL, or None.

        None is returned when ``url`` is empty, does not contain
        "twitter.com/", has no account name after it, or points below the
        profile (for example to a single status).
        """
        if not url:
            return None

        _, marker, remainder = url.partition("twitter.com/")
        if not marker:
            return None

        # Drop query string / fragment and a trailing slash so that
        # "twitter.com/name/" and "twitter.com/name?lang=en" still match.
        username = remainder.split("?", 1)[0].split("#", 1)[0].rstrip("/")

        if not username or "/" in username:
            return None

        return username

    def get_posts(self, channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]:
        """Return the tweets of ``channel`` that are newer than ``since``.

        Iteration stops at the first tweet whose id is not greater than
        ``since.platform_id``, so this relies on snscrape yielding tweets
        newest first. With ``since=None`` every available tweet is returned.

        Raises:
            ValueError: if no username can be derived from ``channel.url``.
        """
        username = TwitterScraper.get_username_from_url(channel.url)
        if username is None:
            raise ValueError(
                f"Cannot derive a Twitter username from {channel.url!r}")

        scraper = snscrape.modules.twitter.TwitterUserScraper(username)

        # Parse the threshold once instead of on every iteration.
        last_seen_id = None if since is None else int(since.platform_id)

        posts = []
        for tweet in scraper.get_items():
            if last_seen_id is not None and tweet.id <= last_seen_id:
                break

            # ``channel`` and ``platform_id`` are declared as str on
            # ScraperResult and map to String columns, so convert the
            # integer ids explicitly.
            posts.append(cisticola.base.ScraperResult(scraper=self.__version__,
                                                      platform="Twitter",
                                                      channel=str(channel.id),
                                                      platform_id=str(tweet.id),
                                                      date=tweet.date,
                                                      date_archived=datetime.now(),
                                                      raw_data=tweet.json()))

        return posts

    def can_handle(self, channel) -> bool:
        """Return True for Twitter channels whose URL yields a username."""
        return (channel.platform == "Twitter"
                and TwitterScraper.get_username_from_url(channel.url) is not None)
|
||||
19
cisticola/transformer/__init__.py
Normal file
19
cisticola/transformer/__init__.py
Normal file
@@ -0,0 +1,19 @@
|
||||
import json

from cisticola.base import ScraperResult


class TwitterResult(ScraperResult):
    """A Twitter-specific ScraperResult with a method for ETL/transforming."""

    # Recorded as the ``transformer`` of every TransformedResult; follows the
    # "<ClassName> <version>" convention of the scrapers.
    __version__ = "TwitterResult 0.0.1"

    # NOTE(review): TransformedResult is neither defined nor imported in the
    # visible source. The return annotation is quoted so that this module can
    # be imported, but transform() cannot run until that class is provided.
    def transform(self) -> "TransformedResult":
        """Build a TransformedResult from the tweet JSON in ``raw_data``.

        Reads the ``url``, ``content`` and ``user`` (``id``, ``username``)
        entries of the decoded JSON; a missing entry raises KeyError.
        """
        data = json.loads(self.raw_data)
        author = data['user']

        transformed = TransformedResult(
            scraper=self.scraper,
            transformer=self.__version__,
            platform=self.platform,
            channel=self.channel,
            date=self.date,
            date_archived=self.date_archived,
            url=data['url'],
            content=data['content'],
            author_id=author['id'],
            author_username=author['username'])

        return transformed
|
||||
29
test.py
Normal file
29
test.py
Normal file
@@ -0,0 +1,29 @@
|
||||
# TODO/TODECIDE:
|
||||
# should 'username' be a part of the Channel definition somehow?
|
||||
# still need to do some planning for handling media
|
||||
|
||||
import cisticola
|
||||
import cisticola.scraper.twitter
|
||||
from sqlalchemy import create_engine
|
||||
|
||||
|
||||
# Sample channels for a manual end-to-end run: one Twitter account, and one
# Telegram channel for which no scraper is registered below.
test_channels = [
    cisticola.base.Channel(
        id=0,
        name="Logan Williams (test)",
        platform_id=891729132,
        category="test",
        followers=None,
        platform="Twitter",
        url="https://twitter.com/obtusatum",
        country="US",
        influencer=None,
        public=True,
        chat=False,
        notes="",
    ),
    # NOTE(review): ``influencer`` is spelled "JQNH" while ``name`` and the
    # URL use "JQHN" - confirm which spelling is intended.
    cisticola.base.Channel(
        id=1,
        name="JQHN SPARTAN",
        platform_id=-1001181961026,
        category="qanon",
        followers=None,
        platform="Telegram",
        url="https://t.me/jqhnspartan",
        country="FR",
        influencer="JQNH SPARTAN",
        public=True,
        chat=False,
        notes="",
    ),
]
|
||||
|
||||
|
||||
# Build a controller that only knows how to scrape Twitter.
controller = cisticola.ScraperController()

scraper = cisticola.scraper.twitter.TwitterScraper()
controller.register_scraper(scraper)

# Store the results in a local SQLite file next to this script.
engine = create_engine('sqlite:///test.db')
controller.connect_to_db(engine)

# Scrape every sample channel and write the results to the database.
controller.scrape_channels(test_channels)
|
||||
Reference in New Issue
Block a user