mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-11 21:08:34 +03:00
Implement transformer for TwitterScraper that handles media; implement image OCR and EXIF extraction
This commit is contained in:
2
Pipfile
2
Pipfile
@@ -18,6 +18,8 @@ polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"}
|
||||
garc = "*"
|
||||
youtube-dl = "*"
|
||||
telethon = "*"
|
||||
pytesseract = "*"
|
||||
pyexiftool = {git = "https://github.com/smarnach/pyexiftool.git"}
|
||||
|
||||
[dev-packages]
|
||||
pytest = "*"
|
||||
|
||||
@@ -1,9 +1,16 @@
|
||||
from typing import List
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
|
||||
from sqlalchemy.orm import registry
|
||||
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey
|
||||
import pytesseract
|
||||
import PIL
|
||||
import io
|
||||
import exiftool
|
||||
import json
|
||||
import os
|
||||
|
||||
from .utils import make_request
|
||||
|
||||
mapper_registry = registry()
|
||||
|
||||
@@ -60,13 +67,14 @@ class TransformedResult:
|
||||
scraper: str
|
||||
transformer: str
|
||||
platform: str
|
||||
channel: str
|
||||
channel: int
|
||||
date: datetime
|
||||
date_archived: datetime
|
||||
url: str
|
||||
content: str
|
||||
author_id: str
|
||||
author_username: str
|
||||
content: str
|
||||
|
||||
|
||||
|
||||
analysis_table = Table('analysis', mapper_registry.metadata,
|
||||
@@ -76,13 +84,78 @@ analysis_table = Table('analysis', mapper_registry.metadata,
|
||||
Column('scraper', String),
|
||||
Column('transformer', String),
|
||||
Column('platform', String),
|
||||
Column('channel', String),
|
||||
Column('channel', Integer),
|
||||
Column('date', DateTime),
|
||||
Column('date_archived', DateTime),
|
||||
Column('url', String),
|
||||
Column('content', String),
|
||||
Column('author_id', String),
|
||||
Column('author_username', String)
|
||||
Column('author_username', String),
|
||||
Column('content', String)
|
||||
)
|
||||
|
||||
mapper_registry.map_imperatively(TransformedResult, analysis_table)
|
||||
mapper_registry.map_imperatively(TransformedResult, analysis_table)
|
||||
|
||||
@dataclass
class Media:
    """A media file attached to a transformed post.

    Instances are persisted to the ``media`` table. Calling :meth:`hydrate`
    downloads the file (unless the bytes are supplied) and fills in the
    derived metadata fields.
    """

    # Stored in the `raw_id` column (foreign key to raw_data.id).
    raw_id: int
    # Stored in the `post` column (foreign key to analysis.id).
    post: int
    # Location the file is downloaded from by get_blob().
    url: str
    # URL the media was originally found at.
    original_url: str

    # JSON-encoded exiftool metadata; None until hydrate_exif() has run.
    exif: str = None

    def get_blob(self):
        """Download the file at ``self.url`` and return the response content."""
        response = make_request(self.url)
        return response.content

    def hydrate(self, blob = None):
        """Populate derived fields from the media bytes.

        Args:
            blob: Raw bytes of the media file. When ``None`` the file is
                downloaded with :meth:`get_blob`.
        """
        if blob is None:
            blob = self.get_blob()

        self.hydrate_exif(blob)

    def hydrate_exif(self, blob):
        """Extract metadata from ``blob`` with exiftool and store it as JSON.

        exiftool reads from a path, so the bytes are written to a temporary
        file first. A unique temporary file is used (instead of a fixed name
        in the working directory) so parallel runs cannot overwrite each
        other's data, and it is removed even if exiftool fails.

        Args:
            blob: Raw bytes of the media file.
        """
        # Local import: only this method needs it.
        import tempfile

        fd, path = tempfile.mkstemp()
        try:
            with os.fdopen(fd, 'wb') as f:
                f.write(blob)

            with exiftool.ExifTool() as et:
                exif = et.get_metadata(path)
                self.exif = json.dumps(exif)
        finally:
            # Always clean up, including when writing or exiftool raised.
            os.remove(path)
|
||||
|
||||
@dataclass
class Image(Media):
    """A still image; adds OCR text on top of the metadata kept by ``Media``."""

    # Text recognised in the image by pytesseract; None until hydrate_ocr() has run.
    ocr: str = None

    def hydrate(self, blob=None):
        """Populate EXIF metadata and OCR text from the image bytes.

        Args:
            blob: Raw bytes of the image. When ``None`` the file is downloaded
                once with ``get_blob`` and reused for both steps.
        """
        if blob is None:
            blob = self.get_blob()

        super().hydrate(blob)
        self.hydrate_ocr(blob)

    def hydrate_ocr(self, blob):
        """Run OCR over the image bytes and store the recognised text in ``self.ocr``.

        Args:
            blob: Raw bytes of the image.
        """
        # A bare `import PIL` does not load the `PIL.Image` submodule, so import
        # it explicitly. It is aliased because `Image` is this class's own name.
        from PIL import Image as PILImage

        # The context manager releases the decoder's resources after OCR.
        with PILImage.open(io.BytesIO(blob)) as image:
            self.ocr = pytesseract.image_to_string(image)
|
||||
|
||||
@dataclass
class Video(Media):
    """A video file; carries no fields or behaviour beyond those of ``Media``."""
|
||||
|
||||
# Single table holding every kind of media object (Media, Image and Video).
media_table = Table('media', mapper_registry.metadata,
                    Column('id', Integer, primary_key=True,
                           autoincrement=True),
                    # Discriminator column: used as `polymorphic_on` by the mappings below.
                    Column('type', String),
                    # Row of raw_data this media belongs to.
                    Column('raw_id', Integer, ForeignKey('raw_data.id')),
                    # Row of analysis this media belongs to.
                    Column('post', Integer, ForeignKey('analysis.id')),
                    Column('url', String),
                    Column('original_url', String),
                    # JSON string written by Media.hydrate_exif.
                    Column('exif', String),
                    # Text written by Image.hydrate_ocr.
                    Column('ocr', String)
                    )

# Each class is mapped onto the same table with its own polymorphic identity.
# NOTE(review): Image and Video are mapped without `inherits=Media`, i.e. as three
# independent mappers rather than a single-table inheritance hierarchy. Confirm
# this is intended and that querying one class behaves as expected for rows
# whose `type` belongs to another class.
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
mapper_registry.map_imperatively(Image, media_table, polymorphic_on='type', polymorphic_identity='image')
mapper_registry.map_imperatively(Video, media_table, polymorphic_on='type', polymorphic_identity='video')
|
||||
@@ -1,4 +1,4 @@
|
||||
from .utils import make_request
|
||||
from cisticola.utils import make_request
|
||||
from .base import Scraper, ScraperController
|
||||
from .bitchute import BitchuteScraper
|
||||
from .gab import GabScraper
|
||||
|
||||
@@ -10,7 +10,7 @@ import ffmpeg
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
from cisticola.base import Channel, ScraperResult, mapper_registry
|
||||
from cisticola.scraper import make_request
|
||||
from cisticola.utils import make_request
|
||||
|
||||
class Scraper:
|
||||
__version__ = "Scraper 0.0.0"
|
||||
@@ -94,7 +94,6 @@ class ScraperController:
|
||||
def __init__(self):
|
||||
self.scrapers = []
|
||||
self.session = None
|
||||
self.mapper_registry = None
|
||||
|
||||
def register_scraper(self, scraper: Scraper):
|
||||
self.scrapers.append(scraper)
|
||||
@@ -149,9 +148,3 @@ class ScraperController:
|
||||
self.session = sessionmaker()
|
||||
self.session.configure(bind=engine)
|
||||
|
||||
|
||||
class ETLController:
|
||||
"""This class will transform the raw_data tables into a format more conducive to analysis."""
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
@@ -1,4 +1,8 @@
|
||||
from cisticola.base import ScraperResult, TransformedResult
|
||||
from typing import List, Generator
|
||||
from loguru import logger
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
from cisticola.base import ScraperResult, TransformedResult, Media, mapper_registry
|
||||
|
||||
class Transformer:
|
||||
"""Interface class for transformers"""
|
||||
@@ -11,6 +15,75 @@ class Transformer:
|
||||
def can_handle(self, data: ScraperResult) -> bool:
    """Return whether this transformer knows how to transform ``data``.

    The base implementation handles nothing; subclasses override this.
    ``self`` is required because the method is invoked on transformer
    instances.

    Args:
        data: The raw scraper result to inspect.

    Returns:
        ``False`` always, for the base class.
    """
    return False
|
||||
|
||||
def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]:
    """Yield the media objects attached to ``data``.

    The base implementation yields nothing, so a transformer that does not
    deal with media can be iterated safely by callers (returning ``None``
    would make ``for obj in media`` raise ``TypeError``).

    Args:
        data: The raw scraper result the media were found in.
        transformed: The already-transformed result the media belong to.

    Yields:
        Nothing, for the base class.
    """
    yield from ()
|
||||
|
||||
def transform(self, data: ScraperResult) -> TransformedResult:
    """Convert a raw scraper result into a ``TransformedResult``.

    Subclasses must override this. ``self`` is required because the method
    is invoked on transformer instances.

    Args:
        data: The raw scraper result to transform.

    Raises:
        NotImplementedError: Always, for the base class, so that a missing
            override is reported clearly instead of silently producing ``None``.
    """
    raise NotImplementedError("Transformer subclasses must implement transform()")
|
||||
|
||||
|
||||
class ETLController:
    """This class will transform the raw_data tables into a format more conducive to analysis.

    Transformers are registered with :meth:`register_transformer`; each raw
    result is handed to the first registered transformer whose ``can_handle``
    accepts it.
    """

    def __init__(self):
        self.transformers = []
        # Session factory, set by connect_to_db(). Initialised to None so the
        # "No DB session" guards below work before a database is connected.
        self.session = None

    def register_transformer(self, transformer: Transformer):
        """Add ``transformer`` to the list consulted for every result."""
        self.transformers.append(transformer)

    def connect_to_db(self, engine):
        """Create all mapped tables on ``engine`` and bind a session factory to it."""
        # create tables
        mapper_registry.metadata.create_all(bind=engine)

        self.session = sessionmaker()
        self.session.configure(bind=engine)

    def _apply_transformer(self, transformer, result, hydrate):
        """Transform one result and its media in a single transaction.

        Returns the number of media objects stored. On any error the
        transaction is rolled back and the exception re-raised; the session
        is always closed.
        """
        session = self.session()
        try:
            transformed = transformer.transform(result)

            session.add(transformed)
            # Flush before building media: transform_media implementations read
            # transformed.id, which is only populated once the row is flushed.
            session.flush()

            count = 0
            for obj in transformer.transform_media(result, transformed):
                if hydrate:
                    logger.info(f"Hydrating {obj}")
                    obj.hydrate()

                session.add(obj)
                count += 1

            session.commit()
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()

        return count

    @logger.catch
    def transform_results(self, results: List[ScraperResult], hydrate: bool = True):
        """Transform each of ``results`` with the first transformer that can handle it.

        Args:
            results: Raw scraper results to transform.
            hydrate: When true, ``hydrate()`` is called on every media object
                before it is stored.
        """
        if self.session is None:
            logger.error("No DB session")
            return

        for result in results:
            # Reset once per result (not per transformer) so the flag is always
            # bound, even when no transformers are registered.
            handled = False

            for transformer in self.transformers:
                if not transformer.can_handle(result):
                    continue

                logger.info(f"{transformer} is handling result {result}")
                handled = True

                count = self._apply_transformer(transformer, result, hydrate)
                logger.info(f"{transformer} generated {count} media objects")
                break

            if not handled:
                logger.warning(f"No Transformer could handle {result}")

    @logger.catch
    def transform_all_untransformed(self, hydrate: bool = True):
        """Find every raw result without a transformed counterpart and transform it.

        Args:
            hydrate: Passed through to :meth:`transform_results`.
        """
        if self.session is None:
            logger.error("No DB session")
            return

        session = self.session()
        try:
            # Outer join keeps raw rows with no match; those have a NULL raw_id.
            untransformed = (
                session.query(ScraperResult)
                .join(TransformedResult, isouter=True)
                .where(TransformedResult.raw_id.is_(None))
                .all()
            )
            logger.info(f"Found {len(untransformed)} items to ETL")

            self.transform_results(untransformed, hydrate=hydrate)
        finally:
            # Keep the querying session open while the results are in use, then release it.
            session.close()
|
||||
@@ -1,13 +1,51 @@
|
||||
import json
|
||||
from loguru import logger
|
||||
from typing import Generator
|
||||
|
||||
from cisticola.transformer.base import Transformer
|
||||
from cisticola.base import ScraperResult, TransformedResult
|
||||
from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media
|
||||
|
||||
class TwitterTransformer(Transformer):
|
||||
"""A Twitter specific ScraperResult, with a method ETL/transforming"""
|
||||
|
||||
__version__ = "TwitterTransformer 0.0.1"
|
||||
|
||||
def can_handle(self, data: ScraperResult) -> bool:
    """Return True when the first space-separated token of ``data.scraper`` is "TwitterScraper"."""
    scraper_name = data.scraper.split(' ')[0]
    return scraper_name == "TwitterScraper"
|
||||
|
||||
def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]:
    """Yield an ``Image`` or ``Video`` for every archived media item of a tweet.

    The original media URL is looked up in ``data.archived_urls`` to find the
    archived copy; items that have no usable URL, or that were not archived,
    are logged and skipped.

    Args:
        data: Raw scraper result; ``raw_data`` is a JSON document whose
            optional ``media`` entry lists the tweet's media.
        transformed: The transformed post the media belong to (its ``id`` is
            stored on each media object).

    Yields:
        ``Image`` for photos, ``Video`` for every other media type with a URL.
    """
    raw = json.loads(data.raw_data)

    # Tolerate a missing or null archive mapping.
    archived_urls = data.archived_urls or {}

    # `media` may be missing or null for tweets without attachments.
    for media in raw.get('media') or []:
        media_type = media.get("_type")
        orig = None

        if media_type == "snscrape.modules.twitter.Photo":
            orig = media.get("fullUrl")
        elif media_type == "snscrape.modules.twitter.Gif":
            variants = media.get("variants") or []
            if variants:
                orig = variants[0].get("url")
        elif media_type == "snscrape.modules.twitter.Video":
            # Pick the highest-bitrate variant; variants without a bitrate are
            # ignored, and there may be none left to choose from.
            rated = [v for v in media.get("variants") or [] if v.get("bitrate")]
            variant = max(rated, key=lambda v: v["bitrate"], default=None)
            if variant is not None:
                orig = variant.get("url")

        if orig is None:
            logger.warning(f"No media URL found for {media}")
        elif orig not in archived_urls:
            logger.info("Media discovered but not archived")
        else:
            new = archived_urls[orig]

            if media_type == "snscrape.modules.twitter.Photo":
                m = Image(url=new, post=transformed.id, raw_id=data.id, original_url=orig)
            else:
                m = Video(url=new, post=transformed.id, raw_id=data.id, original_url=orig)

            yield m
|
||||
|
||||
def transform(self, data: ScraperResult) -> TransformedResult:
|
||||
raw = json.loads(data.raw_data)
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import requests
|
||||
from loguru import logger
|
||||
import time
|
||||
|
||||
def make_request(url, headers = None, max_retries = 5, break_codes = None):
|
||||
|
||||
@@ -64,6 +65,9 @@ def request_until_200(url, headers = None, max_retries = 5, break_codes = None):
|
||||
while r.status_code not in break_codes and n_retries < 5:
|
||||
logger.warning(f"Request for url: {url} returned status: {r.status_code} on attempt: {n_retries}/{max_retries}")
|
||||
n_retries += 1
|
||||
|
||||
# back off subsequent requests
|
||||
time.sleep(n_retries)
|
||||
r = requests.get(url, headers = headers)
|
||||
|
||||
if r.status_code not in break_codes:
|
||||
97
test.py
97
test.py
@@ -1,7 +1,7 @@
|
||||
from sqlalchemy import create_engine
|
||||
from loguru import logger
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.base import Channel, TransformedResult, ScraperResult
|
||||
from cisticola.scraper import (
|
||||
ScraperController,
|
||||
BitchuteScraper,
|
||||
@@ -12,6 +12,9 @@ from cisticola.scraper import (
|
||||
TelegramSnscrapeScraper,
|
||||
TelegramTelethonScraper,
|
||||
TwitterScraper)
|
||||
from cisticola.transformer.base import ETLController
|
||||
from cisticola.transformer.twitter import TwitterTransformer
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
logger.add("../test.log")
|
||||
|
||||
@@ -29,87 +32,6 @@ test_channels = [
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=1,
|
||||
name="South West Ohio Proud Boys (test)",
|
||||
platform_id=-1001276612436,
|
||||
category="test",
|
||||
followers=None,
|
||||
platform="Telegram",
|
||||
url="https://t.me/SouthwestOhioPB",
|
||||
screenname="SouthwestOhioPB",
|
||||
country="US",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=2,
|
||||
name="LizardRepublic (test)",
|
||||
platform_id='lizardrepublic',
|
||||
category="test",
|
||||
followers=None,
|
||||
platform="Gettr",
|
||||
url="https://www.gettr.com/user/lizardrepublic",
|
||||
screenname="lizardrepublic",
|
||||
country="US",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=4,
|
||||
name="bestonlinejewelrystoresusa@gmail.com (test)", platform_id='bestonlinejewelrystoresusagmailcom',
|
||||
category="test",
|
||||
followers=None,
|
||||
platform="Bitchute",
|
||||
url="https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/", screenname=None,
|
||||
country="US",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=5,
|
||||
name="Mak1n' Bacon (test)",
|
||||
platform_id='Mak1nBacon',
|
||||
category="test",
|
||||
followers=None,
|
||||
platform="Odysee",
|
||||
url="https://odysee.com/@Mak1nBacon",
|
||||
screenname='Mak1nBacon',
|
||||
country="US",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=6,
|
||||
name="Capt. Marc Simon (test)",
|
||||
platform_id='marc_capt',
|
||||
category="test",
|
||||
followers=None,
|
||||
platform="Gab",
|
||||
url="https://gab.com/marc_capt",
|
||||
screenname='marc_capt',
|
||||
country="CA",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes=""),
|
||||
Channel(
|
||||
id=7,
|
||||
name="we are uploading videos wow products and problem solving products.please share like and subscribe our channelwe are uploading videos wow products and problem solving products.please share like and subscribe our channel", platform_id='c-916305',
|
||||
category="test",
|
||||
followers=None,
|
||||
platform="Rumble",
|
||||
url="https://rumble.com/c/c-916305",
|
||||
screenname='we are uploading',
|
||||
country="CA",
|
||||
influencer=None,
|
||||
public=True,
|
||||
chat=False,
|
||||
notes="")]
|
||||
|
||||
controller = ScraperController()
|
||||
@@ -126,7 +48,14 @@ scrapers = [
|
||||
|
||||
controller.register_scrapers(scrapers)
|
||||
|
||||
engine = create_engine('sqlite:///test3.db')
|
||||
engine = create_engine('sqlite:///test.db')
|
||||
controller.connect_to_db(engine)
|
||||
|
||||
controller.scrape_channels(test_channels, archive_media = False)
|
||||
controller.scrape_channels(test_channels, archive_media = True)
|
||||
|
||||
transformer = TwitterTransformer()
|
||||
|
||||
etl_controller = ETLController()
|
||||
etl_controller.register_transformer(transformer)
|
||||
etl_controller.connect_to_db(engine)
|
||||
etl_controller.transform_all_untransformed()
|
||||
|
||||
Reference in New Issue
Block a user