Implement transformer for TwitterScraper that handles media; implement image OCR and EXIF extraction

This commit is contained in:
Logan Williams
2022-03-10 15:34:24 +01:00
parent 6cf3b8842d
commit fa5037d67c
8 changed files with 214 additions and 102 deletions

View File

@@ -18,6 +18,8 @@ polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"}
garc = "*"
youtube-dl = "*"
telethon = "*"
pytesseract = "*"
pyexiftool = {git = "https://github.com/smarnach/pyexiftool.git"}
[dev-packages]
pytest = "*"

View File

@@ -1,9 +1,16 @@
from typing import List
from dataclasses import dataclass
from datetime import datetime
from sqlalchemy.orm import registry
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey
import pytesseract
import PIL
import io
import exiftool
import json
import os
from .utils import make_request
mapper_registry = registry()
@@ -60,13 +67,14 @@ class TransformedResult:
scraper: str
transformer: str
platform: str
channel: str
channel: int
date: datetime
date_archived: datetime
url: str
content: str
author_id: str
author_username: str
content: str
analysis_table = Table('analysis', mapper_registry.metadata,
@@ -76,13 +84,78 @@ analysis_table = Table('analysis', mapper_registry.metadata,
Column('scraper', String),
Column('transformer', String),
Column('platform', String),
Column('channel', String),
Column('channel', Integer),
Column('date', DateTime),
Column('date_archived', DateTime),
Column('url', String),
Column('content', String),
Column('author_id', String),
Column('author_username', String)
Column('author_username', String),
Column('content', String)
)
mapper_registry.map_imperatively(TransformedResult, analysis_table)
mapper_registry.map_imperatively(TransformedResult, analysis_table)
@dataclass
class Media:
    """A media file attached to a transformed post.

    Attributes:
        raw_id: id of the raw scraper row this media came from.
        post: id of the transformed (analysis) row this media belongs to.
        url: URL the media bytes are downloaded from by ``get_blob``.
        original_url: URL of the media on the source platform.
        exif: JSON-encoded metadata produced by ``hydrate_exif``; ``None``
            until the object has been hydrated.
    """

    raw_id: int
    post: int
    url: str
    original_url: str
    exif: str = None

    def get_blob(self):
        """Download ``self.url`` via ``make_request`` and return the response's ``content``."""
        response = make_request(self.url)
        return response.content

    def hydrate(self, blob = None):
        """Populate derived fields from the media bytes.

        Args:
            blob: the raw media bytes. When ``None`` they are fetched with
                ``get_blob`` first.
        """
        if blob is None:
            blob = self.get_blob()

        self.hydrate_exif(blob)

    def hydrate_exif(self, blob):
        """Run exiftool over ``blob`` and store the result in ``self.exif`` as JSON.

        exiftool is handed a file path, so the bytes are spooled to a uniquely
        named temporary file. A fixed name in the working directory would be
        clobbered by concurrent runs and could overwrite an unrelated file.
        The temporary file is removed even when exiftool or JSON encoding fails.

        Note: any path-derived fields in the returned metadata will reflect the
        temporary file's location rather than a fixed name.

        Args:
            blob: the raw media bytes.
        """
        import tempfile

        # delete=False: the file must be closed before another process reads
        # it on some platforms, so cleanup is done by hand in ``finally``.
        handle = tempfile.NamedTemporaryFile(delete=False)
        try:
            with handle:
                handle.write(blob)

            with exiftool.ExifTool() as et:
                exif = et.get_metadata(handle.name)

            self.exif = json.dumps(exif)
        finally:
            os.remove(handle.name)
@dataclass
class Image(Media):
    """A still image; adds OCR text on top of the metadata handled by ``Media``.

    Attributes:
        ocr: text recognised in the image by ``hydrate_ocr``; ``None`` until
            the object has been hydrated.
    """

    ocr: str = None

    def hydrate(self, blob=None):
        """Populate EXIF metadata and OCR text from the image bytes.

        Args:
            blob: the raw image bytes. When ``None`` they are fetched once with
                ``get_blob`` and shared by both hydration steps.
        """
        if blob is None:
            blob = self.get_blob()

        super().hydrate(blob)
        self.hydrate_ocr(blob)

    def hydrate_ocr(self, blob):
        """Run Tesseract OCR over ``blob`` and store the text in ``self.ocr``.

        Args:
            blob: the raw image bytes.
        """
        # ``import PIL`` alone does not load the ``Image`` submodule, so import
        # it explicitly rather than relying on another module having done so.
        # It is aliased because this class is itself named ``Image``.
        from PIL import Image as PILImage

        # The context manager releases the decoder's resources once OCR is done.
        with PILImage.open(io.BytesIO(blob)) as image:
            self.ocr = pytesseract.image_to_string(image)
@dataclass
class Video(Media):
    # Video media: declares no fields or methods of its own, so everything
    # (including hydration) comes from Media.
    ...
# Single 'media' table shared by Media, Image and Video rows.
media_table = Table('media', mapper_registry.metadata,
# Surrogate primary key.
Column('id', Integer, primary_key=True,
autoincrement=True),
# Discriminator column used by the mappers below (polymorphic_on='type').
Column('type', String),
# Raw scraper row the media was extracted from.
Column('raw_id', Integer, ForeignKey('raw_data.id')),
# Transformed (analysis) row the media belongs to.
Column('post', Integer, ForeignKey('analysis.id')),
# URL the media bytes are fetched from (Media.get_blob requests this).
Column('url', String),
# URL of the media on the source platform.
Column('original_url', String),
# JSON string written by Media.hydrate_exif.
Column('exif', String),
# OCR text written by Image.hydrate_ocr.
Column('ocr', String)
)
# Each class is mapped imperatively onto the same table, distinguished by the
# value stored in 'type'.
# NOTE(review): Image and Video subclass Media but are mapped here without
# `inherits=`; confirm this yields the intended single-table inheritance.
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
mapper_registry.map_imperatively(Image, media_table, polymorphic_on='type', polymorphic_identity='image')
mapper_registry.map_imperatively(Video, media_table, polymorphic_on='type', polymorphic_identity='video')

View File

@@ -1,4 +1,4 @@
from .utils import make_request
from cisticola.utils import make_request
from .base import Scraper, ScraperController
from .bitchute import BitchuteScraper
from .gab import GabScraper

View File

@@ -10,7 +10,7 @@ import ffmpeg
from sqlalchemy.orm import sessionmaker
from cisticola.base import Channel, ScraperResult, mapper_registry
from cisticola.scraper import make_request
from cisticola.utils import make_request
class Scraper:
__version__ = "Scraper 0.0.0"
@@ -94,7 +94,6 @@ class ScraperController:
def __init__(self):
self.scrapers = []
self.session = None
self.mapper_registry = None
def register_scraper(self, scraper: Scraper):
    """Append ``scraper`` to the end of this controller's ``scrapers`` list."""
    registered = self.scrapers
    registered.append(scraper)
@@ -149,9 +148,3 @@ class ScraperController:
self.session = sessionmaker()
self.session.configure(bind=engine)
class ETLController:
    """This class will transform the raw_data tables into a format more conducive to analysis."""

    def __init__(self):
        # Placeholder constructor: no state is set up.
        return None

View File

@@ -1,4 +1,8 @@
from cisticola.base import ScraperResult, TransformedResult
from typing import List, Generator
from loguru import logger
from sqlalchemy.orm import sessionmaker
from cisticola.base import ScraperResult, TransformedResult, Media, mapper_registry
class Transformer:
"""Interface class for transformers"""
@@ -11,6 +15,75 @@ class Transformer:
def can_handle(data: ScraperResult) -> bool:
    """Interface stub: decide whether this transformer can process ``data``.

    The base implementation does nothing and returns ``None``; concrete
    transformers are expected to override it.

    NOTE(review): declared without ``self`` although ``TwitterTransformer``
    overrides it as an instance method -- confirm whether ``self`` is missing.
    """
    return None
def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]:
    """Interface stub: produce the media objects attached to ``data``.

    ``transformed`` is the already-transformed result for ``data``. The base
    implementation does nothing and returns ``None``; concrete transformers
    are expected to override it.
    """
    return None
def transform(data: ScraperResult) -> TransformedResult:
    """Interface stub: convert ``data`` into a ``TransformedResult``.

    The base implementation does nothing and returns ``None``; concrete
    transformers are expected to override it.

    NOTE(review): declared without ``self`` although ``TwitterTransformer``
    overrides it as an instance method -- confirm whether ``self`` is missing.
    """
    return None
class ETLController:
    """This class will transform the raw_data tables into a format more conducive to analysis."""

    def __init__(self):
        """Start with no registered transformers and no database connection."""
        self.transformers = []
        # Set by connect_to_db(); initialised here so the "No DB session"
        # guards below work instead of raising AttributeError.
        self.session = None

    def register_transformer(self, transformer: Transformer):
        """Add ``transformer`` to the list consulted by ``transform_results``.

        Transformers are tried in registration order; the first one whose
        ``can_handle`` accepts a result is used.
        """
        self.transformers.append(transformer)

    def connect_to_db(self, engine):
        """Create all mapped tables on ``engine`` and bind a session factory to it."""
        # create tables
        mapper_registry.metadata.create_all(bind=engine)

        self.session = sessionmaker()
        self.session.configure(bind=engine)

    @logger.catch
    def transform_results(self, results: List[ScraperResult], hydrate: bool = True):
        """Transform each raw result and persist it together with its media.

        For every result the first registered transformer that can handle it
        is used. The transformed row is flushed before media objects are
        generated, and the row plus its media are committed together.

        Args:
            results: raw scraper results to transform.
            hydrate: when true, call ``hydrate()`` on every media object
                before it is stored.
        """
        if self.session is None:
            logger.error("No DB session")
            return

        for result in results:
            # Resolve the handler up front so the "nobody handled it" case is
            # well defined even when no transformers are registered.
            transformer = next(
                (t for t in self.transformers if t.can_handle(result)), None)

            if transformer is None:
                logger.warning(f"No Transformer could handle {result}")
                continue

            logger.info(f"{transformer} is handling result {result}")

            session = self.session()
            try:
                transformed = transformer.transform(result)
                session.add(transformed)
                # Flush before generating media so they can reference the new row.
                session.flush()

                count = 0
                for obj in transformer.transform_media(result, transformed):
                    if hydrate:
                        logger.info(f"Hydrating {obj}")
                        obj.hydrate()

                    session.add(obj)
                    count += 1

                session.commit()
            finally:
                # Always release the connection; closing also discards any
                # uncommitted work if a step above raised.
                session.close()

            logger.info(f"{transformer} generated {count} media objects")

    @logger.catch
    def transform_all_untransformed(self, hydrate: bool = True):
        """Transform every raw result that has no transformed row yet.

        Args:
            hydrate: forwarded to ``transform_results``.
        """
        if self.session is None:
            logger.error("No DB session")
            return

        session = self.session()
        try:
            # Outer join + NULL check selects raw rows with no transformed row.
            untransformed = (
                session.query(ScraperResult)
                .join(TransformedResult, isouter=True)
                .where(TransformedResult.raw_id.is_(None))
                .all())

            logger.info(f"Found {len(untransformed)} items to ETL")

            # Keep the querying session open while the rows are in use.
            self.transform_results(untransformed, hydrate=hydrate)
        finally:
            session.close()

View File

@@ -1,13 +1,51 @@
import json
from loguru import logger
from typing import Generator
from cisticola.transformer.base import Transformer
from cisticola.base import ScraperResult, TransformedResult
from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media
class TwitterTransformer(Transformer):
"""A Twitter specific ScraperResult, with a method ETL/transforming"""
__version__ = "TwitterTransformer 0.0.1"
def can_handle(self, data: ScraperResult) -> bool:
    """Return True when ``data`` was produced by the Twitter scraper.

    ``data.scraper`` is split on spaces and only its first token is compared
    against ``"TwitterScraper"``; anything after the first space is ignored.
    """
    scraper_name, *_ = data.scraper.split(' ')
    return scraper_name == "TwitterScraper"
def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]:
    """Yield a media object for every archived attachment of a tweet.

    The tweet JSON in ``data.raw_data`` is scanned for media entries. For each
    entry the source URL is chosen by type: a photo's ``fullUrl``, a GIF's
    first variant, or a video's highest-bitrate variant. Entries whose source
    URL is a key of ``data.archived_urls`` are yielded as ``Image`` (photos)
    or ``Video`` (everything else), pointing at the archived copy. Entries
    with no usable URL, or that were not archived, are logged and skipped.

    Args:
        data: the raw scraper result for the tweet.
        transformed: the transformed row for the tweet; its ``id`` is stored
            as the media's ``post``.

    Yields:
        ``Image`` or ``Video`` objects, not yet hydrated.
    """
    raw = json.loads(data.raw_data)

    # ``media`` may be absent or null for tweets without attachments.
    for media in raw.get('media') or []:
        media_type = media.get("_type")
        variants = media.get("variants") or []
        orig = None

        if media_type == "snscrape.modules.twitter.Photo":
            orig = media.get("fullUrl")
        elif media_type == "snscrape.modules.twitter.Gif":
            if variants:
                orig = variants[0].get("url")
        elif media_type == "snscrape.modules.twitter.Video":
            # Only variants with a bitrate are candidates. Guard the empty
            # case, because max() raises on an empty sequence.
            rated = [v for v in variants if v.get("bitrate")]
            if rated:
                orig = max(rated, key=lambda v: v["bitrate"]).get("url")

        if orig is None:
            logger.warning(f"No media URL found for {media}")
        elif orig not in data.archived_urls:
            logger.info("Media discovered but not archived")
        else:
            new = data.archived_urls[orig]
            if media_type == "snscrape.modules.twitter.Photo":
                m = Image(url=new, post=transformed.id, raw_id=data.id, original_url=orig)
            else:
                m = Video(url=new, post=transformed.id, raw_id=data.id, original_url=orig)

            yield m
def transform(self, data: ScraperResult) -> TransformedResult:
raw = json.loads(data.raw_data)

View File

@@ -1,5 +1,6 @@
import requests
from loguru import logger
import time
def make_request(url, headers = None, max_retries = 5, break_codes = None):
@@ -64,6 +65,9 @@ def request_until_200(url, headers = None, max_retries = 5, break_codes = None):
while r.status_code not in break_codes and n_retries < 5:
logger.warning(f"Request for url: {url} returned status: {r.status_code} on attempt: {n_retries}/{max_retries}")
n_retries += 1
# back off subsequent requests
time.sleep(n_retries)
r = requests.get(url, headers = headers)
if r.status_code not in break_codes:

97
test.py
View File

@@ -1,7 +1,7 @@
from sqlalchemy import create_engine
from loguru import logger
from cisticola.base import Channel
from cisticola.base import Channel, TransformedResult, ScraperResult
from cisticola.scraper import (
ScraperController,
BitchuteScraper,
@@ -12,6 +12,9 @@ from cisticola.scraper import (
TelegramSnscrapeScraper,
TelegramTelethonScraper,
TwitterScraper)
from cisticola.transformer.base import ETLController
from cisticola.transformer.twitter import TwitterTransformer
from sqlalchemy.orm import sessionmaker
logger.add("../test.log")
@@ -29,87 +32,6 @@ test_channels = [
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=1,
name="South West Ohio Proud Boys (test)",
platform_id=-1001276612436,
category="test",
followers=None,
platform="Telegram",
url="https://t.me/SouthwestOhioPB",
screenname="SouthwestOhioPB",
country="US",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=2,
name="LizardRepublic (test)",
platform_id='lizardrepublic',
category="test",
followers=None,
platform="Gettr",
url="https://www.gettr.com/user/lizardrepublic",
screenname="lizardrepublic",
country="US",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=4,
name="bestonlinejewelrystoresusa@gmail.com (test)", platform_id='bestonlinejewelrystoresusagmailcom',
category="test",
followers=None,
platform="Bitchute",
url="https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/", screenname=None,
country="US",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=5,
name="Mak1n' Bacon (test)",
platform_id='Mak1nBacon',
category="test",
followers=None,
platform="Odysee",
url="https://odysee.com/@Mak1nBacon",
screenname='Mak1nBacon',
country="US",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=6,
name="Capt. Marc Simon (test)",
platform_id='marc_capt',
category="test",
followers=None,
platform="Gab",
url="https://gab.com/marc_capt",
screenname='marc_capt',
country="CA",
influencer=None,
public=True,
chat=False,
notes=""),
Channel(
id=7,
name="we are uploading videos wow products and problem solving products.please share like and subscribe our channelwe are uploading videos wow products and problem solving products.please share like and subscribe our channel", platform_id='c-916305',
category="test",
followers=None,
platform="Rumble",
url="https://rumble.com/c/c-916305",
screenname='we are uploading',
country="CA",
influencer=None,
public=True,
chat=False,
notes="")]
controller = ScraperController()
@@ -126,7 +48,14 @@ scrapers = [
controller.register_scrapers(scrapers)
engine = create_engine('sqlite:///test3.db')
engine = create_engine('sqlite:///test.db')
controller.connect_to_db(engine)
controller.scrape_channels(test_channels, archive_media = False)
controller.scrape_channels(test_channels, archive_media = True)
transformer = TwitterTransformer()
etl_controller = ETLController()
etl_controller.register_transformer(transformer)
etl_controller.connect_to_db(engine)
etl_controller.transform_all_untransformed()