From fa5037d67c9fa2c11e940c5241746cfc788a4400 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Thu, 10 Mar 2022 15:34:24 +0100 Subject: [PATCH] Implement transformer for TwitterScraper that handles media; implement image OCR and EXIF extraction --- Pipfile | 2 + cisticola/base.py | 87 +++++++++++++++++++++++++--- cisticola/scraper/__init__.py | 2 +- cisticola/scraper/base.py | 9 +-- cisticola/transformer/base.py | 75 +++++++++++++++++++++++- cisticola/transformer/twitter.py | 40 ++++++++++++- cisticola/{scraper => }/utils.py | 4 ++ test.py | 97 +++++--------------------------- 8 files changed, 214 insertions(+), 102 deletions(-) rename cisticola/{scraper => }/utils.py (96%) diff --git a/Pipfile b/Pipfile index 62f2c74..5f86225 100644 --- a/Pipfile +++ b/Pipfile @@ -18,6 +18,8 @@ polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"} garc = "*" youtube-dl = "*" telethon = "*" +pytesseract = "*" +pyexiftool = {git = "https://github.com/smarnach/pyexiftool.git"} [dev-packages] pytest = "*" diff --git a/cisticola/base.py b/cisticola/base.py index 97a18df..94065a2 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -1,9 +1,16 @@ from typing import List from dataclasses import dataclass from datetime import datetime - from sqlalchemy.orm import registry from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey +import pytesseract +import PIL +import io +import exiftool +import json +import os + +from .utils import make_request mapper_registry = registry() @@ -60,13 +67,14 @@ class TransformedResult: scraper: str transformer: str platform: str - channel: str + channel: int date: datetime date_archived: datetime url: str - content: str author_id: str author_username: str + content: str + analysis_table = Table('analysis', mapper_registry.metadata, @@ -76,13 +84,78 @@ analysis_table = Table('analysis', mapper_registry.metadata, Column('scraper', String), Column('transformer', String), Column('platform', String), - Column('channel', String), + Column('channel', Integer), Column('date', DateTime), Column('date_archived', DateTime), Column('url', String), - Column('content', String), Column('author_id', String), - Column('author_username', String) + Column('author_username', String), + Column('content', String) ) -mapper_registry.map_imperatively(TransformedResult, analysis_table) \ No newline at end of file +mapper_registry.map_imperatively(TransformedResult, analysis_table) + +@dataclass +class Media: + raw_id: int + post: int + url: str + original_url: str + + exif: str = None + + def get_blob(self): + blob = make_request(self.url) + return blob.content + + def hydrate(self, blob = None): + if blob is None: + blob = self.get_blob() + + self.hydrate_exif(blob) + + def hydrate_exif(self, blob): + f = open('tmp', 'wb') + f.write(blob) + f.close() + + with exiftool.ExifTool() as et: + exif = et.get_metadata('tmp') + self.exif = json.dumps(exif) + + os.remove('tmp') + +@dataclass +class Image(Media): + ocr: str = None + + def hydrate(self, blob=None): + if blob is None: + blob = self.get_blob() + + super().hydrate(blob) + self.hydrate_ocr(blob) + + def hydrate_ocr(self, blob): + image = PIL.Image.open(io.BytesIO(blob)) + self.ocr = pytesseract.image_to_string(image) + +@dataclass +class Video(Media): + pass + +media_table = Table('media', mapper_registry.metadata, + Column('id', Integer, primary_key=True, + autoincrement=True), + Column('type', String), + Column('raw_id', Integer, ForeignKey('raw_data.id')), + Column('post', Integer, ForeignKey('analysis.id')), + Column('url', String), + Column('original_url', String), + Column('exif', String), + Column('ocr', String) + ) + +mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media') +mapper_registry.map_imperatively(Image, media_table, polymorphic_on='type', polymorphic_identity='image') +mapper_registry.map_imperatively(Video, media_table, polymorphic_on='type', polymorphic_identity='video') \ No newline at end of file diff --git a/cisticola/scraper/__init__.py b/cisticola/scraper/__init__.py index 2d692e8..4f33931 100644 --- a/cisticola/scraper/__init__.py +++ b/cisticola/scraper/__init__.py @@ -1,4 +1,4 @@ -from .utils import make_request +from cisticola.utils import make_request from .base import Scraper, ScraperController from .bitchute import BitchuteScraper from .gab import GabScraper diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index ea68f70..11016e6 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -10,7 +10,7 @@ import ffmpeg from sqlalchemy.orm import sessionmaker from cisticola.base import Channel, ScraperResult, mapper_registry -from cisticola.scraper import make_request +from cisticola.utils import make_request class Scraper: __version__ = "Scraper 0.0.0" @@ -94,7 +94,6 @@ class ScraperController: def __init__(self): self.scrapers = [] self.session = None - self.mapper_registry = None def register_scraper(self, scraper: Scraper): self.scrapers.append(scraper) @@ -149,9 +148,3 @@ class ScraperController: self.session = sessionmaker() self.session.configure(bind=engine) - -class ETLController: - """This class will transform the raw_data tables into a format more conducive to analysis.""" - - def __init__(self): - pass diff --git a/cisticola/transformer/base.py b/cisticola/transformer/base.py index 8005b4a..e320c4c 100644 --- a/cisticola/transformer/base.py +++ b/cisticola/transformer/base.py @@ -1,4 +1,8 @@ -from cisticola.base import ScraperResult, TransformedResult +from typing import List, Generator +from loguru import logger +from sqlalchemy.orm import sessionmaker + +from cisticola.base import ScraperResult, TransformedResult, Media, mapper_registry class Transformer: """Interface class for transformers""" @@ -11,6 +15,75 @@ class Transformer: def can_handle(data: ScraperResult) -> bool: pass + def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]: + pass + def transform(data: ScraperResult) -> TransformedResult: pass + +class ETLController: + """This class will transform the raw_data tables into a format more conducive to analysis.""" + + def __init__(self): + self.transformers = [] + + def register_transformer(self, transformer: Transformer): + self.transformers.append(transformer) + + def connect_to_db(self, engine): + # create tables + mapper_registry.metadata.create_all(bind=engine) + + self.session = sessionmaker() + self.session.configure(bind=engine) + + @logger.catch + def transform_results(self, results: List[ScraperResult], hydrate: bool = True): + if self.session is None: + logger.error("No DB session") + return + + for result in results: + for transformer in self.transformers: + handled = False + + if transformer.can_handle(result): + logger.info(f"{transformer} is handling result {result}") + handled = True + session = self.session() + + transformed = transformer.transform(result) + + session.add(transformed) + session.flush() + + media = transformer.transform_media(result, transformed) + + count = 0 + for obj in media: + if hydrate: + logger.info(f"Hydrating {obj}") + obj.hydrate() + + session.add(obj) + count += 1 + + session.commit() + logger.info(f"{transformer} generated {count} media objects") + break + + if handled == False: + logger.warning(f"No Transformer could handle {result}") + + @logger.catch + def transform_all_untransformed(self, hydrate: bool = True): + if self.session is None: + logger.error("No DB session") + return + + session = self.session() + untransformed = session.query(ScraperResult).join(TransformedResult, isouter=True).where(TransformedResult.raw_id == None).all() + logger.info(f"Found {len(untransformed)} items to ETL") + + self.transform_results(untransformed, hydrate=hydrate) \ No newline at end of file diff --git a/cisticola/transformer/twitter.py b/cisticola/transformer/twitter.py index 866a9fb..6c0838c 100644 --- a/cisticola/transformer/twitter.py +++ b/cisticola/transformer/twitter.py @@ -1,13 +1,51 @@ import json +from loguru import logger +from typing import Generator from cisticola.transformer.base import Transformer -from cisticola.base import ScraperResult, TransformedResult +from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media class TwitterTransformer(Transformer): """A Twitter specific ScraperResult, with a method ETL/transforming""" __version__ = "TwitterTransformer 0.0.1" + def can_handle(self, data: ScraperResult) -> bool: + scraper = data.scraper.split(' ') + if scraper[0] == "TwitterScraper": + return True + + return False + + def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]: + raw = json.loads(data.raw_data) + + if raw['media']: + for media in raw['media']: + orig = None + + if media["_type"] == "snscrape.modules.twitter.Photo": + orig = media["fullUrl"] + elif media["_type"] == "snscrape.modules.twitter.Gif": + orig = media["variants"][0]["url"] + elif media["_type"] == "snscrape.modules.twitter.Video": + variant = max([v for v in media["variants"] if v["bitrate"]], key=lambda v: v["bitrate"]) + orig = variant["url"] + + if orig is None: + logger.warning(f"No media URL found for {media}") + elif orig not in data.archived_urls: + logger.info("Media discovered but not archived") + else: + new = data.archived_urls[orig] + + if media["_type"] == "snscrape.modules.twitter.Photo": + m = Image(url=new, post=transformed.id, raw_id=data.id, original_url=orig) + else: + m = Video(url=new, post=transformed.id, raw_id=data.id, original_url=orig) + + yield m + def transform(self, data: ScraperResult) -> TransformedResult: raw = json.loads(data.raw_data) diff --git a/cisticola/scraper/utils.py b/cisticola/utils.py similarity index 96% rename from cisticola/scraper/utils.py rename to cisticola/utils.py index 079bd1f..385884f 100644 --- a/cisticola/scraper/utils.py +++ b/cisticola/utils.py @@ -1,5 +1,6 @@ import requests from loguru import logger +import time def make_request(url, headers = None, max_retries = 5, break_codes = None): @@ -64,6 +65,9 @@ def request_until_200(url, headers = None, max_retries = 5, break_codes = None): while r.status_code not in break_codes and n_retries < 5: logger.warning(f"Request for url: {url} returned status: {r.status_code} on attempt: {n_retries}/{max_retries}") n_retries += 1 + + # back off subsequent requests + time.sleep(n_retries) r = requests.get(url, headers = headers) if r.status_code not in break_codes: diff --git a/test.py b/test.py index 8a2d624..b8b5f67 100644 --- a/test.py +++ b/test.py @@ -1,7 +1,7 @@ from sqlalchemy import create_engine from loguru import logger -from cisticola.base import Channel +from cisticola.base import Channel, TransformedResult, ScraperResult from cisticola.scraper import ( ScraperController, BitchuteScraper, @@ -12,6 +12,9 @@ from cisticola.scraper import ( TelegramSnscrapeScraper, TelegramTelethonScraper, TwitterScraper) +from cisticola.transformer.base import ETLController +from cisticola.transformer.twitter import TwitterTransformer +from sqlalchemy.orm import sessionmaker logger.add("../test.log") @@ -29,87 +32,6 @@ test_channels = [ influencer=None, public=True, chat=False, - notes=""), - Channel( - id=1, - name="South West Ohio Proud Boys (test)", - platform_id=-1001276612436, - category="test", - followers=None, - platform="Telegram", - url="https://t.me/SouthwestOhioPB", - screenname="SouthwestOhioPB", - country="US", - influencer=None, - public=True, - chat=False, - notes=""), - Channel( - id=2, - name="LizardRepublic (test)", - platform_id='lizardrepublic', - category="test", - followers=None, - platform="Gettr", - url="https://www.gettr.com/user/lizardrepublic", - screenname="lizardrepublic", - country="US", - influencer=None, - public=True, - chat=False, - notes=""), - Channel( - id=4, - name="bestonlinejewelrystoresusa@gmail.com (test)", platform_id='bestonlinejewelrystoresusagmailcom', - category="test", - followers=None, - platform="Bitchute", - url="https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/", screenname=None, - country="US", - influencer=None, - public=True, - chat=False, - notes=""), - Channel( - id=5, - name="Mak1n' Bacon (test)", - platform_id='Mak1nBacon', - category="test", - followers=None, - platform="Odysee", - url="https://odysee.com/@Mak1nBacon", - screenname='Mak1nBacon', - country="US", - influencer=None, - public=True, - chat=False, - notes=""), - Channel( - id=6, - name="Capt. Marc Simon (test)", - platform_id='marc_capt', - category="test", - followers=None, - platform="Gab", - url="https://gab.com/marc_capt", - screenname='marc_capt', - country="CA", - influencer=None, - public=True, - chat=False, - notes=""), - Channel( - id=7, - name="we are uploading videos wow products and problem solving products.please share like and subscribe our channelwe are uploading videos wow products and problem solving products.please share like and subscribe our channel", platform_id='c-916305', - category="test", - followers=None, - platform="Rumble", - url="https://rumble.com/c/c-916305", - screenname='we are uploading', - country="CA", - influencer=None, - public=True, - chat=False, notes="")] controller = ScraperController() @@ -126,7 +48,14 @@ scrapers = [ controller.register_scrapers(scrapers) -engine = create_engine('sqlite:///test3.db') +engine = create_engine('sqlite:///test.db') controller.connect_to_db(engine) -controller.scrape_channels(test_channels, archive_media = False) \ No newline at end of file +controller.scrape_channels(test_channels, archive_media = True) + +transformer = TwitterTransformer() + +etl_controller = ETLController() +etl_controller.register_transformer(transformer) +etl_controller.connect_to_db(engine) +etl_controller.transform_all_untransformed()