From fa5037d67c9fa2c11e940c5241746cfc788a4400 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Thu, 10 Mar 2022 15:34:24 +0100 Subject: [PATCH 1/5] Implement transformer for TwitterScraper that handles media; implement image OCR and EXIF extraction --- Pipfile | 2 + cisticola/base.py | 87 +++++++++++++++++++++++++--- cisticola/scraper/__init__.py | 2 +- cisticola/scraper/base.py | 9 +-- cisticola/transformer/base.py | 75 +++++++++++++++++++++++- cisticola/transformer/twitter.py | 40 ++++++++++++- cisticola/{scraper => }/utils.py | 4 ++ test.py | 97 +++++--------------------------- 8 files changed, 214 insertions(+), 102 deletions(-) rename cisticola/{scraper => }/utils.py (96%) diff --git a/Pipfile b/Pipfile index 62f2c74..5f86225 100644 --- a/Pipfile +++ b/Pipfile @@ -18,6 +18,8 @@ polyphemus = {git = "https://github.com/bellingcat/polyphemus.git"} garc = "*" youtube-dl = "*" telethon = "*" +pytesseract = "*" +pyexiftool = {git = "https://github.com/smarnach/pyexiftool.git"} [dev-packages] pytest = "*" diff --git a/cisticola/base.py b/cisticola/base.py index 97a18df..94065a2 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -1,9 +1,16 @@ from typing import List from dataclasses import dataclass from datetime import datetime - from sqlalchemy.orm import registry from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey +import pytesseract +import PIL +import io +import exiftool +import json +import os + +from .utils import make_request mapper_registry = registry() @@ -60,13 +67,14 @@ class TransformedResult: scraper: str transformer: str platform: str - channel: str + channel: int date: datetime date_archived: datetime url: str - content: str author_id: str author_username: str + content: str + analysis_table = Table('analysis', mapper_registry.metadata, @@ -76,13 +84,78 @@ analysis_table = Table('analysis', mapper_registry.metadata, Column('scraper', String), Column('transformer', String), Column('platform', String), - 
Column('channel', String), + Column('channel', Integer), Column('date', DateTime), Column('date_archived', DateTime), Column('url', String), - Column('content', String), Column('author_id', String), - Column('author_username', String) + Column('author_username', String), + Column('content', String) ) -mapper_registry.map_imperatively(TransformedResult, analysis_table) \ No newline at end of file +mapper_registry.map_imperatively(TransformedResult, analysis_table) + +@dataclass +class Media: + raw_id: int + post: int + url: str + original_url: str + + exif: str = None + + def get_blob(self): + blob = make_request(self.url) + return blob.content + + def hydrate(self, blob = None): + if blob is None: + blob = self.get_blob() + + self.hydrate_exif(blob) + + def hydrate_exif(self, blob): + f = open('tmp', 'wb') + f.write(blob) + f.close() + + with exiftool.ExifTool() as et: + exif = et.get_metadata('tmp') + self.exif = json.dumps(exif) + + os.remove('tmp') + +@dataclass +class Image(Media): + ocr: str = None + + def hydrate(self, blob=None): + if blob is None: + blob = self.get_blob() + + super().hydrate(blob) + self.hydrate_ocr(blob) + + def hydrate_ocr(self, blob): + image = PIL.Image.open(io.BytesIO(blob)) + self.ocr = pytesseract.image_to_string(image) + +@dataclass +class Video(Media): + pass + +media_table = Table('media', mapper_registry.metadata, + Column('id', Integer, primary_key=True, + autoincrement=True), + Column('type', String), + Column('raw_id', Integer, ForeignKey('raw_data.id')), + Column('post', Integer, ForeignKey('analysis.id')), + Column('url', String), + Column('original_url', String), + Column('exif', String), + Column('ocr', String) + ) + +mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media') +mapper_registry.map_imperatively(Image, media_table, polymorphic_on='type', polymorphic_identity='image') +mapper_registry.map_imperatively(Video, media_table, polymorphic_on='type', 
polymorphic_identity='video') \ No newline at end of file diff --git a/cisticola/scraper/__init__.py b/cisticola/scraper/__init__.py index 2d692e8..4f33931 100644 --- a/cisticola/scraper/__init__.py +++ b/cisticola/scraper/__init__.py @@ -1,4 +1,4 @@ -from .utils import make_request +from cisticola.utils import make_request from .base import Scraper, ScraperController from .bitchute import BitchuteScraper from .gab import GabScraper diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index ea68f70..11016e6 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -10,7 +10,7 @@ import ffmpeg from sqlalchemy.orm import sessionmaker from cisticola.base import Channel, ScraperResult, mapper_registry -from cisticola.scraper import make_request +from cisticola.utils import make_request class Scraper: __version__ = "Scraper 0.0.0" @@ -94,7 +94,6 @@ class ScraperController: def __init__(self): self.scrapers = [] self.session = None - self.mapper_registry = None def register_scraper(self, scraper: Scraper): self.scrapers.append(scraper) @@ -149,9 +148,3 @@ class ScraperController: self.session = sessionmaker() self.session.configure(bind=engine) - -class ETLController: - """This class will transform the raw_data tables into a format more conducive to analysis.""" - - def __init__(self): - pass diff --git a/cisticola/transformer/base.py b/cisticola/transformer/base.py index 8005b4a..e320c4c 100644 --- a/cisticola/transformer/base.py +++ b/cisticola/transformer/base.py @@ -1,4 +1,8 @@ -from cisticola.base import ScraperResult, TransformedResult +from typing import List, Generator +from loguru import logger +from sqlalchemy.orm import sessionmaker + +from cisticola.base import ScraperResult, TransformedResult, Media, mapper_registry class Transformer: """Interface class for transformers""" @@ -11,6 +15,75 @@ class Transformer: def can_handle(data: ScraperResult) -> bool: pass + def transform_media(self, data: ScraperResult, transformed: 
TransformedResult) -> Generator[Media, None, None]: + pass + def transform(data: ScraperResult) -> TransformedResult: pass + +class ETLController: + """This class will transform the raw_data tables into a format more conducive to analysis.""" + + def __init__(self): + self.transformers = [] + + def register_transformer(self, transformer: Transformer): + self.transformers.append(transformer) + + def connect_to_db(self, engine): + # create tables + mapper_registry.metadata.create_all(bind=engine) + + self.session = sessionmaker() + self.session.configure(bind=engine) + + @logger.catch + def transform_results(self, results: List[ScraperResult], hydrate: bool = True): + if self.session is None: + logger.error("No DB session") + return + + for result in results: + for transformer in self.transformers: + handled = False + + if transformer.can_handle(result): + logger.info(f"{transformer} is handling result {result}") + handled = True + session = self.session() + + transformed = transformer.transform(result) + + session.add(transformed) + session.flush() + + media = transformer.transform_media(result, transformed) + + count = 0 + for obj in media: + if hydrate: + logger.info(f"Hydrating {obj}") + obj.hydrate() + + session.add(obj) + count += 1 + + session.commit() + logger.info(f"{transformer} generated {count} media objects") + break + + if handled == False: + logger.warning(f"No Transformer could handle {result}") + + @logger.catch + def transform_all_untransformed(self, hydrate: bool = True): + if self.session is None: + logger.error("No DB session") + return + + session = self.session() + untransformed = session.query(ScraperResult).join(TransformedResult, isouter=True).where(TransformedResult.raw_id == None).all() + logger.info(f"Found {len(untransformed)} items to ETL") + + self.transform_results(untransformed, hydrate=hydrate) \ No newline at end of file diff --git a/cisticola/transformer/twitter.py b/cisticola/transformer/twitter.py index 866a9fb..6c0838c 100644 
--- a/cisticola/transformer/twitter.py +++ b/cisticola/transformer/twitter.py @@ -1,13 +1,51 @@ import json +from loguru import logger +from typing import Generator from cisticola.transformer.base import Transformer -from cisticola.base import ScraperResult, TransformedResult +from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media class TwitterTransformer(Transformer): """A Twitter specific ScraperResult, with a method ETL/transforming""" __version__ = "TwitterTransformer 0.0.1" + def can_handle(self, data: ScraperResult) -> bool: + scraper = data.scraper.split(' ') + if scraper[0] == "TwitterScraper": + return True + + return False + + def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]: + raw = json.loads(data.raw_data) + + if raw['media']: + for media in raw['media']: + orig = None + + if media["_type"] == "snscrape.modules.twitter.Photo": + orig = media["fullUrl"] + elif media["_type"] == "snscrape.modules.twitter.Gif": + orig = media["variants"][0]["url"] + elif media["_type"] == "snscrape.modules.twitter.Video": + variant = max([v for v in media["variants"] if v["bitrate"]], key=lambda v: v["bitrate"]) + orig = variant["url"] + + if orig is None: + logger.warning(f"No media URL found for {media}") + elif orig not in data.archived_urls: + logger.info("Media discovered but not archived") + else: + new = data.archived_urls[orig] + + if media["_type"] == "snscrape.modules.twitter.Photo": + m = Image(url=new, post=transformed.id, raw_id=data.id, original_url=orig) + else: + m = Video(url=new, post=transformed.id, raw_id=data.id, original_url=orig) + + yield m + def transform(self, data: ScraperResult) -> TransformedResult: raw = json.loads(data.raw_data) diff --git a/cisticola/scraper/utils.py b/cisticola/utils.py similarity index 96% rename from cisticola/scraper/utils.py rename to cisticola/utils.py index 079bd1f..385884f 100644 --- a/cisticola/scraper/utils.py +++ 
b/cisticola/utils.py @@ -1,5 +1,6 @@ import requests from loguru import logger +import time def make_request(url, headers = None, max_retries = 5, break_codes = None): @@ -64,6 +65,9 @@ def request_until_200(url, headers = None, max_retries = 5, break_codes = None): while r.status_code not in break_codes and n_retries < 5: logger.warning(f"Request for url: {url} returned status: {r.status_code} on attempt: {n_retries}/{max_retries}") n_retries += 1 + + # back off subsequent requests + time.sleep(n_retries) r = requests.get(url, headers = headers) if r.status_code not in break_codes: diff --git a/test.py b/test.py index 8a2d624..b8b5f67 100644 --- a/test.py +++ b/test.py @@ -1,7 +1,7 @@ from sqlalchemy import create_engine from loguru import logger -from cisticola.base import Channel +from cisticola.base import Channel, TransformedResult, ScraperResult from cisticola.scraper import ( ScraperController, BitchuteScraper, @@ -12,6 +12,9 @@ from cisticola.scraper import ( TelegramSnscrapeScraper, TelegramTelethonScraper, TwitterScraper) +from cisticola.transformer.base import ETLController +from cisticola.transformer.twitter import TwitterTransformer +from sqlalchemy.orm import sessionmaker logger.add("../test.log") @@ -29,87 +32,6 @@ test_channels = [ influencer=None, public=True, chat=False, - notes=""), - Channel( - id=1, - name="South West Ohio Proud Boys (test)", - platform_id=-1001276612436, - category="test", - followers=None, - platform="Telegram", - url="https://t.me/SouthwestOhioPB", - screenname="SouthwestOhioPB", - country="US", - influencer=None, - public=True, - chat=False, - notes=""), - Channel( - id=2, - name="LizardRepublic (test)", - platform_id='lizardrepublic', - category="test", - followers=None, - platform="Gettr", - url="https://www.gettr.com/user/lizardrepublic", - screenname="lizardrepublic", - country="US", - influencer=None, - public=True, - chat=False, - notes=""), - Channel( - id=4, - name="bestonlinejewelrystoresusa@gmail.com (test)", 
platform_id='bestonlinejewelrystoresusagmailcom', - category="test", - followers=None, - platform="Bitchute", - url="https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/", screenname=None, - country="US", - influencer=None, - public=True, - chat=False, - notes=""), - Channel( - id=5, - name="Mak1n' Bacon (test)", - platform_id='Mak1nBacon', - category="test", - followers=None, - platform="Odysee", - url="https://odysee.com/@Mak1nBacon", - screenname='Mak1nBacon', - country="US", - influencer=None, - public=True, - chat=False, - notes=""), - Channel( - id=6, - name="Capt. Marc Simon (test)", - platform_id='marc_capt', - category="test", - followers=None, - platform="Gab", - url="https://gab.com/marc_capt", - screenname='marc_capt', - country="CA", - influencer=None, - public=True, - chat=False, - notes=""), - Channel( - id=7, - name="we are uploading videos wow products and problem solving products.please share like and subscribe our channelwe are uploading videos wow products and problem solving products.please share like and subscribe our channel", platform_id='c-916305', - category="test", - followers=None, - platform="Rumble", - url="https://rumble.com/c/c-916305", - screenname='we are uploading', - country="CA", - influencer=None, - public=True, - chat=False, notes="")] controller = ScraperController() @@ -126,7 +48,14 @@ scrapers = [ controller.register_scrapers(scrapers) -engine = create_engine('sqlite:///test3.db') +engine = create_engine('sqlite:///test.db') controller.connect_to_db(engine) -controller.scrape_channels(test_channels, archive_media = False) \ No newline at end of file +controller.scrape_channels(test_channels, archive_media = True) + +transformer = TwitterTransformer() + +etl_controller = ETLController() +etl_controller.register_transformer(transformer) +etl_controller.connect_to_db(engine) +etl_controller.transform_all_untransformed() From 5783206ad87f20c1ed09da77bac0c0081ba9a579 Mon Sep 17 00:00:00 2001 From: Tristan Lee 
Date: Thu, 10 Mar 2022 10:20:49 -0600 Subject: [PATCH 2/5] implemented method to reset database, to enable the 'controller' fixture scope to be shared across the whole package, which will enable the transformer tests to be run without re-running the scrapers --- Pipfile.lock | 67 ++++++++++++++++++++++++++---- cisticola/scraper/base.py | 9 +++- tests/conftest.py | 2 +- tests/scraper/bitchute.py | 2 + tests/scraper/gab.py | 2 + tests/scraper/gettr.py | 2 + tests/scraper/odysee.py | 2 + tests/scraper/rumble.py | 2 + tests/scraper/telegram_snscrape.py | 2 + tests/scraper/telegram_telethon.py | 2 + tests/scraper/twitter.py | 2 + 11 files changed, 85 insertions(+), 9 deletions(-) diff --git a/Pipfile.lock b/Pipfile.lock index 0ca0eda..5a75176 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "3d293e1f3802d64ae7a8fbfc4c1d742cc33cd4c520a6263f93e566f89faa7013" + "sha256": "afacc6dd45c110f235861c54db45f5546fb0095f4e68a1084e85fd0e902db21c" }, "pipfile-spec": 6, "requires": { @@ -49,19 +49,19 @@ }, "boto3": { "hashes": [ - "sha256:30394729b38d5ce2f845440428a55161c6d45478044e553a12ca1acf56d7278a", - "sha256:895489900eb882777124c3b64a13df49785cf77f7bd1504e783464fb3b4c8163" + "sha256:15fa6d1acac422d2d34f7811e02acfc7ac222cea24db3f463d5c52f2f87baa52", + "sha256:c974a7fa781c500b7067441f9883ed939cf8c80bcd74c88b11965b336cabb4b6" ], "index": "pypi", - "version": "==1.21.15" + "version": "==1.21.16" }, "botocore": { "hashes": [ - "sha256:405082f92a9e524e1aee96cbc90134668026d7da3c12f86990c91a12620ca28b", - "sha256:fa4816e94e72111a9341204061e760bcbde74ca5d900d3f2206c2c2e8e4b56e4" + "sha256:0a809efb821d81dc29f2e6c404ed123176b8d2eb43103758f31d89b291af2a8b", + "sha256:dcff7f9b5fea98701d0b520eba99385c538825f10e6d1cab1e7da213293d141e" ], "markers": "python_version >= '3.6'", - "version": "==1.24.15" + "version": "==1.24.16" }, "bs4": { "hashes": [ @@ -436,6 +436,47 @@ "markers": "python_version >= '3.8'", "version": "==1.4.1" }, + "pillow": { + 
"hashes": [ + "sha256:011233e0c42a4a7836498e98c1acf5e744c96a67dd5032a6f666cc1fb97eab97", + "sha256:0f29d831e2151e0b7b39981756d201f7108d3d215896212ffe2e992d06bfe049", + "sha256:12875d118f21cf35604176872447cdb57b07126750a33748bac15e77f90f1f9c", + "sha256:14d4b1341ac07ae07eb2cc682f459bec932a380c3b122f5540432d8977e64eae", + "sha256:1c3c33ac69cf059bbb9d1a71eeaba76781b450bc307e2291f8a4764d779a6b28", + "sha256:1d19397351f73a88904ad1aee421e800fe4bbcd1aeee6435fb62d0a05ccd1030", + "sha256:253e8a302a96df6927310a9d44e6103055e8fb96a6822f8b7f514bb7ef77de56", + "sha256:2632d0f846b7c7600edf53c48f8f9f1e13e62f66a6dbc15191029d950bfed976", + "sha256:335ace1a22325395c4ea88e00ba3dc89ca029bd66bd5a3c382d53e44f0ccd77e", + "sha256:413ce0bbf9fc6278b2d63309dfeefe452835e1c78398efb431bab0672fe9274e", + "sha256:5100b45a4638e3c00e4d2320d3193bdabb2d75e79793af7c3eb139e4f569f16f", + "sha256:514ceac913076feefbeaf89771fd6febde78b0c4c1b23aaeab082c41c694e81b", + "sha256:528a2a692c65dd5cafc130de286030af251d2ee0483a5bf50c9348aefe834e8a", + "sha256:6295f6763749b89c994fcb6d8a7f7ce03c3992e695f89f00b741b4580b199b7e", + "sha256:6c8bc8238a7dfdaf7a75f5ec5a663f4173f8c367e5a39f87e720495e1eed75fa", + "sha256:718856856ba31f14f13ba885ff13874be7fefc53984d2832458f12c38205f7f7", + "sha256:7f7609a718b177bf171ac93cea9fd2ddc0e03e84d8fa4e887bdfc39671d46b00", + "sha256:80ca33961ced9c63358056bd08403ff866512038883e74f3a4bf88ad3eb66838", + "sha256:80fe64a6deb6fcfdf7b8386f2cf216d329be6f2781f7d90304351811fb591360", + "sha256:81c4b81611e3a3cb30e59b0cf05b888c675f97e3adb2c8672c3154047980726b", + "sha256:855c583f268edde09474b081e3ddcd5cf3b20c12f26e0d434e1386cc5d318e7a", + "sha256:9bfdb82cdfeccec50aad441afc332faf8606dfa5e8efd18a6692b5d6e79f00fd", + "sha256:a5d24e1d674dd9d72c66ad3ea9131322819ff86250b30dc5821cbafcfa0b96b4", + "sha256:a9f44cd7e162ac6191491d7249cceb02b8116b0f7e847ee33f739d7cb1ea1f70", + "sha256:b5b3f092fe345c03bca1e0b687dfbb39364b21ebb8ba90e3fa707374b7915204", + 
"sha256:b9618823bd237c0d2575283f2939655f54d51b4527ec3972907a927acbcc5bfc", + "sha256:cef9c85ccbe9bee00909758936ea841ef12035296c748aaceee535969e27d31b", + "sha256:d21237d0cd37acded35154e29aec853e945950321dd2ffd1a7d86fe686814669", + "sha256:d3c5c79ab7dfce6d88f1ba639b77e77a17ea33a01b07b99840d6ed08031cb2a7", + "sha256:d9d7942b624b04b895cb95af03a23407f17646815495ce4547f0e60e0b06f58e", + "sha256:db6d9fac65bd08cea7f3540b899977c6dee9edad959fa4eaf305940d9cbd861c", + "sha256:ede5af4a2702444a832a800b8eb7f0a7a1c0eed55b644642e049c98d589e5092", + "sha256:effb7749713d5317478bb3acb3f81d9d7c7f86726d41c1facca068a04cf5bb4c", + "sha256:f154d173286a5d1863637a7dcd8c3437bb557520b01bddb0be0258dcb72696b5", + "sha256:f25ed6e28ddf50de7e7ea99d7a976d6a9c415f03adcaac9c41ff6ff41b6d86ac" + ], + "markers": "python_version >= '3.7'", + "version": "==9.0.1" + }, "pluggy": { "hashes": [ "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159", @@ -480,6 +521,10 @@ ], "version": "==0.4.8" }, + "pyexiftool": { + "git": "https://github.com/smarnach/pyexiftool.git", + "ref": "3db3764895e687d75b42d3ae4e554ca8664a7f6f" + }, "pygments": { "hashes": [ "sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65", @@ -504,6 +549,14 @@ ], "version": "==1.7.1" }, + "pytesseract": { + "hashes": [ + "sha256:7e2bafc7f48d1bb71443ce4633a56f5e21925a98f220a36c336297edcd1956d0", + "sha256:fecda37d1e4eaf744c657cd03a5daab4eb97c61506ac5550274322c8ae32eca2" + ], + "index": "pypi", + "version": "==0.3.9" + }, "pytest": { "hashes": [ "sha256:9ce3ff477af913ecf6321fe337b93a2c0dcf2a0a1439c43f5452112c1e4280db", diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index 11016e6..f2eae25 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -146,5 +146,12 @@ class ScraperController: mapper_registry.metadata.create_all(bind=engine) self.session = sessionmaker() - self.session.configure(bind=engine) + self.engine = engine + self.session.configure(bind=self.engine) + + 
def reset_db(self): + + mapper_registry.metadata.drop_all(bind=self.engine) + self.connect_to_db(self.engine) + diff --git a/tests/conftest.py b/tests/conftest.py index 0608903..161439d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -113,7 +113,7 @@ TWITTER_CHANNEL_KWARGS = { #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -@pytest.fixture(scope='function') +@pytest.fixture(scope='package') def controller(tmpdir_factory): """Initialize ScraperController and SQLite database file to be used for all diff --git a/tests/scraper/bitchute.py b/tests/scraper/bitchute.py index bc64c4b..c32e840 100644 --- a/tests/scraper/bitchute.py +++ b/tests/scraper/bitchute.py @@ -9,6 +9,8 @@ def test_scrape_bitchute_channel_no_media(controller, channel_kwargs): def test_scrape_bitchute_channel(controller, channel_kwargs): + controller.reset_db() + channels = [Channel(**channel_kwargs['bitchute'])] controller.register_scraper(scraper = BitchuteScraper()) controller.scrape_channels(channels = channels, archive_media = True) diff --git a/tests/scraper/gab.py b/tests/scraper/gab.py index 29fa34a..c864c37 100644 --- a/tests/scraper/gab.py +++ b/tests/scraper/gab.py @@ -8,6 +8,8 @@ def test_scrape_gab_channel_no_media(controller, channel_kwargs): controller.scrape_channels(channels = channels, archive_media = False) def test_scrape_gab_channel(controller, channel_kwargs): + + controller.reset_db() channels = [Channel(**channel_kwargs['gab'])] controller.register_scraper(scraper = GabScraper()) diff --git a/tests/scraper/gettr.py b/tests/scraper/gettr.py index 186b74c..7dd2f24 100644 --- a/tests/scraper/gettr.py +++ b/tests/scraper/gettr.py @@ -9,6 +9,8 @@ def test_scrape_gettr_channel_no_media(controller, channel_kwargs): def test_scrape_gettr_channel(controller, channel_kwargs): + controller.reset_db() + channels = [Channel(**channel_kwargs['gettr'])] controller.register_scraper(scraper = GettrScraper()) controller.scrape_channels(channels = 
channels, archive_media = True) diff --git a/tests/scraper/odysee.py b/tests/scraper/odysee.py index 8b9f89a..f97700e 100644 --- a/tests/scraper/odysee.py +++ b/tests/scraper/odysee.py @@ -9,6 +9,8 @@ def test_scrape_odysee_channel_no_media(controller, channel_kwargs): def test_scrape_odysee_channel(controller, channel_kwargs): + controller.reset_db() + channels = [Channel(**channel_kwargs['odysee'])] controller.register_scraper(scraper = OdyseeScraper()) controller.scrape_channels(channels = channels, archive_media = True) diff --git a/tests/scraper/rumble.py b/tests/scraper/rumble.py index daf59f6..5f640e5 100644 --- a/tests/scraper/rumble.py +++ b/tests/scraper/rumble.py @@ -9,6 +9,8 @@ def test_scrape_rumble_channel_no_media(controller, channel_kwargs): def test_scrape_rumble_channel(controller, channel_kwargs): + controller.reset_db() + channels = [Channel(**channel_kwargs['rumble'])] controller.register_scraper(scraper = RumbleScraper()) controller.scrape_channels(channels = channels, archive_media = True) diff --git a/tests/scraper/telegram_snscrape.py b/tests/scraper/telegram_snscrape.py index af25ed7..3848780 100644 --- a/tests/scraper/telegram_snscrape.py +++ b/tests/scraper/telegram_snscrape.py @@ -9,6 +9,8 @@ def test_scrape_telegram_snscrape_channel_no_media(controller, channel_kwargs): def test_scrape_telegram_snscrape_channel(controller, channel_kwargs): + controller.reset_db() + channels = [Channel(**channel_kwargs['telegram'])] controller.register_scraper(scraper = TelegramSnscrapeScraper()) controller.scrape_channels(channels = channels, archive_media = True) diff --git a/tests/scraper/telegram_telethon.py b/tests/scraper/telegram_telethon.py index 1cfc529..c015631 100644 --- a/tests/scraper/telegram_telethon.py +++ b/tests/scraper/telegram_telethon.py @@ -9,6 +9,8 @@ def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs): def test_scrape_telegram_telethon_channel(controller, channel_kwargs): + controller.reset_db() + 
channels = [Channel(**channel_kwargs['telegram'])] controller.register_scraper(scraper = TelegramTelethonScraper()) controller.scrape_channels(channels = channels, archive_media = True) diff --git a/tests/scraper/twitter.py b/tests/scraper/twitter.py index ef375b7..bd79a6a 100644 --- a/tests/scraper/twitter.py +++ b/tests/scraper/twitter.py @@ -9,6 +9,8 @@ def test_scrape_twitter_channel_no_media(controller, channel_kwargs): def test_scrape_twitter_channel(controller, channel_kwargs): + controller.reset_db() + channels = [Channel(**channel_kwargs['twitter'])] controller.register_scraper(scraper = TwitterScraper()) controller.scrape_channels(channels = channels, archive_media = True) From 3d919316a988c20944c27cc3996a50b7b1b63c43 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Thu, 10 Mar 2022 13:03:01 -0600 Subject: [PATCH 3/5] added Bitchute transformer, minor change to Bitchute scraper to correctly extract author name and id --- cisticola/scraper/bitchute.py | 4 +-- cisticola/transformer/__init__.py | 3 +- cisticola/transformer/bitchute.py | 51 +++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 3 deletions(-) create mode 100644 cisticola/transformer/bitchute.py diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index a5292aa..8a365f4 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -236,8 +236,8 @@ def append_details(video, detail): video["video_url"] = soup.select_one("video#player source").get("src") video["thumbnail_image"] = soup.select_one("video#player").get("poster") video["subject"] = soup.select_one("h1#video-title").text - video["author"] = soup.select_one("div.channel-banner p.name a").text - video["author_id"] = soup.select_one("div.channel-banner p.name a").get("href").split("/")[2] + video["author_id"] = soup.select_one("p.owner a").get("href").split("/")[2] + video["author"] = soup.select_one("div.channel-banner p.name a").get("href").split("/")[2] video["body"] = 
soup.select_one("div#video-description").encode_contents().decode("utf-8").strip() # we need *two more requests* to get the comment count and like/dislike counts diff --git a/cisticola/transformer/__init__.py b/cisticola/transformer/__init__.py index e3a4b49..78cca55 100644 --- a/cisticola/transformer/__init__.py +++ b/cisticola/transformer/__init__.py @@ -1,2 +1,3 @@ from . import base -from .twitter import TwitterTransformer \ No newline at end of file +from .twitter import TwitterTransformer +from .bitchute import BitchuteTransformer \ No newline at end of file diff --git a/cisticola/transformer/bitchute.py b/cisticola/transformer/bitchute.py new file mode 100644 index 0000000..de6f0a7 --- /dev/null +++ b/cisticola/transformer/bitchute.py @@ -0,0 +1,51 @@ +import json +from loguru import logger +from typing import Generator + +from bs4 import BeautifulSoup + +from cisticola.transformer.base import Transformer +from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media + +class BitchuteTransformer(Transformer): + """A Bitchute specific ScraperResult, with a method ETL/transforming""" + + __version__ = "BitchuteTransformer 0.0.1" + + def can_handle(self, data: ScraperResult) -> bool: + scraper = data.scraper.split(' ') + if scraper[0] == "BitchuteScraper": + return True + + return False + + def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]: + raw = json.loads(data.raw_data) + + orig = raw['video_url'] + new = data.archived_urls[orig] + + m = Video(url=new, post=transformed.id, raw_id=data.id, original_url=orig) + + yield m + + def transform(self, data: ScraperResult) -> TransformedResult: + raw = json.loads(data.raw_data) + + soup = BeautifulSoup(raw['body'], features = 'html.parser') + content = soup.find_all('p')[-1].text + + transformed = TransformedResult( + raw_id=data.id, + scraper=data.scraper, + transformer=self.__version__, + platform=data.platform, + channel=data.channel, 
+ date=data.date, + date_archived=data.date_archived, + url=raw['url'], + content=content, + author_id=raw['author_id'], + author_username=raw['author']) + + return transformed From fd4b6177433a55458c4c309a0f2714aef23e1b0c Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Mon, 14 Mar 2022 13:33:55 +0100 Subject: [PATCH 4/5] Add TwitterTransformer test --- cisticola/base.py | 4 +-- cisticola/scraper/twitter.py | 4 +++ cisticola/transformer/__init__.py | 2 +- test.py | 10 +++--- tests/conftest.py | 53 ++++++++++++++++++++----------- tests/scraper/__init__.py | 0 tests/transformer/__init__.py | 0 tests/transformer/twitter.py | 30 +++++++++++++++++ 8 files changed, 76 insertions(+), 27 deletions(-) create mode 100644 tests/scraper/__init__.py create mode 100644 tests/transformer/__init__.py create mode 100644 tests/transformer/twitter.py diff --git a/cisticola/base.py b/cisticola/base.py index 94065a2..262089d 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -157,5 +157,5 @@ media_table = Table('media', mapper_registry.metadata, ) mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media') -mapper_registry.map_imperatively(Image, media_table, polymorphic_on='type', polymorphic_identity='image') -mapper_registry.map_imperatively(Video, media_table, polymorphic_on='type', polymorphic_identity='video') \ No newline at end of file +mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image') +mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video') \ No newline at end of file diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index b528383..c019e27 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -67,9 +67,13 @@ class TwitterScraper(Scraper): parsed_url = urlparse(url) queries = parse_qs(parsed_url.query) + ext = '' + # TODO might require 
additional statements for other media formats if 'jpg' in queries.get('format', []): ext = '.jpg' + elif 'png' in queries.get('format', []): + ext = '.png' elif parsed_url.path.endswith('.mp4'): ext = '' diff --git a/cisticola/transformer/__init__.py b/cisticola/transformer/__init__.py index 78cca55..7812b52 100644 --- a/cisticola/transformer/__init__.py +++ b/cisticola/transformer/__init__.py @@ -1,3 +1,3 @@ -from . import base +from .base import ETLController from .twitter import TwitterTransformer from .bitchute import BitchuteTransformer \ No newline at end of file diff --git a/test.py b/test.py index b8b5f67..ade611c 100644 --- a/test.py +++ b/test.py @@ -12,7 +12,7 @@ from cisticola.scraper import ( TelegramSnscrapeScraper, TelegramTelethonScraper, TwitterScraper) -from cisticola.transformer.base import ETLController +from cisticola.transformer import ETLController from cisticola.transformer.twitter import TwitterTransformer from sqlalchemy.orm import sessionmaker @@ -21,13 +21,13 @@ logger.add("../test.log") test_channels = [ Channel( id=0, - name="Logan Williams (test)", - platform_id=891729132, + name="L Weber (test)", + platform_id=1424979017749442595, category="test", followers=None, platform="Twitter", - url="https://twitter.com/obtusatum", - screenname="obtusatum", + url="https://twitter.com/LWeber33662141", + screenname="LWeber33662141", country="US", influencer=None, public=True, diff --git a/tests/conftest.py b/tests/conftest.py index 161439d..42548e9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,6 +3,7 @@ import pytest from sqlalchemy import create_engine from cisticola.scraper import ScraperController +from cisticola.transformer import ETLController #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# @@ -98,13 +99,13 @@ TELEGRAM_CHANNEL_KWARGS = { TWITTER_CHANNEL_KWARGS = { 'id': 5, - 'name': 'Logan Williams (test)', - 'platform_id': 891729132, + 'name': 'L Weber (test)', + 'platform_id': 
1424979017749442595, 'category': 'test', 'followers': None, 'platform': 'Twitter', - 'url': 'https://twitter.com/obtusatum', - 'screenname': 'obtusatum', + 'url': 'https://twitter.com/LWeber33662141', + 'screenname': 'LWeber33662141', 'country': 'US', 'influencer': None, 'public': True, @@ -113,35 +114,49 @@ TWITTER_CHANNEL_KWARGS = { #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -@pytest.fixture(scope='package') -def controller(tmpdir_factory): - """Initialize ScraperController and SQLite database file to be used for all - tests in the package. - """ - +@pytest.fixture(scope='package') +def engine(tmpdir_factory): + """Initialize a SQLite database and SQLAlchemy engine to be used for all + tests in the package""" + file = tmpdir_factory.mktemp('test_data').join('test.db') engine = create_engine(f'sqlite:///{file}') + return engine + + +@pytest.fixture(scope='package') +def controller(engine): + """Initialize ScraperController to be used for all tests in the package.""" + scraper_controller = ScraperController() scraper_controller.connect_to_db(engine) return scraper_controller @pytest.fixture(scope='package') -def channel_kwargs(): +def etl_controller(engine): + """Initialize ETLController to be used for all tests in the package.""" + etl_controller = ETLController() + etl_controller.connect_to_db(engine) + + return etl_controller + +@pytest.fixture(scope='package') +def channel_kwargs(): """Define keyword arguments to use for defining test channels for each platform to be scraped. 
""" return { - 'bitchute' : BITCHUTE_CHANNEL_KWARGS, - 'gab' : GAB_CHANNEL_KWARGS, - 'gettr' : GETTR_CHANNEL_KWARGS, - 'odysee' : ODYSEE_CHANNEL_KWARGS, - 'rumble' : RUMBLE_CHANNEL_KWARGS, - 'telegram' : TELEGRAM_CHANNEL_KWARGS, - 'twitter' : TWITTER_CHANNEL_KWARGS} + 'bitchute': BITCHUTE_CHANNEL_KWARGS, + 'gab': GAB_CHANNEL_KWARGS, + 'gettr': GETTR_CHANNEL_KWARGS, + 'odysee': ODYSEE_CHANNEL_KWARGS, + 'rumble': RUMBLE_CHANNEL_KWARGS, + 'telegram': TELEGRAM_CHANNEL_KWARGS, + 'twitter': TWITTER_CHANNEL_KWARGS} -#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# diff --git a/tests/scraper/__init__.py b/tests/scraper/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/transformer/__init__.py b/tests/transformer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/transformer/twitter.py b/tests/transformer/twitter.py new file mode 100644 index 0000000..4ddeb68 --- /dev/null +++ b/tests/transformer/twitter.py @@ -0,0 +1,30 @@ +from sqlalchemy.orm import sessionmaker, with_polymorphic +import json + +from cisticola.base import Channel +from cisticola.scraper import TwitterScraper +from cisticola.transformer import TwitterTransformer +from cisticola.base import TransformedResult, Media + +def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs): + controller.reset_db() + + channels = [Channel(**channel_kwargs['twitter'])] + controller.register_scraper(scraper = TwitterScraper()) + controller.scrape_channels(channels = channels, archive_media = True) + + etl_controller.register_transformer(TwitterTransformer()) + etl_controller.transform_all_untransformed() + + sessionfactory = sessionmaker() + sessionfactory.configure(bind=engine) + session = sessionfactory() + + posts = session.query(TransformedResult).all() + media = session.query(Media).all() + + assert len(posts) == 
3 + assert len(media) == 2 + + assert posts[-1].content == "This is a test. https://t.co/rzTFL9uFi6" + assert json.loads(media[-1].exif)['Composite:ImageSize'] == "826 728" \ No newline at end of file From fe0d762df0dcc8f96207b8cc1fd51261a7f4c5a1 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Mon, 14 Mar 2022 14:02:57 +0100 Subject: [PATCH 5/5] Add Transformer and ETLController docstrings --- cisticola/transformer/base.py | 88 +++++++++++++++++++++++++++++++++-- 1 file changed, 85 insertions(+), 3 deletions(-) diff --git a/cisticola/transformer/base.py b/cisticola/transformer/base.py index e320c4c..ce6bb59 100644 --- a/cisticola/transformer/base.py +++ b/cisticola/transformer/base.py @@ -1,11 +1,12 @@ from typing import List, Generator from loguru import logger from sqlalchemy.orm import sessionmaker +from sqlalchemy.engine.base import Engine from cisticola.base import ScraperResult, TransformedResult, Media, mapper_registry class Transformer: - """Interface class for transformers""" + """Interface class for transformers.""" __version__ = "Transformer 0.0.0" @@ -13,25 +14,87 @@ class Transformer: pass def can_handle(data: ScraperResult) -> bool: + """Specifies whether or not a Transformer is capable of handling a particular + piece of scraped data. + + Parameters + ---------- + data : ScraperResult + The ScraperResult object to check for ability to handle. + + Returns + ------- + bool + True if it can be handled by this Transformer, false otherwise. + """ + pass def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]: + """Yields Media objects from each piece of media present in a raw ScraperResult. + + Parameters + ---------- + data : ScraperResult + The ScraperResult object to process + transformed : TransformedResult + The TransformedResult version of `data`. (E.g. as generated by `Transformer.transform()`) + + Yields + ------ + Media + A media object generated from the ScraperResult. 
One ScraperResult can have multiple pieces + of media contained within it, so this can generate an arbitrary number of Media objects + (or their subclasses). These Media objects are not fully hydrated. + """ + pass def transform(data: ScraperResult) -> TransformedResult: + """Transform a ScraperResult into a TransformedResult object. This extracts additional attributes + that can be used directly for analysis. + + Parameters + ---------- + data : ScraperResult + The ScraperResult object to process. + + Returns + ------- + TransformedResult + A TransformedResult representation of the `data` object. + """ + pass class ETLController: - """This class will transform the raw_data tables into a format more conducive to analysis.""" + """An ETLController will transform raw scraped data (ScraperResult objects) into a more detailed format + for analysis by using Transformer objects that have been registered with the controller. + """ def __init__(self): self.transformers = [] def register_transformer(self, transformer: Transformer): + """Adds a Transformer to the list of available Transformers. + + Parameters + ---------- + transformer : Transformer + The Transformer to register + """ + self.transformers.append(transformer) - def connect_to_db(self, engine): + def connect_to_db(self, engine: Engine): + """Connects the ETLController to a SQLAlchemy engine. + + Parameters + ---------- + engine : Engine + SQLAlchemy Engine object + """ # create tables mapper_registry.metadata.create_all(bind=engine) @@ -40,6 +103,16 @@ class ETLController: @logger.catch def transform_results(self, results: List[ScraperResult], hydrate: bool = True): + """Transforms raw ScraperResult objects into TransformedResult objects and + Media objects. Then, adds them to the database. + + Parameters + ---------- + results : List[ScraperResult] + A list of ScraperResult objects to be transformed + hydrate : bool + Whether or not to fully hydrate transformed media. Default True.
+ """ if self.session is None: logger.error("No DB session") return @@ -78,6 +151,15 @@ class ETLController: @logger.catch def transform_all_untransformed(self, hydrate: bool = True): + """Transform all ScraperResult objects in the database that do not have an + equivalent TransformedResult object stored. + + Parameters + ---------- + hydrate : bool + Whether or not to fully hydrate transformed media. Default True. + """ + if self.session is None: logger.error("No DB session") return