From fd4b6177433a55458c4c309a0f2714aef23e1b0c Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Mon, 14 Mar 2022 13:33:55 +0100 Subject: [PATCH] Add TwitterTransformer test --- cisticola/base.py | 4 +-- cisticola/scraper/twitter.py | 4 +++ cisticola/transformer/__init__.py | 2 +- test.py | 10 +++--- tests/conftest.py | 53 ++++++++++++++++++++----------- tests/scraper/__init__.py | 0 tests/transformer/__init__.py | 0 tests/transformer/twitter.py | 30 +++++++++++++++++ 8 files changed, 76 insertions(+), 27 deletions(-) create mode 100644 tests/scraper/__init__.py create mode 100644 tests/transformer/__init__.py create mode 100644 tests/transformer/twitter.py diff --git a/cisticola/base.py b/cisticola/base.py index 94065a2..262089d 100644 --- a/cisticola/base.py +++ b/cisticola/base.py @@ -157,5 +157,5 @@ media_table = Table('media', mapper_registry.metadata, ) mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media') -mapper_registry.map_imperatively(Image, media_table, polymorphic_on='type', polymorphic_identity='image') -mapper_registry.map_imperatively(Video, media_table, polymorphic_on='type', polymorphic_identity='video') \ No newline at end of file +mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image') +mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video') \ No newline at end of file diff --git a/cisticola/scraper/twitter.py b/cisticola/scraper/twitter.py index b528383..c019e27 100644 --- a/cisticola/scraper/twitter.py +++ b/cisticola/scraper/twitter.py @@ -67,9 +67,13 @@ class TwitterScraper(Scraper): parsed_url = urlparse(url) queries = parse_qs(parsed_url.query) + ext = '' + # TODO might require additional statements for other media formats if 'jpg' in queries.get('format', []): ext = '.jpg' + elif 'png' in queries.get('format', []): + ext = '.png' elif parsed_url.path.endswith('.mp4'): ext = '' diff --git a/cisticola/transformer/__init__.py b/cisticola/transformer/__init__.py index 78cca55..7812b52 100644 --- a/cisticola/transformer/__init__.py +++ b/cisticola/transformer/__init__.py @@ -1,3 +1,3 @@ -from . import base +from .base import ETLController from .twitter import TwitterTransformer from .bitchute import BitchuteTransformer \ No newline at end of file diff --git a/test.py b/test.py index b8b5f67..ade611c 100644 --- a/test.py +++ b/test.py @@ -12,7 +12,7 @@ from cisticola.scraper import ( TelegramSnscrapeScraper, TelegramTelethonScraper, TwitterScraper) -from cisticola.transformer.base import ETLController +from cisticola.transformer import ETLController from cisticola.transformer.twitter import TwitterTransformer from sqlalchemy.orm import sessionmaker @@ -21,13 +21,13 @@ logger.add("../test.log") test_channels = [ Channel( id=0, - name="Logan Williams (test)", - platform_id=891729132, + name="L Weber (test)", + platform_id=1424979017749442595, category="test", followers=None, platform="Twitter", - url="https://twitter.com/obtusatum", - screenname="obtusatum", + url="https://twitter.com/LWeber33662141", + screenname="LWeber33662141", country="US", influencer=None, public=True, diff --git a/tests/conftest.py b/tests/conftest.py index 161439d..42548e9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,6 +3,7 @@ import pytest from sqlalchemy import create_engine from cisticola.scraper import ScraperController +from cisticola.transformer import ETLController #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# @@ -98,13 +99,13 @@ TELEGRAM_CHANNEL_KWARGS = { TWITTER_CHANNEL_KWARGS = { 'id': 5, - 'name': 'Logan Williams (test)', - 'platform_id': 891729132, + 'name': 'L Weber (test)', + 'platform_id': 1424979017749442595, 'category': 'test', 'followers': None, 'platform': 'Twitter', - 'url': 'https://twitter.com/obtusatum', - 'screenname': 'obtusatum', + 'url': 'https://twitter.com/LWeber33662141', + 'screenname': 'LWeber33662141', 'country': 'US', 'influencer': None, 'public': True, @@ -113,35 +114,49 @@ TWITTER_CHANNEL_KWARGS = { #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# -@pytest.fixture(scope='package') -def controller(tmpdir_factory): - """Initialize ScraperController and SQLite database file to be used for all - tests in the package. - """ - +@pytest.fixture(scope='package') +def engine(tmpdir_factory): + """Initialize a SQLite database and SQLAlchemy engine to be used for all + tests in the package""" + file = tmpdir_factory.mktemp('test_data').join('test.db') engine = create_engine(f'sqlite:///{file}') + return engine + + +@pytest.fixture(scope='package') +def controller(engine): + """Initialize ScraperController to be used for all tests in the package.""" + scraper_controller = ScraperController() scraper_controller.connect_to_db(engine) return scraper_controller @pytest.fixture(scope='package') -def channel_kwargs(): +def etl_controller(engine): + """Initialize ETLController to be used for all tests in the package.""" + etl_controller = ETLController() + etl_controller.connect_to_db(engine) + + return etl_controller + +@pytest.fixture(scope='package') +def channel_kwargs(): """Define keyword arguments to use for defining test channels for each platform to be scraped. """ return { - 'bitchute' : BITCHUTE_CHANNEL_KWARGS, - 'gab' : GAB_CHANNEL_KWARGS, - 'gettr' : GETTR_CHANNEL_KWARGS, - 'odysee' : ODYSEE_CHANNEL_KWARGS, - 'rumble' : RUMBLE_CHANNEL_KWARGS, - 'telegram' : TELEGRAM_CHANNEL_KWARGS, - 'twitter' : TWITTER_CHANNEL_KWARGS} + 'bitchute': BITCHUTE_CHANNEL_KWARGS, + 'gab': GAB_CHANNEL_KWARGS, + 'gettr': GETTR_CHANNEL_KWARGS, + 'odysee': ODYSEE_CHANNEL_KWARGS, + 'rumble': RUMBLE_CHANNEL_KWARGS, + 'telegram': TELEGRAM_CHANNEL_KWARGS, + 'twitter': TWITTER_CHANNEL_KWARGS} -#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# diff --git a/tests/scraper/__init__.py b/tests/scraper/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/transformer/__init__.py b/tests/transformer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/transformer/twitter.py b/tests/transformer/twitter.py new file mode 100644 index 0000000..4ddeb68 --- /dev/null +++ b/tests/transformer/twitter.py @@ -0,0 +1,30 @@ +from sqlalchemy.orm import sessionmaker, with_polymorphic +import json + +from cisticola.base import Channel +from cisticola.scraper import TwitterScraper +from cisticola.transformer import TwitterTransformer +from cisticola.base import TransformedResult, Media + +def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs): + controller.reset_db() + + channels = [Channel(**channel_kwargs['twitter'])] + controller.register_scraper(scraper = TwitterScraper()) + controller.scrape_channels(channels = channels, archive_media = True) + + etl_controller.register_transformer(TwitterTransformer()) + etl_controller.transform_all_untransformed() + + sessionfactory = sessionmaker() + sessionfactory.configure(bind=engine) + session = sessionfactory() + + posts = session.query(TransformedResult).all() + media = session.query(Media).all() + + assert len(posts) == 3 + assert len(media) == 2 + + assert posts[-1].content == "This is a test. https://t.co/rzTFL9uFi6" + assert json.loads(media[-1].exif)['Composite:ImageSize'] == "826 728" \ No newline at end of file