Add TwitterTransformer test

2026-06-11 21:08:34 +03:00 · 2022-03-14 13:33:55 +01:00
parent 3d919316a9
commit fd4b617743
8 changed files with 76 additions and 27 deletions
--- a/cisticola/base.py
+++ b/cisticola/base.py
@@ -157,5 +157,5 @@ media_table = Table('media', mapper_registry.metadata,
                       )

 mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
-mapper_registry.map_imperatively(Image, media_table, polymorphic_on='type', polymorphic_identity='image')
-mapper_registry.map_imperatively(Video, media_table, polymorphic_on='type', polymorphic_identity='video')
+mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
+mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')
--- a/cisticola/scraper/twitter.py
+++ b/cisticola/scraper/twitter.py
@@ -67,9 +67,13 @@ class TwitterScraper(Scraper):
        parsed_url = urlparse(url)
        queries = parse_qs(parsed_url.query)

+        ext = ''
+
        # TODO might require additional statements for other media formats
        if 'jpg' in queries.get('format', []):
            ext = '.jpg'
+        elif 'png' in queries.get('format', []):
+            ext = '.png'
        elif parsed_url.path.endswith('.mp4'):
            ext = ''

--- a/cisticola/transformer/init.py
+++ b/cisticola/transformer/init.py
@@ -1,3 +1,3 @@
-from . import base 
+from .base import ETLController
 from .twitter import TwitterTransformer
 from .bitchute import BitchuteTransformer
--- a/test.py
+++ b/test.py
@@ -12,7 +12,7 @@ from cisticola.scraper import (
    TelegramSnscrapeScraper,
    TelegramTelethonScraper,
    TwitterScraper)
-from cisticola.transformer.base import ETLController
+from cisticola.transformer import ETLController
 from cisticola.transformer.twitter import TwitterTransformer
 from sqlalchemy.orm import sessionmaker

@@ -21,13 +21,13 @@ logger.add("../test.log")
 test_channels = [
    Channel(
        id=0, 
-        name="Logan Williams (test)", 
-        platform_id=891729132,
+        name="L Weber (test)", 
+        platform_id=1424979017749442595,
        category="test", 
        followers=None, 
        platform="Twitter",
-        url="https://twitter.com/obtusatum", 
-        screenname="obtusatum", 
+        url="https://twitter.com/LWeber33662141", 
+        screenname="LWeber33662141", 
        country="US",
        influencer=None, 
        public=True, 
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -3,6 +3,7 @@ import pytest
 from sqlalchemy import create_engine

 from cisticola.scraper import ScraperController
+from cisticola.transformer import ETLController

 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

@@ -98,13 +99,13 @@ TELEGRAM_CHANNEL_KWARGS = {

 TWITTER_CHANNEL_KWARGS = {
    'id': 5,
-    'name': 'Logan Williams (test)',
-    'platform_id': 891729132,
+    'name': 'L Weber (test)',
+    'platform_id': 1424979017749442595,
    'category': 'test',
    'followers': None,
    'platform': 'Twitter',
-    'url': 'https://twitter.com/obtusatum',
-    'screenname': 'obtusatum',
+    'url': 'https://twitter.com/LWeber33662141',
+    'screenname': 'LWeber33662141',
    'country': 'US',
    'influencer': None,
    'public': True,
@@ -113,35 +114,49 @@ TWITTER_CHANNEL_KWARGS = {

 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

-@pytest.fixture(scope='package')
-def controller(tmpdir_factory):

-    """Initialize ScraperController and SQLite database file to be used for all 
-    tests in the package. 
-    """
-    
+@pytest.fixture(scope='package')
+def engine(tmpdir_factory):
+    """Initialize a SQLite database and SQLAlchemy engine to be used for all
+    tests in the package"""
+
    file = tmpdir_factory.mktemp('test_data').join('test.db')
    engine = create_engine(f'sqlite:///{file}')
    
+    return engine
+
+
+@pytest.fixture(scope='package')
+def controller(engine):
+    """Initialize ScraperController to be used for all tests in the package."""
+
    scraper_controller = ScraperController()
    scraper_controller.connect_to_db(engine)

    return scraper_controller

 @pytest.fixture(scope='package')
-def channel_kwargs():
+def etl_controller(engine):
+    """Initialize ETLController to be used for all tests in the package."""

+    etl_controller = ETLController()
+    etl_controller.connect_to_db(engine)
+
+    return etl_controller
+
+@pytest.fixture(scope='package')
+def channel_kwargs():
    """Define keyword arguments to use for defining test channels for each 
    platform to be scraped.
    """

    return {
-        'bitchute' : BITCHUTE_CHANNEL_KWARGS,
-        'gab' : GAB_CHANNEL_KWARGS,
-        'gettr' : GETTR_CHANNEL_KWARGS,
-        'odysee' : ODYSEE_CHANNEL_KWARGS,
-        'rumble' : RUMBLE_CHANNEL_KWARGS,
-        'telegram' : TELEGRAM_CHANNEL_KWARGS,
-        'twitter' : TWITTER_CHANNEL_KWARGS}
+        'bitchute': BITCHUTE_CHANNEL_KWARGS,
+        'gab': GAB_CHANNEL_KWARGS,
+        'gettr': GETTR_CHANNEL_KWARGS,
+        'odysee': ODYSEE_CHANNEL_KWARGS,
+        'rumble': RUMBLE_CHANNEL_KWARGS,
+        'telegram': TELEGRAM_CHANNEL_KWARGS,
+        'twitter': TWITTER_CHANNEL_KWARGS}

-#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
--- a/tests/scraper/init.py
+++ b/tests/scraper/init.py
--- a/tests/transformer/init.py
+++ b/tests/transformer/init.py
--- a/tests/transformer/twitter.py
+++ b/tests/transformer/twitter.py
@@ -0,0 +1,30 @@
+from sqlalchemy.orm import sessionmaker, with_polymorphic
+import json
+
+from cisticola.base import Channel
+from cisticola.scraper import TwitterScraper
+from cisticola.transformer import TwitterTransformer
+from cisticola.base import TransformedResult, Media
+
+def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
+    controller.reset_db()
+    
+    channels = [Channel(**channel_kwargs['twitter'])]
+    controller.register_scraper(scraper = TwitterScraper())
+    controller.scrape_channels(channels = channels, archive_media = True)
+
+    etl_controller.register_transformer(TwitterTransformer())
+    etl_controller.transform_all_untransformed()
+
+    sessionfactory = sessionmaker()
+    sessionfactory.configure(bind=engine)
+    session = sessionfactory()
+
+    posts = session.query(TransformedResult).all()
+    media = session.query(Media).all()
+
+    assert len(posts) == 3
+    assert len(media) == 2
+
+    assert posts[-1].content == "This is a test. https://t.co/rzTFL9uFi6"
+    assert json.loads(media[-1].exif)['Composite:ImageSize'] == "826 728"