Add TwitterTransformer test

This commit is contained in:
Logan Williams
2022-03-14 13:33:55 +01:00
committed by Logan Williams
parent 3d919316a9
commit fd4b617743
8 changed files with 76 additions and 27 deletions

View File

@@ -157,5 +157,5 @@ media_table = Table('media', mapper_registry.metadata,
)
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
mapper_registry.map_imperatively(Image, media_table, polymorphic_on='type', polymorphic_identity='image')
mapper_registry.map_imperatively(Video, media_table, polymorphic_on='type', polymorphic_identity='video')
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')

View File

@@ -67,9 +67,13 @@ class TwitterScraper(Scraper):
parsed_url = urlparse(url)
queries = parse_qs(parsed_url.query)
ext = ''
# TODO might require additional statements for other media formats
if 'jpg' in queries.get('format', []):
ext = '.jpg'
elif 'png' in queries.get('format', []):
ext = '.png'
elif parsed_url.path.endswith('.mp4'):
ext = ''

View File

@@ -1,3 +1,3 @@
from . import base
from .base import ETLController
from .twitter import TwitterTransformer
from .bitchute import BitchuteTransformer

10
test.py
View File

@@ -12,7 +12,7 @@ from cisticola.scraper import (
TelegramSnscrapeScraper,
TelegramTelethonScraper,
TwitterScraper)
from cisticola.transformer.base import ETLController
from cisticola.transformer import ETLController
from cisticola.transformer.twitter import TwitterTransformer
from sqlalchemy.orm import sessionmaker
@@ -21,13 +21,13 @@ logger.add("../test.log")
test_channels = [
Channel(
id=0,
name="Logan Williams (test)",
platform_id=891729132,
name="L Weber (test)",
platform_id=1424979017749442595,
category="test",
followers=None,
platform="Twitter",
url="https://twitter.com/obtusatum",
screenname="obtusatum",
url="https://twitter.com/LWeber33662141",
screenname="LWeber33662141",
country="US",
influencer=None,
public=True,

View File

@@ -3,6 +3,7 @@ import pytest
from sqlalchemy import create_engine
from cisticola.scraper import ScraperController
from cisticola.transformer import ETLController
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
@@ -98,13 +99,13 @@ TELEGRAM_CHANNEL_KWARGS = {
TWITTER_CHANNEL_KWARGS = {
'id': 5,
'name': 'Logan Williams (test)',
'platform_id': 891729132,
'name': 'L Weber (test)',
'platform_id': 1424979017749442595,
'category': 'test',
'followers': None,
'platform': 'Twitter',
'url': 'https://twitter.com/obtusatum',
'screenname': 'obtusatum',
'url': 'https://twitter.com/LWeber33662141',
'screenname': 'LWeber33662141',
'country': 'US',
'influencer': None,
'public': True,
@@ -113,35 +114,49 @@ TWITTER_CHANNEL_KWARGS = {
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
@pytest.fixture(scope='package')
def controller(tmpdir_factory):
"""Initialize ScraperController and SQLite database file to be used for all
tests in the package.
"""
@pytest.fixture(scope='package')
def engine(tmpdir_factory):
    """Provide a SQLAlchemy engine backed by a temporary SQLite file, shared
    by all tests in the package.

    The database file ``test.db`` is placed in a ``test_data`` temporary
    directory obtained from ``tmpdir_factory``.
    """
    db_file = tmpdir_factory.mktemp('test_data').join('test.db')
    return create_engine(f'sqlite:///{db_file}')
@pytest.fixture(scope='package')
def controller(engine):
    """Provide a ScraperController connected to the shared test database
    engine, reused by all tests in the package.
    """
    scraper = ScraperController()
    scraper.connect_to_db(engine)
    return scraper
@pytest.fixture(scope='package')
def channel_kwargs():
def etl_controller(engine):
    """Build an ETLController and connect it to the shared test database
    engine.
    """
    etl = ETLController()
    etl.connect_to_db(engine)
    return etl
@pytest.fixture(scope='package')
def channel_kwargs():
"""Define keyword arguments to use for defining test channels for each
platform to be scraped.
"""
return {
'bitchute' : BITCHUTE_CHANNEL_KWARGS,
'gab' : GAB_CHANNEL_KWARGS,
'gettr' : GETTR_CHANNEL_KWARGS,
'odysee' : ODYSEE_CHANNEL_KWARGS,
'rumble' : RUMBLE_CHANNEL_KWARGS,
'telegram' : TELEGRAM_CHANNEL_KWARGS,
'twitter' : TWITTER_CHANNEL_KWARGS}
'bitchute': BITCHUTE_CHANNEL_KWARGS,
'gab': GAB_CHANNEL_KWARGS,
'gettr': GETTR_CHANNEL_KWARGS,
'odysee': ODYSEE_CHANNEL_KWARGS,
'rumble': RUMBLE_CHANNEL_KWARGS,
'telegram': TELEGRAM_CHANNEL_KWARGS,
'twitter': TWITTER_CHANNEL_KWARGS}
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

View File

View File

@@ -0,0 +1,30 @@
from sqlalchemy.orm import sessionmaker, with_polymorphic
import json
from cisticola.base import Channel
from cisticola.scraper import TwitterScraper
from cisticola.transformer import TwitterTransformer
from cisticola.base import TransformedResult, Media
def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
    """Scrape the Twitter test channel, run the Twitter transformer, and check
    the transformed posts and archived media stored in the database.

    Args:
        engine: SQLAlchemy engine used to open a session for verification.
        controller: scraper controller; its database is reset before scraping.
        etl_controller: ETL controller that runs the registered transformer.
        channel_kwargs: mapping of platform name to Channel keyword arguments;
            only the ``'twitter'`` entry is used here.
    """
    controller.reset_db()

    channels = [Channel(**channel_kwargs['twitter'])]
    controller.register_scraper(scraper=TwitterScraper())
    controller.scrape_channels(channels=channels, archive_media=True)

    etl_controller.register_transformer(TwitterTransformer())
    etl_controller.transform_all_untransformed()

    # Open the session as a context manager so it is always closed, even when
    # an assertion below fails; otherwise the connection leaks on the shared
    # engine.
    with sessionmaker(bind=engine)() as session:
        posts = session.query(TransformedResult).all()
        media = session.query(Media).all()

        assert len(posts) == 3
        assert len(media) == 2

        # NOTE(review): neither query specifies an ORDER BY, so the [-1]
        # lookups rely on the database's default row order -- confirm this is
        # stable.
        assert posts[-1].content == "This is a test. https://t.co/rzTFL9uFi6"
        assert json.loads(media[-1].exif)['Composite:ImageSize'] == "826 728"