mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-11 21:08:34 +03:00
Add TwitterTransformer test
This commit is contained in:
committed by
Logan Williams
parent
3d919316a9
commit
fd4b617743
@@ -157,5 +157,5 @@ media_table = Table('media', mapper_registry.metadata,
|
||||
)
|
||||
|
||||
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
|
||||
mapper_registry.map_imperatively(Image, media_table, polymorphic_on='type', polymorphic_identity='image')
|
||||
mapper_registry.map_imperatively(Video, media_table, polymorphic_on='type', polymorphic_identity='video')
|
||||
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')
|
||||
mapper_registry.map_imperatively(Video, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='video')
|
||||
@@ -67,9 +67,13 @@ class TwitterScraper(Scraper):
|
||||
parsed_url = urlparse(url)
|
||||
queries = parse_qs(parsed_url.query)
|
||||
|
||||
ext = ''
|
||||
|
||||
# TODO might require additional statements for other media formats
|
||||
if 'jpg' in queries.get('format', []):
|
||||
ext = '.jpg'
|
||||
elif 'png' in queries.get('format', []):
|
||||
ext = '.png'
|
||||
elif parsed_url.path.endswith('.mp4'):
|
||||
ext = ''
|
||||
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
from . import base
|
||||
from .base import ETLController
|
||||
from .twitter import TwitterTransformer
|
||||
from .bitchute import BitchuteTransformer
|
||||
10
test.py
10
test.py
@@ -12,7 +12,7 @@ from cisticola.scraper import (
|
||||
TelegramSnscrapeScraper,
|
||||
TelegramTelethonScraper,
|
||||
TwitterScraper)
|
||||
from cisticola.transformer.base import ETLController
|
||||
from cisticola.transformer import ETLController
|
||||
from cisticola.transformer.twitter import TwitterTransformer
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
@@ -21,13 +21,13 @@ logger.add("../test.log")
|
||||
test_channels = [
|
||||
Channel(
|
||||
id=0,
|
||||
name="Logan Williams (test)",
|
||||
platform_id=891729132,
|
||||
name="L Weber (test)",
|
||||
platform_id=1424979017749442595,
|
||||
category="test",
|
||||
followers=None,
|
||||
platform="Twitter",
|
||||
url="https://twitter.com/obtusatum",
|
||||
screenname="obtusatum",
|
||||
url="https://twitter.com/LWeber33662141",
|
||||
screenname="LWeber33662141",
|
||||
country="US",
|
||||
influencer=None,
|
||||
public=True,
|
||||
|
||||
@@ -3,6 +3,7 @@ import pytest
|
||||
from sqlalchemy import create_engine
|
||||
|
||||
from cisticola.scraper import ScraperController
|
||||
from cisticola.transformer import ETLController
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
@@ -98,13 +99,13 @@ TELEGRAM_CHANNEL_KWARGS = {
|
||||
|
||||
TWITTER_CHANNEL_KWARGS = {
|
||||
'id': 5,
|
||||
'name': 'Logan Williams (test)',
|
||||
'platform_id': 891729132,
|
||||
'name': 'L Weber (test)',
|
||||
'platform_id': 1424979017749442595,
|
||||
'category': 'test',
|
||||
'followers': None,
|
||||
'platform': 'Twitter',
|
||||
'url': 'https://twitter.com/obtusatum',
|
||||
'screenname': 'obtusatum',
|
||||
'url': 'https://twitter.com/LWeber33662141',
|
||||
'screenname': 'LWeber33662141',
|
||||
'country': 'US',
|
||||
'influencer': None,
|
||||
'public': True,
|
||||
@@ -113,35 +114,49 @@ TWITTER_CHANNEL_KWARGS = {
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
@pytest.fixture(scope='package')
|
||||
def controller(tmpdir_factory):
|
||||
|
||||
"""Initialize ScraperController and SQLite database file to be used for all
|
||||
tests in the package.
|
||||
"""
|
||||
|
||||
@pytest.fixture(scope='package')
def engine(tmpdir_factory):
    """Create the SQLAlchemy engine shared by every test in the package.

    The engine points at a SQLite file named ``test.db`` inside a temporary
    ``test_data`` directory obtained from ``tmpdir_factory``.
    """
    data_dir = tmpdir_factory.mktemp('test_data')
    file = data_dir.join('test.db')

    return create_engine(f'sqlite:///{file}')
|
||||
|
||||
|
||||
@pytest.fixture(scope='package')
def controller(engine):
    """Provide a ScraperController connected to the shared test database.

    A single instance is built per test package and connected to the
    ``engine`` fixture before being handed to tests.
    """
    db_backed_scraper_controller = ScraperController()
    db_backed_scraper_controller.connect_to_db(engine)

    return db_backed_scraper_controller
|
||||
|
||||
@pytest.fixture(scope='package')
|
||||
def channel_kwargs():
|
||||
def etl_controller(engine):
    """Provide an ETLController connected to the shared test database.

    The instance is connected to the ``engine`` fixture before being handed
    to tests.
    """
    # Use a distinct local name so the fixture function is not shadowed.
    db_backed_etl_controller = ETLController()
    db_backed_etl_controller.connect_to_db(engine)

    return db_backed_etl_controller
|
||||
|
||||
@pytest.fixture(scope='package')
|
||||
def channel_kwargs():
|
||||
"""Define keyword arguments to use for defining test channels for each
|
||||
platform to be scraped.
|
||||
"""
|
||||
|
||||
return {
|
||||
'bitchute' : BITCHUTE_CHANNEL_KWARGS,
|
||||
'gab' : GAB_CHANNEL_KWARGS,
|
||||
'gettr' : GETTR_CHANNEL_KWARGS,
|
||||
'odysee' : ODYSEE_CHANNEL_KWARGS,
|
||||
'rumble' : RUMBLE_CHANNEL_KWARGS,
|
||||
'telegram' : TELEGRAM_CHANNEL_KWARGS,
|
||||
'twitter' : TWITTER_CHANNEL_KWARGS}
|
||||
'bitchute': BITCHUTE_CHANNEL_KWARGS,
|
||||
'gab': GAB_CHANNEL_KWARGS,
|
||||
'gettr': GETTR_CHANNEL_KWARGS,
|
||||
'odysee': ODYSEE_CHANNEL_KWARGS,
|
||||
'rumble': RUMBLE_CHANNEL_KWARGS,
|
||||
'telegram': TELEGRAM_CHANNEL_KWARGS,
|
||||
'twitter': TWITTER_CHANNEL_KWARGS}
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
0
tests/scraper/__init__.py
Normal file
0
tests/scraper/__init__.py
Normal file
0
tests/transformer/__init__.py
Normal file
0
tests/transformer/__init__.py
Normal file
30
tests/transformer/twitter.py
Normal file
30
tests/transformer/twitter.py
Normal file
@@ -0,0 +1,30 @@
|
||||
from sqlalchemy.orm import sessionmaker, with_polymorphic
|
||||
import json
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import TwitterScraper
|
||||
from cisticola.transformer import TwitterTransformer
|
||||
from cisticola.base import TransformedResult, Media
|
||||
|
||||
def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
    """Scrape the Twitter test channel, transform the results, and verify them.

    The database is reset, the channel described by
    ``channel_kwargs['twitter']`` is scraped with media archiving enabled,
    and ``TwitterTransformer`` is run over everything not yet transformed.
    The ``TransformedResult`` and ``Media`` rows are then read back through
    a new session bound to ``engine`` and compared with the expected values.

    NOTE(review): the expected counts and contents presumably mirror the
    current state of the live test account -- confirm if the account changes.
    """
    controller.reset_db()

    channels = [Channel(**channel_kwargs['twitter'])]
    controller.register_scraper(scraper=TwitterScraper())
    controller.scrape_channels(channels=channels, archive_media=True)

    etl_controller.register_transformer(TwitterTransformer())
    etl_controller.transform_all_untransformed()

    session = sessionmaker(bind=engine)()
    try:
        posts = session.query(TransformedResult).all()
        media = session.query(Media).all()

        assert len(posts) == 3
        assert len(media) == 2

        assert posts[-1].content == "This is a test. https://t.co/rzTFL9uFi6"
        assert json.loads(media[-1].exif)['Composite:ImageSize'] == "826 728"
    finally:
        # Close the session even when an assertion fails, so the connection
        # is not left open on the engine shared with the other tests.
        session.close()
|
||||
Reference in New Issue
Block a user