"""End-to-end scrape-then-transform test, run once per supported platform."""
import pytest
from sqlalchemy.sql import text

from cisticola.base import Post, Channel, ChannelInfo, Media, ScraperResult, RawChannelInfo
from cisticola.scraper import (
    TelegramTelethonScraper,
    BitchuteScraper,
    GettrScraper,
    RumbleScraper)
from cisticola.transformer import (
    TelegramTelethonTransformer,
    BitchuteTransformer,
    GettrTransformer,
    RumbleTransformer)

# Scraper/transformer classes exercised for each platform. The keys double as
# the parametrized platform ids and as the keys looked up in `channel_kwargs`.
CONTROLLERS = {
    'telegram': {
        'scraper': TelegramTelethonScraper,
        'transformer': TelegramTelethonTransformer
    },
    'bitchute': {
        'scraper': BitchuteScraper,
        'transformer': BitchuteTransformer
    },
    'gettr': {
        'scraper': GettrScraper,
        'transformer': GettrTransformer
    },
    'rumble': {
        'scraper': RumbleScraper,
        'transformer': RumbleTransformer
    }
}


# Parametrize straight from CONTROLLERS so a newly added platform is tested
# without having to keep a second list in sync.
@pytest.mark.parametrize('platform', list(CONTROLLERS))
def test_scraper_and_transformer(platform, session, controller, etl_controller, channel_kwargs):
    """Scrape one channel of `platform`, then transform what was scraped.

    The database is reset first. The test then checks that scraping produced
    raw posts, raw channel info and archived URLs, and that transforming
    produced posts, channel info and media rows.
    """
    controller.reset_db()
    controller.remove_all_scrapers()

    # necessary for comments/replies to be processed correctly
    session.execute(text('INSERT INTO posts(id) VALUES (-1)'))
    session.commit()

    channels = [Channel(**channel_kwargs[platform])]
    scraper = CONTROLLERS[platform]['scraper']
    controller.register_scraper(scraper=scraper())

    controller.scrape_channels(channels=channels, archive_media=False)
    controller.scrape_all_channel_info()
    controller.archive_unarchived_media_batch()

    raw_posts = session.query(ScraperResult).all()
    raw_channel_info = session.query(RawChannelInfo).all()
    latest_archived = (
        session.query(ScraperResult.archived_urls)
        .order_by(ScraperResult.date_archived.desc())
        .first())

    assert len(raw_posts) > 0
    assert len(raw_channel_info) > 0
    # first() yields a one-column row (or None when the table is empty), so
    # len() of the row itself is 1 for any existing row. Unwrap it and check
    # the stored value instead.
    # NOTE(review): assumes `archived_urls` is empty/None when nothing was
    # archived -- confirm against the ScraperResult model.
    assert latest_archived is not None
    assert latest_archived[0]

    controller.remove_all_scrapers()

    transformer = CONTROLLERS[platform]['transformer']

    etl_controller.register_transformer(transformer())
    etl_controller.transform_all_untransformed()
    etl_controller.transform_all_untransformed_info()
    etl_controller.transform_all_untransformed_media()

    posts = session.query(Post).all()
    channel_info = session.query(ChannelInfo).all()
    media = session.query(Media).all()

    assert len(posts) > 0
    assert len(channel_info) > 0
    assert len(media) > 0
@pytest.fixture(scope='package')
def session(engine):
    """Provide one SQLAlchemy session, bound to `engine`, for all tests in the package.

    The session is closed once the package's tests have finished.
    """

    sessionfactory = sessionmaker(bind=engine)
    db_session = sessionfactory()

    yield db_session

    # Teardown: runs after the last test that used the fixture.
    db_session.close()
\ No newline at end of file diff --git a/tests/scraper/gab.py b/tests/scraper/gab.py deleted file mode 100644 index 79ba8d7..0000000 --- a/tests/scraper/gab.py +++ /dev/null @@ -1,55 +0,0 @@ -import pytest - -from cisticola.base import Channel -from cisticola.scraper import GabScraper - -@pytest.mark.unarchived -def test_scrape_gab_channel_no_media(controller, channel_kwargs): - - channels = [Channel(**channel_kwargs['gab'])] - controller.register_scraper(scraper = GabScraper()) - controller.scrape_channels(channels = channels, archive_media = False) - -@pytest.mark.media -@pytest.mark.unarchived -def test_scrape_gab_channel_unarchived_media(controller): - - controller.archive_unarchived_media() - -@pytest.mark.media -def test_scrape_gab_channel(controller, channel_kwargs): - - controller.reset_db() - - channels = [Channel(**channel_kwargs['gab'])] - controller.register_scraper(scraper = GabScraper()) - controller.scrape_channels(channels = channels, archive_media = True) - -@pytest.mark.profile -def test_scrape_gab_profile(channel_kwargs): - - scraper = GabScraper() - channel = Channel(**channel_kwargs['gab']) - scraper.get_profile(channel=channel) - -def test_scrape_gab_group_no_media(controller, channel_kwargs): - - channels = [Channel(**channel_kwargs['gab_group'])] - controller.register_scraper(scraper = GabScraper()) - controller.scrape_channels(channels = channels, archive_media = False) - -@pytest.mark.media -def test_scrape_gab_group(controller, channel_kwargs): - - controller.reset_db() - - channels = [Channel(**channel_kwargs['gab_group'])] - controller.register_scraper(scraper = GabScraper()) - controller.scrape_channels(channels = channels, archive_media = True) - -@pytest.mark.profile -def test_scrape_gab_group_profile(channel_kwargs): - - scraper = GabScraper() - channel = Channel(**channel_kwargs['gab_group']) - scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/scraper/gettr.py b/tests/scraper/gettr.py deleted file 
mode 100644 index 352e839..0000000 --- a/tests/scraper/gettr.py +++ /dev/null @@ -1,33 +0,0 @@ -import pytest - -from cisticola.base import Channel -from cisticola.scraper import GettrScraper - -@pytest.mark.unarchived -def test_scrape_gettr_channel_no_media(controller, channel_kwargs): - - channels = [Channel(**channel_kwargs['gettr'])] - controller.register_scraper(scraper = GettrScraper()) - controller.scrape_channels(channels = channels, archive_media = False) - -@pytest.mark.media -@pytest.mark.unarchived -def test_scrape_gettr_channel_unarchived_media(controller): - - controller.archive_unarchived_media() - -@pytest.mark.media -def test_scrape_gettr_channel(controller, channel_kwargs): - - controller.reset_db() - - channels = [Channel(**channel_kwargs['gettr'])] - controller.register_scraper(scraper = GettrScraper()) - controller.scrape_channels(channels = channels, archive_media = True) - -@pytest.mark.profile -def test_scrape_gettr_profile(channel_kwargs): - - scraper = GettrScraper() - channel = Channel(**channel_kwargs['gettr']) - scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/scraper/instagram.py b/tests/scraper/instagram.py deleted file mode 100644 index 099ab40..0000000 --- a/tests/scraper/instagram.py +++ /dev/null @@ -1,33 +0,0 @@ -import pytest - -from cisticola.base import Channel -from cisticola.scraper import InstagramScraper - -@pytest.mark.unarchived -def test_scrape_instagram_channel_no_media(controller, channel_kwargs): - - channels = [Channel(**channel_kwargs['instagram'])] - controller.register_scraper(scraper = InstagramScraper()) - controller.scrape_channels(channels = channels, archive_media = False) - -@pytest.mark.media -@pytest.mark.unarchived -def test_scrape_instagram_channel_unarchived_media(controller): - - controller.archive_unarchived_media() - -@pytest.mark.media -def test_scrape_instagram_channel(controller, channel_kwargs): - - controller.reset_db() - - channels = 
[Channel(**channel_kwargs['instagram'])] - controller.register_scraper(scraper = InstagramScraper()) - controller.scrape_channels(channels = channels, archive_media = True) - -@pytest.mark.profile -def test_scrape_instagram_profile(channel_kwargs): - - scraper = InstagramScraper() - channel = Channel(**channel_kwargs['instagram']) - scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/scraper/odysee.py b/tests/scraper/odysee.py deleted file mode 100644 index 9883bdb..0000000 --- a/tests/scraper/odysee.py +++ /dev/null @@ -1,33 +0,0 @@ -import pytest - -from cisticola.base import Channel -from cisticola.scraper import OdyseeScraper - -@pytest.mark.unarchived -def test_scrape_odysee_channel_no_media(controller, channel_kwargs): - - channels = [Channel(**channel_kwargs['odysee'])] - controller.register_scraper(scraper = OdyseeScraper()) - controller.scrape_channels(channels = channels, archive_media = False) - -@pytest.mark.media -@pytest.mark.unarchived -def test_scrape_odysee_channel_unarchived_media(controller): - - controller.archive_unarchived_media() - -@pytest.mark.media -def test_scrape_odysee_channel(controller, channel_kwargs): - - controller.reset_db() - - channels = [Channel(**channel_kwargs['odysee'])] - controller.register_scraper(scraper = OdyseeScraper()) - controller.scrape_channels(channels = channels, archive_media = True) - -@pytest.mark.profile -def test_scrape_odysee_profile(channel_kwargs): - - scraper = OdyseeScraper() - channel = Channel(**channel_kwargs['odysee']) - scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/scraper/rumble.py b/tests/scraper/rumble.py deleted file mode 100644 index 5b01f9c..0000000 --- a/tests/scraper/rumble.py +++ /dev/null @@ -1,33 +0,0 @@ -import pytest - -from cisticola.base import Channel -from cisticola.scraper import RumbleScraper - -@pytest.mark.unarchived -def test_scrape_rumble_channel_no_media(controller, channel_kwargs): - - channels = 
[Channel(**channel_kwargs['rumble'])] - controller.register_scraper(scraper = RumbleScraper()) - controller.scrape_channels(channels = channels, archive_media = False) - -@pytest.mark.media -@pytest.mark.unarchived -def test_scrape_rumble_channel_unarchived_media(controller): - - controller.archive_unarchived_media() - -@pytest.mark.media -def test_scrape_rumble_channel(controller, channel_kwargs): - - controller.reset_db() - - channels = [Channel(**channel_kwargs['rumble'])] - controller.register_scraper(scraper = RumbleScraper()) - controller.scrape_channels(channels = channels, archive_media = True) - -@pytest.mark.profile -def test_scrape_rumble_profile(channel_kwargs): - - scraper = RumbleScraper() - channel = Channel(**channel_kwargs['rumble']) - scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/scraper/telegram_telethon.py b/tests/scraper/telegram_telethon.py deleted file mode 100644 index f1f9be2..0000000 --- a/tests/scraper/telegram_telethon.py +++ /dev/null @@ -1,38 +0,0 @@ -import pytest - -from cisticola.base import Channel -from cisticola.scraper import TelegramTelethonScraper - -@pytest.mark.unarchived -def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs): - controller.remove_all_scrapers() - - channels = [Channel(**channel_kwargs['telegram'])] - controller.register_scraper(scraper = TelegramTelethonScraper()) - controller.scrape_channels(channels = channels, archive_media = False) - -@pytest.mark.media -@pytest.mark.unarchived -def test_scrape_telegram_telethon_unarchived_media(controller): - - controller.archive_unarchived_media_batch() - -@pytest.mark.media -def test_scrape_telegram_telethon_channel(controller, channel_kwargs): - - controller.reset_db() - controller.remove_all_scrapers() - - channels = [Channel(**channel_kwargs['telegram'])] - controller.register_scraper(scraper = TelegramTelethonScraper()) - controller.scrape_channels(channels = channels, archive_media = True) - 
-@pytest.mark.profile -def test_scrape_telegram_telethon_profile(controller, channel_kwargs): - - controller.reset_db() - controller.remove_all_scrapers() - - scraper = TelegramTelethonScraper() - channel = Channel(**channel_kwargs['telegram']) - scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/scraper/twitter.py b/tests/scraper/twitter.py deleted file mode 100644 index 0a4ad86..0000000 --- a/tests/scraper/twitter.py +++ /dev/null @@ -1,33 +0,0 @@ -import pytest - -from cisticola.base import Channel -from cisticola.scraper import TwitterScraper - -@pytest.mark.unarchived -def test_scrape_twitter_channel_no_media(controller, channel_kwargs): - - channels = [Channel(**channel_kwargs['twitter'])] - controller.register_scraper(scraper = TwitterScraper()) - controller.scrape_channels(channels = channels, archive_media = False) - -@pytest.mark.media -@pytest.mark.unarchived -def test_scrape_twitter_channel_unarchived_media(controller): - - controller.archive_unarchived_media() - -@pytest.mark.media -def test_scrape_twitter_channel(controller, channel_kwargs): - - controller.reset_db() - - channels = [Channel(**channel_kwargs['twitter'])] - controller.register_scraper(scraper = TwitterScraper()) - controller.scrape_channels(channels = channels, archive_media = True) - -@pytest.mark.profile -def test_scrape_twitter_profile(channel_kwargs): - - scraper = TwitterScraper() - channel = Channel(**channel_kwargs['twitter']) - scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/scraper/vkontakte.py b/tests/scraper/vkontakte.py deleted file mode 100644 index 12ff12c..0000000 --- a/tests/scraper/vkontakte.py +++ /dev/null @@ -1,33 +0,0 @@ -import pytest - -from cisticola.base import Channel -from cisticola.scraper import VkontakteScraper - -@pytest.mark.unarchived -def test_scrape_vkontakte_channel_no_media(controller, channel_kwargs): - - channels = [Channel(**channel_kwargs['vkontakte'])] - 
controller.register_scraper(scraper = VkontakteScraper()) - controller.scrape_channels(channels = channels, archive_media = False) - -@pytest.mark.media -@pytest.mark.unarchived -def test_scrape_vkontakte_channel_unarchived_media(controller): - - controller.archive_unarchived_media() - -@pytest.mark.media -def test_scrape_vkontakte_channel(controller, channel_kwargs): - - controller.reset_db() - - channels = [Channel(**channel_kwargs['vkontakte'])] - controller.register_scraper(scraper = VkontakteScraper()) - controller.scrape_channels(channels = channels, archive_media = True) - -@pytest.mark.profile -def test_scrape_vkontakte_profile(channel_kwargs): - - scraper = VkontakteScraper() - channel = Channel(**channel_kwargs['vkontakte']) - scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/scraper/youtube.py b/tests/scraper/youtube.py deleted file mode 100644 index 79ba7c7..0000000 --- a/tests/scraper/youtube.py +++ /dev/null @@ -1,33 +0,0 @@ -import pytest - -from cisticola.base import Channel -from cisticola.scraper import YoutubeScraper - -@pytest.mark.unarchived -def test_scrape_youtube_channel_no_media(controller, channel_kwargs): - - channels = [Channel(**channel_kwargs['youtube'])] - controller.register_scraper(scraper = YoutubeScraper()) - controller.scrape_channels(channels = channels, archive_media = False) - -@pytest.mark.media -@pytest.mark.unarchived -def test_scrape_youtube_channel_unarchived_media(controller): - - controller.archive_unarchived_media() - -@pytest.mark.media -def test_scrape_youtube_channel(controller, channel_kwargs): - - controller.reset_db() - - channels = [Channel(**channel_kwargs['youtube'])] - controller.register_scraper(scraper = YoutubeScraper()) - controller.scrape_channels(channels = channels, archive_media = True) - -@pytest.mark.profile -def test_scrape_youtube_profile(channel_kwargs): - - scraper = YoutubeScraper() - channel = Channel(**channel_kwargs['youtube']) - 
scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/transformer/__init__.py b/tests/transformer/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/transformer/bitchute.py b/tests/transformer/bitchute.py deleted file mode 100644 index 126db3a..0000000 --- a/tests/transformer/bitchute.py +++ /dev/null @@ -1,35 +0,0 @@ -from sqlalchemy.orm import sessionmaker -import json - -import pytest - -from cisticola.base import Channel -from cisticola.scraper import BitchuteScraper -from cisticola.transformer import BitchuteTransformer -from cisticola.base import Post, Media - -@pytest.mark.media -def test_scrape_etl_bitchute(engine, controller, etl_controller, channel_kwargs): - controller.reset_db() - - channels = [Channel(**channel_kwargs['bitchute'])] - controller.register_scraper(scraper = BitchuteScraper()) - controller.scrape_channels(channels = channels, archive_media = True) - controller.scrape_all_channel_info() - - etl_controller.register_transformer(BitchuteTransformer()) - etl_controller.transform_all_untransformed() - etl_controller.transform_all_untransformed_info() - - sessionfactory = sessionmaker() - sessionfactory.configure(bind=engine) - session = sessionfactory() - - posts = session.query(Post).all() - media = session.query(Media).all() - - assert len(posts) == 5 - # assert len(media) == 0 - - assert 'Pendant are something that the advanced ladies can fuse in her every day look' in posts[0].content - # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728" \ No newline at end of file diff --git a/tests/transformer/gettr.py b/tests/transformer/gettr.py deleted file mode 100644 index 9472f50..0000000 --- a/tests/transformer/gettr.py +++ /dev/null @@ -1,35 +0,0 @@ -from sqlalchemy.orm import sessionmaker -import json - -import pytest - -from cisticola.base import Channel -from cisticola.scraper import GettrScraper -from cisticola.transformer import GettrTransformer -from cisticola.base 
import Post, Media - -@pytest.mark.media -def test_scrape_etl_gettr(engine, controller, etl_controller, channel_kwargs): - controller.reset_db() - - channels = [Channel(**channel_kwargs['gettr'])] - controller.register_scraper(scraper = GettrScraper()) - controller.scrape_channels(channels = channels, archive_media = True) - controller.scrape_all_channel_info() - - etl_controller.register_transformer(GettrTransformer()) - etl_controller.transform_all_untransformed() - etl_controller.transform_all_untransformed_info() - - sessionfactory = sessionmaker() - sessionfactory.configure(bind=engine) - session = sessionfactory() - - posts = session.query(Post).all() - media = session.query(Media).all() - - assert len(posts) == 23 - # assert len(media) == 0 - - assert 'Nigerian gender studies' in posts[-1].content - # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728" \ No newline at end of file diff --git a/tests/transformer/rumble.py b/tests/transformer/rumble.py deleted file mode 100644 index 3b2b8a5..0000000 --- a/tests/transformer/rumble.py +++ /dev/null @@ -1,35 +0,0 @@ -from sqlalchemy.orm import sessionmaker -import json - -import pytest - -from cisticola.base import Channel -from cisticola.scraper import RumbleScraper -from cisticola.transformer import RumbleTransformer -from cisticola.base import Post, Media - -@pytest.mark.media -def test_scrape_etl_rumble(engine, controller, etl_controller, channel_kwargs): - controller.reset_db() - - channels = [Channel(**channel_kwargs['rumble'])] - controller.register_scraper(scraper = RumbleScraper()) - controller.scrape_channels(channels = channels, archive_media = True) - controller.scrape_all_channel_info() - - etl_controller.register_transformer(RumbleTransformer()) - etl_controller.transform_all_untransformed() - etl_controller.transform_all_untransformed_info() - - sessionfactory = sessionmaker() - sessionfactory.configure(bind=engine) - session = sessionfactory() - - posts = session.query(Post).all() 
- media = session.query(Media).all() - - assert len(posts) == 7 - # assert len(media) == 0 - - assert '#whitegold #icedoutcuban' in posts[0].content - # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728" \ No newline at end of file diff --git a/tests/transformer/telegram_telethon.py b/tests/transformer/telegram_telethon.py deleted file mode 100644 index 14fe04c..0000000 --- a/tests/transformer/telegram_telethon.py +++ /dev/null @@ -1,35 +0,0 @@ -from sqlalchemy.orm import sessionmaker, with_polymorphic -import json - -import pytest - -from cisticola.base import Channel -from cisticola.scraper import TelegramTelethonScraper -from cisticola.transformer import TelegramTelethonTransformer -from cisticola.base import Post, Media - -@pytest.mark.media -def test_scrape_etl_telegram_telethon(engine, controller, etl_controller, channel_kwargs): - controller.reset_db() - - channels = [Channel(**channel_kwargs['telegram'])] - controller.register_scraper(scraper = TelegramTelethonScraper()) - controller.scrape_channels(channels = channels, archive_media = True) - controller.scrape_all_channel_info() - - etl_controller.register_transformer(TelegramTelethonTransformer()) - etl_controller.transform_all_untransformed() - etl_controller.transform_all_untransformed_info() - - sessionfactory = sessionmaker() - sessionfactory.configure(bind=engine) - session = sessionfactory() - - posts = session.query(Post).all() - media = session.query(Media).all() - - assert len(posts) == 19 - # assert len(media) == 13 - - assert posts[16].content == "Taking pre-orders now" - # assert json.loads(media[0].exif)['Composite:ImageSize'] == "1028 1280" \ No newline at end of file diff --git a/tests/transformer/twitter.py b/tests/transformer/twitter.py deleted file mode 100644 index 3e4b368..0000000 --- a/tests/transformer/twitter.py +++ /dev/null @@ -1,35 +0,0 @@ -from sqlalchemy.orm import sessionmaker -import json - -import pytest - -from cisticola.base import Channel -from 
cisticola.scraper import TwitterScraper -from cisticola.transformer import TwitterTransformer -from cisticola.base import Post, Media - -@pytest.mark.media -def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs): - controller.reset_db() - - channels = [Channel(**channel_kwargs['twitter'])] - controller.register_scraper(scraper = TwitterScraper()) - controller.scrape_channels(channels = channels, archive_media = True) - controller.scrape_all_channel_info() - - etl_controller.register_transformer(TwitterTransformer()) - etl_controller.transform_all_untransformed() - etl_controller.transform_all_untransformed_info() - - sessionfactory = sessionmaker() - sessionfactory.configure(bind=engine) - session = sessionfactory() - - posts = session.query(Post).all() - media = session.query(Media).all() - - assert len(posts) == 12 - assert len(media) == 8 - - assert posts[2].content == "BARN" - assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728" \ No newline at end of file diff --git a/tests/transformer/vkontakte.py b/tests/transformer/vkontakte.py deleted file mode 100644 index ea26b62..0000000 --- a/tests/transformer/vkontakte.py +++ /dev/null @@ -1,35 +0,0 @@ -from sqlalchemy.orm import sessionmaker -import json - -import pytest - -from cisticola.base import Channel -from cisticola.scraper import VkontakteScraper -from cisticola.transformer import VkontakteTransformer -from cisticola.base import Post, Media - -@pytest.mark.media -def test_scrape_etl_vkontakte(engine, controller, etl_controller, channel_kwargs): - controller.reset_db() - - channels = [Channel(**channel_kwargs['vkontakte'])] - controller.register_scraper(scraper = VkontakteScraper()) - controller.scrape_channels(channels = channels, archive_media = True) - controller.scrape_all_channel_info() - - etl_controller.register_transformer(VkontakteTransformer()) - etl_controller.transform_all_untransformed() - etl_controller.transform_all_untransformed_info() - - 
sessionfactory = sessionmaker() - sessionfactory.configure(bind=engine) - session = sessionfactory() - - posts = session.query(Post).all() - media = session.query(Media).all() - - assert len(posts) == 23 - # assert len(media) == 0 - - assert 'Nigerian gender studies' in posts[-1].content - # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728" \ No newline at end of file