refactored tests to reduce redundancy, got tests working for Telegram, Bitchute, Gettr, and Rumble

This commit is contained in:
Tristan Lee
2023-08-03 00:53:38 -05:00
parent bd67806ed2
commit e2142966e7
22 changed files with 92 additions and 574 deletions

View File

@@ -496,8 +496,6 @@ class ScraperController:
session = self.session()
while True:
# # DEBUG
# assert 0
self.archive_unarchived_media_batch(self, session=session, chronological=chronological)
@@ -566,6 +564,8 @@ class ScraperController:
"""Drop all data from the connected SQLAlchemy database.
"""
self.session.close_all()
mapper_registry.metadata.drop_all(bind=self.engine)
self.connect_to_db(self.engine)

View File

@@ -24,7 +24,8 @@ class BitchuteScraper(Scraper):
return username
@logger.catch
# @logger.catch
@logger.catch(reraise = True)
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
session = requests.Session()

76
tests/base.py Normal file
View File

@@ -0,0 +1,76 @@
import pytest
from sqlalchemy.sql import text
from cisticola.base import Post, Channel, ChannelInfo, Media, ScraperResult, RawChannelInfo
from cisticola.scraper import (
TelegramTelethonScraper,
BitchuteScraper,
GettrScraper,
RumbleScraper)
from cisticola.transformer import (
TelegramTelethonTransformer,
BitchuteTransformer,
GettrTransformer,
RumbleTransformer)
# Maps each platform key used by the tests to the scraper class and the
# transformer class that handle that platform.
CONTROLLERS = {
    'telegram': {
        'scraper': TelegramTelethonScraper,
        'transformer': TelegramTelethonTransformer,
    },
    'bitchute': {
        'scraper': BitchuteScraper,
        'transformer': BitchuteTransformer,
    },
    'gettr': {
        'scraper': GettrScraper,
        'transformer': GettrTransformer,
    },
    'rumble': {
        'scraper': RumbleScraper,
        'transformer': RumbleTransformer,
    },
}
# The platform list is derived from CONTROLLERS so the two cannot drift apart.
@pytest.mark.parametrize('platform', list(CONTROLLERS))
def test_scraper_and_transformer(platform, session, controller, etl_controller, channel_kwargs):
    """Scrape one test channel for ``platform`` and then transform the results.

    The database is reset first. The channel is scraped without archiving
    media, channel info is scraped, and one batch of unarchived media is
    archived. After checking that raw rows exist, the platform's transformer is
    run over posts, channel info and media, and the transformed tables are
    checked for at least one row each.
    """
    controller.reset_db()
    controller.remove_all_scrapers()

    # necessary for comments/replies to be processed correctly
    session.execute(text('INSERT INTO posts(id) VALUES (-1)'))
    session.commit()

    channels = [Channel(**channel_kwargs[platform])]
    scraper = CONTROLLERS[platform]['scraper']

    controller.register_scraper(scraper = scraper())
    controller.scrape_channels(channels = channels, archive_media = False)
    controller.scrape_all_channel_info()
    controller.archive_unarchived_media_batch()

    raw_posts = session.query(ScraperResult).all()
    raw_channel_info = session.query(RawChannelInfo).all()
    archived_urls = session.query(ScraperResult.archived_urls).order_by(ScraperResult.date_archived.desc()).first()

    assert len(raw_posts) > 0
    assert len(raw_channel_info) > 0
    # `.first()` yields None when the query matches no rows; fail with a clear
    # assertion instead of a TypeError from len(None).
    assert archived_urls is not None
    # NOTE(review): `archived_urls` is a single-column result row, so its length
    # is 1 whenever a row exists; this does not inspect the column's contents.
    # Confirm whether checking `archived_urls[0]` was intended.
    assert len(archived_urls) > 0

    controller.remove_all_scrapers()

    transformer = CONTROLLERS[platform]['transformer']
    etl_controller.register_transformer(transformer())
    etl_controller.transform_all_untransformed()
    etl_controller.transform_all_untransformed_info()
    etl_controller.transform_all_untransformed_media()

    posts = session.query(Post).all()
    channel_info = session.query(ChannelInfo).all()
    media = session.query(Media).all()

    assert len(posts) > 0
    assert len(channel_info) > 0
    assert len(media) > 0

View File

@@ -2,6 +2,7 @@ import os
import pytest
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from cisticola.scraper import ScraperController
from cisticola.transformer import ETLController
@@ -105,12 +106,12 @@ RUMBLE_CHANNEL_KWARGS = {
'source': 'researcher'}
TELEGRAM_CHANNEL_KWARGS = {
'name': 'Star Game (test)',
'platform_id': "-1001866374682",
'name': 'Бутылка (test)',
'platform_id': "-1001760492118",
'category': 'test',
'platform': 'Telegram',
'url': 'https://t.me/stargameinfo',
'screenname': 'stargameinfo',
'url': 'https://t.me/butylka1488',
'screenname': 'butylka1488',
'country': 'RU',
'influencer': None,
'public': True,
@@ -170,6 +171,13 @@ def engine(tmpdir_factory):
return engine
@pytest.fixture(scope='package')
def session(engine):
    """Provide one SQLAlchemy session, bound to ``engine``, for all tests in the package.

    The session is closed during fixture teardown, after the last test in the
    package has finished, so it does not stay open for the rest of the run.
    """
    sessionfactory = sessionmaker(bind=engine)
    db_session = sessionfactory()
    yield db_session
    # Teardown: executed by pytest once the package-scoped fixture is finalized.
    db_session.close()
@pytest.fixture(scope='package')
def controller(engine):

View File

@@ -1,33 +0,0 @@
import pytest
from cisticola.base import Channel
from cisticola.scraper import BitchuteScraper
@pytest.mark.unarchived
def test_scrape_bitchute_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['bitchute'])]
controller.register_scraper(scraper = BitchuteScraper())
controller.scrape_channels(channels = channels, archive_media = False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_bitchute_channel_unarchived_media(controller):
controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_bitchute_channel(controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['bitchute'])]
controller.register_scraper(scraper = BitchuteScraper())
controller.scrape_channels(channels = channels, archive_media = True)
@pytest.mark.profile
def test_scrape_bitchute_profile(channel_kwargs):
scraper = BitchuteScraper()
channel = Channel(**channel_kwargs['bitchute'])
scraper.get_profile(channel=channel)

View File

@@ -1,55 +0,0 @@
import pytest
from cisticola.base import Channel
from cisticola.scraper import GabScraper
@pytest.mark.unarchived
def test_scrape_gab_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['gab'])]
controller.register_scraper(scraper = GabScraper())
controller.scrape_channels(channels = channels, archive_media = False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_gab_channel_unarchived_media(controller):
controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_gab_channel(controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['gab'])]
controller.register_scraper(scraper = GabScraper())
controller.scrape_channels(channels = channels, archive_media = True)
@pytest.mark.profile
def test_scrape_gab_profile(channel_kwargs):
scraper = GabScraper()
channel = Channel(**channel_kwargs['gab'])
scraper.get_profile(channel=channel)
def test_scrape_gab_group_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['gab_group'])]
controller.register_scraper(scraper = GabScraper())
controller.scrape_channels(channels = channels, archive_media = False)
@pytest.mark.media
def test_scrape_gab_group(controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['gab_group'])]
controller.register_scraper(scraper = GabScraper())
controller.scrape_channels(channels = channels, archive_media = True)
@pytest.mark.profile
def test_scrape_gab_group_profile(channel_kwargs):
scraper = GabScraper()
channel = Channel(**channel_kwargs['gab_group'])
scraper.get_profile(channel=channel)

View File

@@ -1,33 +0,0 @@
import pytest
from cisticola.base import Channel
from cisticola.scraper import GettrScraper
@pytest.mark.unarchived
def test_scrape_gettr_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['gettr'])]
controller.register_scraper(scraper = GettrScraper())
controller.scrape_channels(channels = channels, archive_media = False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_gettr_channel_unarchived_media(controller):
controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_gettr_channel(controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['gettr'])]
controller.register_scraper(scraper = GettrScraper())
controller.scrape_channels(channels = channels, archive_media = True)
@pytest.mark.profile
def test_scrape_gettr_profile(channel_kwargs):
scraper = GettrScraper()
channel = Channel(**channel_kwargs['gettr'])
scraper.get_profile(channel=channel)

View File

@@ -1,33 +0,0 @@
import pytest
from cisticola.base import Channel
from cisticola.scraper import InstagramScraper
@pytest.mark.unarchived
def test_scrape_instagram_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['instagram'])]
controller.register_scraper(scraper = InstagramScraper())
controller.scrape_channels(channels = channels, archive_media = False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_instagram_channel_unarchived_media(controller):
controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_instagram_channel(controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['instagram'])]
controller.register_scraper(scraper = InstagramScraper())
controller.scrape_channels(channels = channels, archive_media = True)
@pytest.mark.profile
def test_scrape_instagram_profile(channel_kwargs):
scraper = InstagramScraper()
channel = Channel(**channel_kwargs['instagram'])
scraper.get_profile(channel=channel)

View File

@@ -1,33 +0,0 @@
import pytest
from cisticola.base import Channel
from cisticola.scraper import OdyseeScraper
@pytest.mark.unarchived
def test_scrape_odysee_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['odysee'])]
controller.register_scraper(scraper = OdyseeScraper())
controller.scrape_channels(channels = channels, archive_media = False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_odysee_channel_unarchived_media(controller):
controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_odysee_channel(controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['odysee'])]
controller.register_scraper(scraper = OdyseeScraper())
controller.scrape_channels(channels = channels, archive_media = True)
@pytest.mark.profile
def test_scrape_odysee_profile(channel_kwargs):
scraper = OdyseeScraper()
channel = Channel(**channel_kwargs['odysee'])
scraper.get_profile(channel=channel)

View File

@@ -1,33 +0,0 @@
import pytest
from cisticola.base import Channel
from cisticola.scraper import RumbleScraper
@pytest.mark.unarchived
def test_scrape_rumble_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['rumble'])]
controller.register_scraper(scraper = RumbleScraper())
controller.scrape_channels(channels = channels, archive_media = False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_rumble_channel_unarchived_media(controller):
controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_rumble_channel(controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['rumble'])]
controller.register_scraper(scraper = RumbleScraper())
controller.scrape_channels(channels = channels, archive_media = True)
@pytest.mark.profile
def test_scrape_rumble_profile(channel_kwargs):
scraper = RumbleScraper()
channel = Channel(**channel_kwargs['rumble'])
scraper.get_profile(channel=channel)

View File

@@ -1,38 +0,0 @@
import pytest
from cisticola.base import Channel
from cisticola.scraper import TelegramTelethonScraper
@pytest.mark.unarchived
def test_scrape_telegram_telethon_channel_no_media(controller, channel_kwargs):
controller.remove_all_scrapers()
channels = [Channel(**channel_kwargs['telegram'])]
controller.register_scraper(scraper = TelegramTelethonScraper())
controller.scrape_channels(channels = channels, archive_media = False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_telegram_telethon_unarchived_media(controller):
controller.archive_unarchived_media_batch()
@pytest.mark.media
def test_scrape_telegram_telethon_channel(controller, channel_kwargs):
controller.reset_db()
controller.remove_all_scrapers()
channels = [Channel(**channel_kwargs['telegram'])]
controller.register_scraper(scraper = TelegramTelethonScraper())
controller.scrape_channels(channels = channels, archive_media = True)
@pytest.mark.profile
def test_scrape_telegram_telethon_profile(controller, channel_kwargs):
controller.reset_db()
controller.remove_all_scrapers()
scraper = TelegramTelethonScraper()
channel = Channel(**channel_kwargs['telegram'])
scraper.get_profile(channel=channel)

View File

@@ -1,33 +0,0 @@
import pytest
from cisticola.base import Channel
from cisticola.scraper import TwitterScraper
@pytest.mark.unarchived
def test_scrape_twitter_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['twitter'])]
controller.register_scraper(scraper = TwitterScraper())
controller.scrape_channels(channels = channels, archive_media = False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_twitter_channel_unarchived_media(controller):
controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_twitter_channel(controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['twitter'])]
controller.register_scraper(scraper = TwitterScraper())
controller.scrape_channels(channels = channels, archive_media = True)
@pytest.mark.profile
def test_scrape_twitter_profile(channel_kwargs):
scraper = TwitterScraper()
channel = Channel(**channel_kwargs['twitter'])
scraper.get_profile(channel=channel)

View File

@@ -1,33 +0,0 @@
import pytest
from cisticola.base import Channel
from cisticola.scraper import VkontakteScraper
@pytest.mark.unarchived
def test_scrape_vkontakte_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['vkontakte'])]
controller.register_scraper(scraper = VkontakteScraper())
controller.scrape_channels(channels = channels, archive_media = False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_vkontakte_channel_unarchived_media(controller):
controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_vkontakte_channel(controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['vkontakte'])]
controller.register_scraper(scraper = VkontakteScraper())
controller.scrape_channels(channels = channels, archive_media = True)
@pytest.mark.profile
def test_scrape_vkontakte_profile(channel_kwargs):
scraper = VkontakteScraper()
channel = Channel(**channel_kwargs['vkontakte'])
scraper.get_profile(channel=channel)

View File

@@ -1,33 +0,0 @@
import pytest
from cisticola.base import Channel
from cisticola.scraper import YoutubeScraper
@pytest.mark.unarchived
def test_scrape_youtube_channel_no_media(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['youtube'])]
controller.register_scraper(scraper = YoutubeScraper())
controller.scrape_channels(channels = channels, archive_media = False)
@pytest.mark.media
@pytest.mark.unarchived
def test_scrape_youtube_channel_unarchived_media(controller):
controller.archive_unarchived_media()
@pytest.mark.media
def test_scrape_youtube_channel(controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['youtube'])]
controller.register_scraper(scraper = YoutubeScraper())
controller.scrape_channels(channels = channels, archive_media = True)
@pytest.mark.profile
def test_scrape_youtube_profile(channel_kwargs):
scraper = YoutubeScraper()
channel = Channel(**channel_kwargs['youtube'])
scraper.get_profile(channel=channel)

View File

@@ -1,35 +0,0 @@
from sqlalchemy.orm import sessionmaker
import json
import pytest
from cisticola.base import Channel
from cisticola.scraper import BitchuteScraper
from cisticola.transformer import BitchuteTransformer
from cisticola.base import Post, Media
@pytest.mark.media
def test_scrape_etl_bitchute(engine, controller, etl_controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['bitchute'])]
controller.register_scraper(scraper = BitchuteScraper())
controller.scrape_channels(channels = channels, archive_media = True)
controller.scrape_all_channel_info()
etl_controller.register_transformer(BitchuteTransformer())
etl_controller.transform_all_untransformed()
etl_controller.transform_all_untransformed_info()
sessionfactory = sessionmaker()
sessionfactory.configure(bind=engine)
session = sessionfactory()
posts = session.query(Post).all()
media = session.query(Media).all()
assert len(posts) == 5
# assert len(media) == 0
assert 'Pendant are something that the advanced ladies can fuse in her every day look' in posts[0].content
# assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"

View File

@@ -1,35 +0,0 @@
from sqlalchemy.orm import sessionmaker
import json
import pytest
from cisticola.base import Channel
from cisticola.scraper import GettrScraper
from cisticola.transformer import GettrTransformer
from cisticola.base import Post, Media
@pytest.mark.media
def test_scrape_etl_gettr(engine, controller, etl_controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['gettr'])]
controller.register_scraper(scraper = GettrScraper())
controller.scrape_channels(channels = channels, archive_media = True)
controller.scrape_all_channel_info()
etl_controller.register_transformer(GettrTransformer())
etl_controller.transform_all_untransformed()
etl_controller.transform_all_untransformed_info()
sessionfactory = sessionmaker()
sessionfactory.configure(bind=engine)
session = sessionfactory()
posts = session.query(Post).all()
media = session.query(Media).all()
assert len(posts) == 23
# assert len(media) == 0
assert 'Nigerian gender studies' in posts[-1].content
# assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"

View File

@@ -1,35 +0,0 @@
from sqlalchemy.orm import sessionmaker
import json
import pytest
from cisticola.base import Channel
from cisticola.scraper import RumbleScraper
from cisticola.transformer import RumbleTransformer
from cisticola.base import Post, Media
@pytest.mark.media
def test_scrape_etl_rumble(engine, controller, etl_controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['rumble'])]
controller.register_scraper(scraper = RumbleScraper())
controller.scrape_channels(channels = channels, archive_media = True)
controller.scrape_all_channel_info()
etl_controller.register_transformer(RumbleTransformer())
etl_controller.transform_all_untransformed()
etl_controller.transform_all_untransformed_info()
sessionfactory = sessionmaker()
sessionfactory.configure(bind=engine)
session = sessionfactory()
posts = session.query(Post).all()
media = session.query(Media).all()
assert len(posts) == 7
# assert len(media) == 0
assert '#whitegold #icedoutcuban' in posts[0].content
# assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"

View File

@@ -1,35 +0,0 @@
from sqlalchemy.orm import sessionmaker, with_polymorphic
import json
import pytest
from cisticola.base import Channel
from cisticola.scraper import TelegramTelethonScraper
from cisticola.transformer import TelegramTelethonTransformer
from cisticola.base import Post, Media
@pytest.mark.media
def test_scrape_etl_telegram_telethon(engine, controller, etl_controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['telegram'])]
controller.register_scraper(scraper = TelegramTelethonScraper())
controller.scrape_channels(channels = channels, archive_media = True)
controller.scrape_all_channel_info()
etl_controller.register_transformer(TelegramTelethonTransformer())
etl_controller.transform_all_untransformed()
etl_controller.transform_all_untransformed_info()
sessionfactory = sessionmaker()
sessionfactory.configure(bind=engine)
session = sessionfactory()
posts = session.query(Post).all()
media = session.query(Media).all()
assert len(posts) == 19
# assert len(media) == 13
assert posts[16].content == "Taking pre-orders now"
# assert json.loads(media[0].exif)['Composite:ImageSize'] == "1028 1280"

View File

@@ -1,35 +0,0 @@
from sqlalchemy.orm import sessionmaker
import json
import pytest
from cisticola.base import Channel
from cisticola.scraper import TwitterScraper
from cisticola.transformer import TwitterTransformer
from cisticola.base import Post, Media
@pytest.mark.media
def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['twitter'])]
controller.register_scraper(scraper = TwitterScraper())
controller.scrape_channels(channels = channels, archive_media = True)
controller.scrape_all_channel_info()
etl_controller.register_transformer(TwitterTransformer())
etl_controller.transform_all_untransformed()
etl_controller.transform_all_untransformed_info()
sessionfactory = sessionmaker()
sessionfactory.configure(bind=engine)
session = sessionfactory()
posts = session.query(Post).all()
media = session.query(Media).all()
assert len(posts) == 12
assert len(media) == 8
assert posts[2].content == "BARN"
assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"

View File

@@ -1,35 +0,0 @@
from sqlalchemy.orm import sessionmaker
import json
import pytest
from cisticola.base import Channel
from cisticola.scraper import VkontakteScraper
from cisticola.transformer import VkontakteTransformer
from cisticola.base import Post, Media
@pytest.mark.media
def test_scrape_etl_vkontakte(engine, controller, etl_controller, channel_kwargs):
controller.reset_db()
channels = [Channel(**channel_kwargs['vkontakte'])]
controller.register_scraper(scraper = VkontakteScraper())
controller.scrape_channels(channels = channels, archive_media = True)
controller.scrape_all_channel_info()
etl_controller.register_transformer(VkontakteTransformer())
etl_controller.transform_all_untransformed()
etl_controller.transform_all_untransformed_info()
sessionfactory = sessionmaker()
sessionfactory.configure(bind=engine)
session = sessionfactory()
posts = session.query(Post).all()
media = session.query(Media).all()
assert len(posts) == 23
# assert len(media) == 0
assert 'Nigerian gender studies' in posts[-1].content
# assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"