Formatted with black, added a pre-commit hook, and pinned the typing_extensions package version to fix a spaCy issue

This commit is contained in:
Tristan Lee
2023-08-04 14:51:00 -05:00
parent 070ee3391d
commit fab65a5d67
25 changed files with 3043 additions and 2176 deletions

View File

@@ -1,58 +1,64 @@
import pytest
from sqlalchemy.sql import text
from cisticola.base import Post, Channel, ChannelInfo, Media, ScraperResult, RawChannelInfo
from cisticola.base import (
Post,
Channel,
ChannelInfo,
Media,
ScraperResult,
RawChannelInfo,
)
from cisticola.scraper import (
TelegramTelethonScraper,
BitchuteScraper,
TelegramTelethonScraper,
BitchuteScraper,
GettrScraper,
RumbleScraper)
RumbleScraper,
)
from cisticola.transformer import (
TelegramTelethonTransformer,
BitchuteTransformer,
TelegramTelethonTransformer,
BitchuteTransformer,
GettrTransformer,
RumbleTransformer)
RumbleTransformer,
)
CONTROLLERS = {
'telegram' : {
'scraper': TelegramTelethonScraper,
'transformer': TelegramTelethonTransformer
"telegram": {
"scraper": TelegramTelethonScraper,
"transformer": TelegramTelethonTransformer,
},
'bitchute': {
'scraper': BitchuteScraper,
'transformer': BitchuteTransformer
},
'gettr': {
'scraper': GettrScraper,
'transformer': GettrTransformer
},
'rumble': {
'scraper': RumbleScraper,
'transformer': RumbleTransformer
}
"bitchute": {"scraper": BitchuteScraper, "transformer": BitchuteTransformer},
"gettr": {"scraper": GettrScraper, "transformer": GettrTransformer},
"rumble": {"scraper": RumbleScraper, "transformer": RumbleTransformer},
}
@pytest.mark.parametrize('platform', ['telegram','bitchute', 'gettr', 'rumble'])
def test_scraper_and_transformer(platform, session, controller, etl_controller, channel_kwargs):
@pytest.mark.parametrize("platform", ["telegram", "bitchute", "gettr", "rumble"])
def test_scraper_and_transformer(
platform, session, controller, etl_controller, channel_kwargs
):
controller.reset_db()
controller.remove_all_scrapers()
# necessary for comments/replies to be processed correctly
session.execute(text('INSERT INTO posts(id) VALUES (-1)'))
session.execute(text("INSERT INTO posts(id) VALUES (-1)"))
session.commit()
channels = [Channel(**channel_kwargs[platform])]
scraper = CONTROLLERS[platform]['scraper']
controller.register_scraper(scraper = scraper())
scraper = CONTROLLERS[platform]["scraper"]
controller.register_scraper(scraper=scraper())
controller.scrape_channels(channels = channels)
controller.scrape_channels(channels=channels)
controller.scrape_all_channel_info()
controller.archive_unarchived_media_batch()
raw_posts = session.query(ScraperResult).all()
raw_channel_info = session.query(RawChannelInfo).all()
archived_urls = session.query(ScraperResult.archived_urls).order_by(ScraperResult.date_archived.desc()).first()
archived_urls = (
session.query(ScraperResult.archived_urls)
.order_by(ScraperResult.date_archived.desc())
.first()
)
assert len(raw_posts) > 0
assert len(raw_channel_info) > 0
@@ -60,7 +66,7 @@ def test_scraper_and_transformer(platform, session, controller, etl_controller,
controller.remove_all_scrapers()
transformer = CONTROLLERS[platform]['transformer']
transformer = CONTROLLERS[platform]["transformer"]
etl_controller.register_transformer(transformer())
etl_controller.transform_all_untransformed()
@@ -73,4 +79,4 @@ def test_scraper_and_transformer(platform, session, controller, etl_controller,
assert len(posts) > 0
assert len(channel_info) > 0
assert len(media) > 0
assert len(media) > 0