formatted with black, added pre-commit hook, pegged typing_extensions package version to fix spaCy issue

This commit is contained in:
Tristan Lee
2023-08-04 14:51:00 -05:00
parent 070ee3391d
commit fab65a5d67
25 changed files with 3043 additions and 2176 deletions

View File

@@ -1,58 +1,64 @@
import pytest
from sqlalchemy.sql import text
from cisticola.base import Post, Channel, ChannelInfo, Media, ScraperResult, RawChannelInfo
from cisticola.base import (
Post,
Channel,
ChannelInfo,
Media,
ScraperResult,
RawChannelInfo,
)
from cisticola.scraper import (
TelegramTelethonScraper,
BitchuteScraper,
TelegramTelethonScraper,
BitchuteScraper,
GettrScraper,
RumbleScraper)
RumbleScraper,
)
from cisticola.transformer import (
TelegramTelethonTransformer,
BitchuteTransformer,
TelegramTelethonTransformer,
BitchuteTransformer,
GettrTransformer,
RumbleTransformer)
RumbleTransformer,
)
# Platform key -> the scraper and transformer classes exercised for that platform.
# test_scraper_and_transformer looks up CONTROLLERS[platform]["scraper"] and
# CONTROLLERS[platform]["transformer"], instantiates each with no arguments, and
# registers them with the scraper controller and the ETL controller respectively.
# NOTE(review): this is a diff view, so entries appear in both their pre-format
# (single-quoted) and post-format (double-quoted) spellings.
CONTROLLERS = {
'telegram' : {
'scraper': TelegramTelethonScraper,
'transformer': TelegramTelethonTransformer
"telegram": {
"scraper": TelegramTelethonScraper,
"transformer": TelegramTelethonTransformer,
},
'bitchute': {
'scraper': BitchuteScraper,
'transformer': BitchuteTransformer
},
'gettr': {
'scraper': GettrScraper,
'transformer': GettrTransformer
},
'rumble': {
'scraper': RumbleScraper,
'transformer': RumbleTransformer
}
"bitchute": {"scraper": BitchuteScraper, "transformer": BitchuteTransformer},
"gettr": {"scraper": GettrScraper, "transformer": GettrTransformer},
"rumble": {"scraper": RumbleScraper, "transformer": RumbleTransformer},
}
@pytest.mark.parametrize('platform', ['telegram','bitchute', 'gettr', 'rumble'])
def test_scraper_and_transformer(platform, session, controller, etl_controller, channel_kwargs):
@pytest.mark.parametrize("platform", ["telegram", "bitchute", "gettr", "rumble"])
def test_scraper_and_transformer(
platform, session, controller, etl_controller, channel_kwargs
):
controller.reset_db()
controller.remove_all_scrapers()
# necessary for comments/replies to be processed correctly
session.execute(text('INSERT INTO posts(id) VALUES (-1)'))
session.execute(text("INSERT INTO posts(id) VALUES (-1)"))
session.commit()
channels = [Channel(**channel_kwargs[platform])]
scraper = CONTROLLERS[platform]['scraper']
controller.register_scraper(scraper = scraper())
scraper = CONTROLLERS[platform]["scraper"]
controller.register_scraper(scraper=scraper())
controller.scrape_channels(channels = channels)
controller.scrape_channels(channels=channels)
controller.scrape_all_channel_info()
controller.archive_unarchived_media_batch()
raw_posts = session.query(ScraperResult).all()
raw_channel_info = session.query(RawChannelInfo).all()
archived_urls = session.query(ScraperResult.archived_urls).order_by(ScraperResult.date_archived.desc()).first()
archived_urls = (
session.query(ScraperResult.archived_urls)
.order_by(ScraperResult.date_archived.desc())
.first()
)
assert len(raw_posts) > 0
assert len(raw_channel_info) > 0
@@ -60,7 +66,7 @@ def test_scraper_and_transformer(platform, session, controller, etl_controller,
controller.remove_all_scrapers()
transformer = CONTROLLERS[platform]['transformer']
transformer = CONTROLLERS[platform]["transformer"]
etl_controller.register_transformer(transformer())
etl_controller.transform_all_untransformed()
@@ -73,4 +79,4 @@ def test_scraper_and_transformer(platform, session, controller, etl_controller,
assert len(posts) > 0
assert len(channel_info) > 0
assert len(media) > 0
assert len(media) > 0

View File

@@ -8,170 +8,182 @@ from cisticola.scraper import ScraperController
from cisticola.transformer import ETLController
# Keyword arguments for the Bitchute test channel. Served under the "bitchute" key
# of the channel_kwargs fixture and unpacked into Channel(**...) by
# test_scraper_and_transformer. Old (single-quoted) and new (double-quoted) diff
# lines are both shown.
BITCHUTE_CHANNEL_KWARGS = {
'name': 'bestonlinejewelrystoresusa@gmail.com (test)',
'platform_id': 'bestonlinejewelrystoresusagmailcom',
'category': 'test',
'platform': 'Bitchute',
'url': 'https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/',
'screenname': None,
'country': 'US',
'influencer': None,
'public': True,
'chat': False,
'notes': '',
'source': 'researcher'}
"name": "bestonlinejewelrystoresusa@gmail.com (test)",
"platform_id": "bestonlinejewelrystoresusagmailcom",
"category": "test",
"platform": "Bitchute",
"url": "https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/",
"screenname": None,
"country": "US",
"influencer": None,
"public": True,
"chat": False,
"notes": "",
"source": "researcher",
}
# Keyword arguments for the Gab test channel (a user account, chat=False).
# Served under the "gab" key of the channel_kwargs fixture; "gab" is not among the
# platforms parametrized in the test shown in this diff.
GAB_CHANNEL_KWARGS = {
'name': 'Capt. Marc Simon (test)',
'platform_id': 'marc_capt',
'category': 'test',
'platform': 'Gab',
'url': 'https://gab.com/marc_capt',
'screenname': 'marc_capt',
'country': 'CA',
'influencer': None,
'public': True,
'chat': False,
'notes': '',
'source': 'researcher'}
"name": "Capt. Marc Simon (test)",
"platform_id": "marc_capt",
"category": "test",
"platform": "Gab",
"url": "https://gab.com/marc_capt",
"screenname": "marc_capt",
"country": "CA",
"influencer": None,
"public": True,
"chat": False,
"notes": "",
"source": "researcher",
}
# Keyword arguments for the Gab *group* test channel, served under the "gab_group"
# key of the channel_kwargs fixture. It is the only definition here with
# chat=True, and its numeric-looking platform_id is deliberately a string.
GAB_GROUP_KWARGS = {
'name': 'iran group (test)',
'platform_id': "10001",
'category': 'test',
'platform': 'Gab',
'url': 'https://gab.com/groups/10001',
'screenname': 'iran group',
'country': 'IR',
'influencer': None,
'public': True,
'chat': True,
'notes': '',
'source': 'researcher'}
"name": "iran group (test)",
"platform_id": "10001",
"category": "test",
"platform": "Gab",
"url": "https://gab.com/groups/10001",
"screenname": "iran group",
"country": "IR",
"influencer": None,
"public": True,
"chat": True,
"notes": "",
"source": "researcher",
}
# Keyword arguments for the Gettr test channel. Served under the "gettr" key of the
# channel_kwargs fixture and unpacked into Channel(**...) by
# test_scraper_and_transformer.
GETTR_CHANNEL_KWARGS = {
'name': 'LizardRepublic (test)',
'platform_id': 'lizardrepublic',
'category': 'test',
'platform': 'Gettr',
'url': 'https://www.gettr.com/user/lizardrepublic',
'screenname': 'lizardrepublic',
'country': 'US',
'influencer': None,
'public': True,
'chat': False,
'notes': '',
'source': 'researcher'}
"name": "LizardRepublic (test)",
"platform_id": "lizardrepublic",
"category": "test",
"platform": "Gettr",
"url": "https://www.gettr.com/user/lizardrepublic",
"screenname": "lizardrepublic",
"country": "US",
"influencer": None,
"public": True,
"chat": False,
"notes": "",
"source": "researcher",
}
# Keyword arguments for the Instagram test channel, served under the "instagram"
# key of the channel_kwargs fixture; "instagram" is not among the platforms
# parametrized in the test shown in this diff.
INSTAGRAM_CHANNEL_KWARGS = {
'name': 'borland.88 (test)',
'platform_id': 'borland.88',
'category': 'test',
'platform': 'Instagram',
'url': 'https://www.instagram.com/borland.88/',
'screenname': 'borland.88',
'country': 'UA',
'influencer': None,
'public': True,
'chat': False,
'notes': '',
'source': 'researcher'}
"name": "borland.88 (test)",
"platform_id": "borland.88",
"category": "test",
"platform": "Instagram",
"url": "https://www.instagram.com/borland.88/",
"screenname": "borland.88",
"country": "UA",
"influencer": None,
"public": True,
"chat": False,
"notes": "",
"source": "researcher",
}
# Keyword arguments for the Odysee test channel, served under the "odysee" key of
# the channel_kwargs fixture. The name contains an apostrophe, which is why that
# value was already double-quoted before the reformat.
ODYSEE_CHANNEL_KWARGS = {
'name': "Mak1n' Bacon (test)",
'platform_id': 'Mak1nBacon',
'category': 'test',
'platform': 'Odysee',
'url': 'https://odysee.com/@Mak1nBacon',
'screenname': 'Mak1nBacon',
'country': 'US',
'influencer': None,
'public': True,
'chat': False,
'notes': '',
'source': 'researcher'}
"name": "Mak1n' Bacon (test)",
"platform_id": "Mak1nBacon",
"category": "test",
"platform": "Odysee",
"url": "https://odysee.com/@Mak1nBacon",
"screenname": "Mak1nBacon",
"country": "US",
"influencer": None,
"public": True,
"chat": False,
"notes": "",
"source": "researcher",
}
# Keyword arguments for the Rumble test channel. Served under the "rumble" key of
# the channel_kwargs fixture and unpacked into Channel(**...) by
# test_scraper_and_transformer.
RUMBLE_CHANNEL_KWARGS = {
'name': 'we are uploading videos wow products (test)',
'platform_id': 'c-916305',
'category': 'test',
'platform': 'Rumble',
'url': 'https://rumble.com/c/c-916305',
'screenname': 'we are uploading',
'country': 'CA',
'influencer': None,
'public': True,
'chat': False,
'notes': '',
'source': 'researcher'}
"name": "we are uploading videos wow products (test)",
"platform_id": "c-916305",
"category": "test",
"platform": "Rumble",
"url": "https://rumble.com/c/c-916305",
"screenname": "we are uploading",
"country": "CA",
"influencer": None,
"public": True,
"chat": False,
"notes": "",
"source": "researcher",
}
# Keyword arguments for the Telegram test channel. Served under the "telegram" key
# of the channel_kwargs fixture and unpacked into Channel(**...) by
# test_scraper_and_transformer. platform_id is a string holding a negative
# numeric id; the name is non-ASCII (Cyrillic).
TELEGRAM_CHANNEL_KWARGS = {
'name': 'Бутылка (test)',
'platform_id': "-1001760492118",
'category': 'test',
'platform': 'Telegram',
'url': 'https://t.me/butylka1488',
'screenname': 'butylka1488',
'country': 'RU',
'influencer': None,
'public': True,
'chat': False,
'notes': '',
'source': 'researcher'}
"name": "Бутылка (test)",
"platform_id": "-1001760492118",
"category": "test",
"platform": "Telegram",
"url": "https://t.me/butylka1488",
"screenname": "butylka1488",
"country": "RU",
"influencer": None,
"public": True,
"chat": False,
"notes": "",
"source": "researcher",
}
# Keyword arguments for the Twitter test channel, served under the "twitter" key of
# the channel_kwargs fixture; "twitter" is not among the platforms parametrized in
# the test shown in this diff. platform_id is the numeric account id as a string.
TWITTER_CHANNEL_KWARGS = {
'name': 'L Weber (test)',
'platform_id': "1424979017749442595",
'category': 'test',
'platform': 'Twitter',
'url': 'https://twitter.com/LWeber33662141',
'screenname': 'LWeber33662141',
'country': 'US',
'influencer': None,
'public': True,
'chat': False,
'notes': '',
'source': 'researcher'}
"name": "L Weber (test)",
"platform_id": "1424979017749442595",
"category": "test",
"platform": "Twitter",
"url": "https://twitter.com/LWeber33662141",
"screenname": "LWeber33662141",
"country": "US",
"influencer": None,
"public": True,
"chat": False,
"notes": "",
"source": "researcher",
}
# Keyword arguments for the Vkontakte test channel, served under the "vkontakte"
# key of the channel_kwargs fixture; "vkontakte" is not among the platforms
# parametrized in the test shown in this diff.
VKONTAKTE_CHANNEL_KWARGS = {
'name': 'Wwg1wgA (test)',
'platform_id': 'club201278078',
'category': 'test',
'platform': 'Vkontakte',
'url': 'https://vk.com/club201278078',
'screenname': 'Wwg1wgA',
'country': 'FR',
'influencer': None,
'public': True,
'chat': False,
'notes': '',
'source': 'researcher'}
"name": "Wwg1wgA (test)",
"platform_id": "club201278078",
"category": "test",
"platform": "Vkontakte",
"url": "https://vk.com/club201278078",
"screenname": "Wwg1wgA",
"country": "FR",
"influencer": None,
"public": True,
"chat": False,
"notes": "",
"source": "researcher",
}
# Keyword arguments for the Youtube test channel, served under the "youtube" key of
# the channel_kwargs fixture; "youtube" is not among the platforms parametrized in
# the test shown in this diff.
YOUTUBE_CHANNEL_KWARGS = {
'name': 'AnEs87 (test)',
'platform_id': 'UCP6exBqGoxGLv_pM9Dxk2pA',
'category': 'test',
'platform': 'Youtube',
'url': 'https://www.youtube.com/channel/UCP6exBqGoxGLv_pM9Dxk2pA',
'screenname': 'AnEs87',
'country': 'SV',
'influencer': None,
'public': True,
'chat': False,
'notes': '',
'source': 'researcher'}
"name": "AnEs87 (test)",
"platform_id": "UCP6exBqGoxGLv_pM9Dxk2pA",
"category": "test",
"platform": "Youtube",
"url": "https://www.youtube.com/channel/UCP6exBqGoxGLv_pM9Dxk2pA",
"screenname": "AnEs87",
"country": "SV",
"influencer": None,
"public": True,
"chat": False,
"notes": "",
"source": "researcher",
}
# Package-scoped fixture: builds one engine and shares it with every test in the
# package (the two decorator lines are the old/new sides of the same diff line).
# NOTE(review): the docstring says "SQLite", but the visible body only passes the
# TEST_DB environment variable (presumably a database URL) to create_engine, so
# the backend is whatever that value names; os.environ[...] raises KeyError when
# TEST_DB is unset. `tmpdir_factory` is requested but unused in the lines shown.
@pytest.fixture(scope='package')
@pytest.fixture(scope="package")
def engine(tmpdir_factory):
"""Initialize a SQLite database and SQLAlchemy engine to be used for all
tests in the package"""
engine = create_engine(os.environ["TEST_DB"])
return engine
@pytest.fixture(scope='package')
@pytest.fixture(scope="package")
def session(engine):
"""Initialize a SQLAlchemy session to be used for all tests in the package"""
@@ -179,7 +191,8 @@ def session(engine):
sessionfactory.configure(bind=engine)
return sessionfactory()
@pytest.fixture(scope='package')
@pytest.fixture(scope="package")
def controller(engine):
"""Initialize ScraperController to be used for all tests in the package."""
@@ -188,7 +201,8 @@ def controller(engine):
return scraper_controller
@pytest.fixture(scope='package')
@pytest.fixture(scope="package")
def etl_controller(engine):
"""Initialize ETLController to be used for all tests in the package."""
@@ -197,21 +211,23 @@ def etl_controller(engine):
return etl_controller
# Package-scoped fixture returning a dict that maps a lowercase platform key to the
# matching module-level *_KWARGS constant (Gab has two entries: "gab" for the user
# account and "gab_group" for the group). test_scraper_and_transformer indexes the
# dict by platform and unpacks the result into Channel(**...); only "telegram",
# "bitchute", "gettr" and "rumble" are parametrized in the test shown in this diff.
# The returned dict holds references to the shared constants, not copies.
@pytest.fixture(scope='package')
@pytest.fixture(scope="package")
def channel_kwargs():
"""Define keyword arguments to use for defining test channels for each
"""Define keyword arguments to use for defining test channels for each
platform to be scraped.
"""
return {
'bitchute' : BITCHUTE_CHANNEL_KWARGS,
'gab' : GAB_CHANNEL_KWARGS,
'gab_group' : GAB_GROUP_KWARGS,
'gettr' : GETTR_CHANNEL_KWARGS,
'instagram' : INSTAGRAM_CHANNEL_KWARGS,
'odysee' : ODYSEE_CHANNEL_KWARGS,
'rumble' : RUMBLE_CHANNEL_KWARGS,
'telegram' : TELEGRAM_CHANNEL_KWARGS,
'twitter' : TWITTER_CHANNEL_KWARGS,
'vkontakte' : VKONTAKTE_CHANNEL_KWARGS,
'youtube' : YOUTUBE_CHANNEL_KWARGS}
"bitchute": BITCHUTE_CHANNEL_KWARGS,
"gab": GAB_CHANNEL_KWARGS,
"gab_group": GAB_GROUP_KWARGS,
"gettr": GETTR_CHANNEL_KWARGS,
"instagram": INSTAGRAM_CHANNEL_KWARGS,
"odysee": ODYSEE_CHANNEL_KWARGS,
"rumble": RUMBLE_CHANNEL_KWARGS,
"telegram": TELEGRAM_CHANNEL_KWARGS,
"twitter": TWITTER_CHANNEL_KWARGS,
"vkontakte": VKONTAKTE_CHANNEL_KWARGS,
"youtube": YOUTUBE_CHANNEL_KWARGS,
}