mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
formatted with black, added pre-commit hook, pegged typing_extensions package version to fix spaCy issue
This commit is contained in:
@@ -1,58 +1,64 @@
|
||||
import pytest
|
||||
from sqlalchemy.sql import text
|
||||
|
||||
from cisticola.base import Post, Channel, ChannelInfo, Media, ScraperResult, RawChannelInfo
|
||||
from cisticola.base import (
|
||||
Post,
|
||||
Channel,
|
||||
ChannelInfo,
|
||||
Media,
|
||||
ScraperResult,
|
||||
RawChannelInfo,
|
||||
)
|
||||
from cisticola.scraper import (
|
||||
TelegramTelethonScraper,
|
||||
BitchuteScraper,
|
||||
TelegramTelethonScraper,
|
||||
BitchuteScraper,
|
||||
GettrScraper,
|
||||
RumbleScraper)
|
||||
RumbleScraper,
|
||||
)
|
||||
from cisticola.transformer import (
|
||||
TelegramTelethonTransformer,
|
||||
BitchuteTransformer,
|
||||
TelegramTelethonTransformer,
|
||||
BitchuteTransformer,
|
||||
GettrTransformer,
|
||||
RumbleTransformer)
|
||||
RumbleTransformer,
|
||||
)
|
||||
|
||||
CONTROLLERS = {
|
||||
'telegram' : {
|
||||
'scraper': TelegramTelethonScraper,
|
||||
'transformer': TelegramTelethonTransformer
|
||||
"telegram": {
|
||||
"scraper": TelegramTelethonScraper,
|
||||
"transformer": TelegramTelethonTransformer,
|
||||
},
|
||||
'bitchute': {
|
||||
'scraper': BitchuteScraper,
|
||||
'transformer': BitchuteTransformer
|
||||
},
|
||||
'gettr': {
|
||||
'scraper': GettrScraper,
|
||||
'transformer': GettrTransformer
|
||||
},
|
||||
'rumble': {
|
||||
'scraper': RumbleScraper,
|
||||
'transformer': RumbleTransformer
|
||||
}
|
||||
"bitchute": {"scraper": BitchuteScraper, "transformer": BitchuteTransformer},
|
||||
"gettr": {"scraper": GettrScraper, "transformer": GettrTransformer},
|
||||
"rumble": {"scraper": RumbleScraper, "transformer": RumbleTransformer},
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.parametrize('platform', ['telegram','bitchute', 'gettr', 'rumble'])
|
||||
def test_scraper_and_transformer(platform, session, controller, etl_controller, channel_kwargs):
|
||||
|
||||
@pytest.mark.parametrize("platform", ["telegram", "bitchute", "gettr", "rumble"])
|
||||
def test_scraper_and_transformer(
|
||||
platform, session, controller, etl_controller, channel_kwargs
|
||||
):
|
||||
controller.reset_db()
|
||||
controller.remove_all_scrapers()
|
||||
|
||||
# necessary for comments/replies to be processed correctly
|
||||
session.execute(text('INSERT INTO posts(id) VALUES (-1)'))
|
||||
session.execute(text("INSERT INTO posts(id) VALUES (-1)"))
|
||||
session.commit()
|
||||
|
||||
channels = [Channel(**channel_kwargs[platform])]
|
||||
scraper = CONTROLLERS[platform]['scraper']
|
||||
controller.register_scraper(scraper = scraper())
|
||||
scraper = CONTROLLERS[platform]["scraper"]
|
||||
controller.register_scraper(scraper=scraper())
|
||||
|
||||
controller.scrape_channels(channels = channels)
|
||||
controller.scrape_channels(channels=channels)
|
||||
controller.scrape_all_channel_info()
|
||||
controller.archive_unarchived_media_batch()
|
||||
|
||||
raw_posts = session.query(ScraperResult).all()
|
||||
raw_channel_info = session.query(RawChannelInfo).all()
|
||||
archived_urls = session.query(ScraperResult.archived_urls).order_by(ScraperResult.date_archived.desc()).first()
|
||||
archived_urls = (
|
||||
session.query(ScraperResult.archived_urls)
|
||||
.order_by(ScraperResult.date_archived.desc())
|
||||
.first()
|
||||
)
|
||||
|
||||
assert len(raw_posts) > 0
|
||||
assert len(raw_channel_info) > 0
|
||||
@@ -60,7 +66,7 @@ def test_scraper_and_transformer(platform, session, controller, etl_controller,
|
||||
|
||||
controller.remove_all_scrapers()
|
||||
|
||||
transformer = CONTROLLERS[platform]['transformer']
|
||||
transformer = CONTROLLERS[platform]["transformer"]
|
||||
|
||||
etl_controller.register_transformer(transformer())
|
||||
etl_controller.transform_all_untransformed()
|
||||
@@ -73,4 +79,4 @@ def test_scraper_and_transformer(platform, session, controller, etl_controller,
|
||||
|
||||
assert len(posts) > 0
|
||||
assert len(channel_info) > 0
|
||||
assert len(media) > 0
|
||||
assert len(media) > 0
|
||||
|
||||
@@ -8,170 +8,182 @@ from cisticola.scraper import ScraperController
|
||||
from cisticola.transformer import ETLController
|
||||
|
||||
BITCHUTE_CHANNEL_KWARGS = {
|
||||
'name': 'bestonlinejewelrystoresusa@gmail.com (test)',
|
||||
'platform_id': 'bestonlinejewelrystoresusagmailcom',
|
||||
'category': 'test',
|
||||
'platform': 'Bitchute',
|
||||
'url': 'https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/',
|
||||
'screenname': None,
|
||||
'country': 'US',
|
||||
'influencer': None,
|
||||
'public': True,
|
||||
'chat': False,
|
||||
'notes': '',
|
||||
'source': 'researcher'}
|
||||
"name": "bestonlinejewelrystoresusa@gmail.com (test)",
|
||||
"platform_id": "bestonlinejewelrystoresusagmailcom",
|
||||
"category": "test",
|
||||
"platform": "Bitchute",
|
||||
"url": "https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/",
|
||||
"screenname": None,
|
||||
"country": "US",
|
||||
"influencer": None,
|
||||
"public": True,
|
||||
"chat": False,
|
||||
"notes": "",
|
||||
"source": "researcher",
|
||||
}
|
||||
|
||||
GAB_CHANNEL_KWARGS = {
|
||||
'name': 'Capt. Marc Simon (test)',
|
||||
'platform_id': 'marc_capt',
|
||||
'category': 'test',
|
||||
'platform': 'Gab',
|
||||
'url': 'https://gab.com/marc_capt',
|
||||
'screenname': 'marc_capt',
|
||||
'country': 'CA',
|
||||
'influencer': None,
|
||||
'public': True,
|
||||
'chat': False,
|
||||
'notes': '',
|
||||
'source': 'researcher'}
|
||||
"name": "Capt. Marc Simon (test)",
|
||||
"platform_id": "marc_capt",
|
||||
"category": "test",
|
||||
"platform": "Gab",
|
||||
"url": "https://gab.com/marc_capt",
|
||||
"screenname": "marc_capt",
|
||||
"country": "CA",
|
||||
"influencer": None,
|
||||
"public": True,
|
||||
"chat": False,
|
||||
"notes": "",
|
||||
"source": "researcher",
|
||||
}
|
||||
|
||||
GAB_GROUP_KWARGS = {
|
||||
'name': 'iran group (test)',
|
||||
'platform_id': "10001",
|
||||
'category': 'test',
|
||||
'platform': 'Gab',
|
||||
'url': 'https://gab.com/groups/10001',
|
||||
'screenname': 'iran group',
|
||||
'country': 'IR',
|
||||
'influencer': None,
|
||||
'public': True,
|
||||
'chat': True,
|
||||
'notes': '',
|
||||
'source': 'researcher'}
|
||||
"name": "iran group (test)",
|
||||
"platform_id": "10001",
|
||||
"category": "test",
|
||||
"platform": "Gab",
|
||||
"url": "https://gab.com/groups/10001",
|
||||
"screenname": "iran group",
|
||||
"country": "IR",
|
||||
"influencer": None,
|
||||
"public": True,
|
||||
"chat": True,
|
||||
"notes": "",
|
||||
"source": "researcher",
|
||||
}
|
||||
|
||||
GETTR_CHANNEL_KWARGS = {
|
||||
'name': 'LizardRepublic (test)',
|
||||
'platform_id': 'lizardrepublic',
|
||||
'category': 'test',
|
||||
'platform': 'Gettr',
|
||||
'url': 'https://www.gettr.com/user/lizardrepublic',
|
||||
'screenname': 'lizardrepublic',
|
||||
'country': 'US',
|
||||
'influencer': None,
|
||||
'public': True,
|
||||
'chat': False,
|
||||
'notes': '',
|
||||
'source': 'researcher'}
|
||||
"name": "LizardRepublic (test)",
|
||||
"platform_id": "lizardrepublic",
|
||||
"category": "test",
|
||||
"platform": "Gettr",
|
||||
"url": "https://www.gettr.com/user/lizardrepublic",
|
||||
"screenname": "lizardrepublic",
|
||||
"country": "US",
|
||||
"influencer": None,
|
||||
"public": True,
|
||||
"chat": False,
|
||||
"notes": "",
|
||||
"source": "researcher",
|
||||
}
|
||||
|
||||
INSTAGRAM_CHANNEL_KWARGS = {
|
||||
'name': 'borland.88 (test)',
|
||||
'platform_id': 'borland.88',
|
||||
'category': 'test',
|
||||
'platform': 'Instagram',
|
||||
'url': 'https://www.instagram.com/borland.88/',
|
||||
'screenname': 'borland.88',
|
||||
'country': 'UA',
|
||||
'influencer': None,
|
||||
'public': True,
|
||||
'chat': False,
|
||||
'notes': '',
|
||||
'source': 'researcher'}
|
||||
"name": "borland.88 (test)",
|
||||
"platform_id": "borland.88",
|
||||
"category": "test",
|
||||
"platform": "Instagram",
|
||||
"url": "https://www.instagram.com/borland.88/",
|
||||
"screenname": "borland.88",
|
||||
"country": "UA",
|
||||
"influencer": None,
|
||||
"public": True,
|
||||
"chat": False,
|
||||
"notes": "",
|
||||
"source": "researcher",
|
||||
}
|
||||
|
||||
ODYSEE_CHANNEL_KWARGS = {
|
||||
'name': "Mak1n' Bacon (test)",
|
||||
'platform_id': 'Mak1nBacon',
|
||||
'category': 'test',
|
||||
'platform': 'Odysee',
|
||||
'url': 'https://odysee.com/@Mak1nBacon',
|
||||
'screenname': 'Mak1nBacon',
|
||||
'country': 'US',
|
||||
'influencer': None,
|
||||
'public': True,
|
||||
'chat': False,
|
||||
'notes': '',
|
||||
'source': 'researcher'}
|
||||
"name": "Mak1n' Bacon (test)",
|
||||
"platform_id": "Mak1nBacon",
|
||||
"category": "test",
|
||||
"platform": "Odysee",
|
||||
"url": "https://odysee.com/@Mak1nBacon",
|
||||
"screenname": "Mak1nBacon",
|
||||
"country": "US",
|
||||
"influencer": None,
|
||||
"public": True,
|
||||
"chat": False,
|
||||
"notes": "",
|
||||
"source": "researcher",
|
||||
}
|
||||
|
||||
RUMBLE_CHANNEL_KWARGS = {
|
||||
'name': 'we are uploading videos wow products (test)',
|
||||
'platform_id': 'c-916305',
|
||||
'category': 'test',
|
||||
'platform': 'Rumble',
|
||||
'url': 'https://rumble.com/c/c-916305',
|
||||
'screenname': 'we are uploading',
|
||||
'country': 'CA',
|
||||
'influencer': None,
|
||||
'public': True,
|
||||
'chat': False,
|
||||
'notes': '',
|
||||
'source': 'researcher'}
|
||||
"name": "we are uploading videos wow products (test)",
|
||||
"platform_id": "c-916305",
|
||||
"category": "test",
|
||||
"platform": "Rumble",
|
||||
"url": "https://rumble.com/c/c-916305",
|
||||
"screenname": "we are uploading",
|
||||
"country": "CA",
|
||||
"influencer": None,
|
||||
"public": True,
|
||||
"chat": False,
|
||||
"notes": "",
|
||||
"source": "researcher",
|
||||
}
|
||||
|
||||
TELEGRAM_CHANNEL_KWARGS = {
|
||||
'name': 'Бутылка (test)',
|
||||
'platform_id': "-1001760492118",
|
||||
'category': 'test',
|
||||
'platform': 'Telegram',
|
||||
'url': 'https://t.me/butylka1488',
|
||||
'screenname': 'butylka1488',
|
||||
'country': 'RU',
|
||||
'influencer': None,
|
||||
'public': True,
|
||||
'chat': False,
|
||||
'notes': '',
|
||||
'source': 'researcher'}
|
||||
|
||||
"name": "Бутылка (test)",
|
||||
"platform_id": "-1001760492118",
|
||||
"category": "test",
|
||||
"platform": "Telegram",
|
||||
"url": "https://t.me/butylka1488",
|
||||
"screenname": "butylka1488",
|
||||
"country": "RU",
|
||||
"influencer": None,
|
||||
"public": True,
|
||||
"chat": False,
|
||||
"notes": "",
|
||||
"source": "researcher",
|
||||
}
|
||||
|
||||
TWITTER_CHANNEL_KWARGS = {
|
||||
'name': 'L Weber (test)',
|
||||
'platform_id': "1424979017749442595",
|
||||
'category': 'test',
|
||||
'platform': 'Twitter',
|
||||
'url': 'https://twitter.com/LWeber33662141',
|
||||
'screenname': 'LWeber33662141',
|
||||
'country': 'US',
|
||||
'influencer': None,
|
||||
'public': True,
|
||||
'chat': False,
|
||||
'notes': '',
|
||||
'source': 'researcher'}
|
||||
"name": "L Weber (test)",
|
||||
"platform_id": "1424979017749442595",
|
||||
"category": "test",
|
||||
"platform": "Twitter",
|
||||
"url": "https://twitter.com/LWeber33662141",
|
||||
"screenname": "LWeber33662141",
|
||||
"country": "US",
|
||||
"influencer": None,
|
||||
"public": True,
|
||||
"chat": False,
|
||||
"notes": "",
|
||||
"source": "researcher",
|
||||
}
|
||||
|
||||
VKONTAKTE_CHANNEL_KWARGS = {
|
||||
'name': 'Wwg1wgA (test)',
|
||||
'platform_id': 'club201278078',
|
||||
'category': 'test',
|
||||
'platform': 'Vkontakte',
|
||||
'url': 'https://vk.com/club201278078',
|
||||
'screenname': 'Wwg1wgA',
|
||||
'country': 'FR',
|
||||
'influencer': None,
|
||||
'public': True,
|
||||
'chat': False,
|
||||
'notes': '',
|
||||
'source': 'researcher'}
|
||||
"name": "Wwg1wgA (test)",
|
||||
"platform_id": "club201278078",
|
||||
"category": "test",
|
||||
"platform": "Vkontakte",
|
||||
"url": "https://vk.com/club201278078",
|
||||
"screenname": "Wwg1wgA",
|
||||
"country": "FR",
|
||||
"influencer": None,
|
||||
"public": True,
|
||||
"chat": False,
|
||||
"notes": "",
|
||||
"source": "researcher",
|
||||
}
|
||||
|
||||
YOUTUBE_CHANNEL_KWARGS = {
|
||||
'name': 'AnEs87 (test)',
|
||||
'platform_id': 'UCP6exBqGoxGLv_pM9Dxk2pA',
|
||||
'category': 'test',
|
||||
'platform': 'Youtube',
|
||||
'url': 'https://www.youtube.com/channel/UCP6exBqGoxGLv_pM9Dxk2pA',
|
||||
'screenname': 'AnEs87',
|
||||
'country': 'SV',
|
||||
'influencer': None,
|
||||
'public': True,
|
||||
'chat': False,
|
||||
'notes': '',
|
||||
'source': 'researcher'}
|
||||
"name": "AnEs87 (test)",
|
||||
"platform_id": "UCP6exBqGoxGLv_pM9Dxk2pA",
|
||||
"category": "test",
|
||||
"platform": "Youtube",
|
||||
"url": "https://www.youtube.com/channel/UCP6exBqGoxGLv_pM9Dxk2pA",
|
||||
"screenname": "AnEs87",
|
||||
"country": "SV",
|
||||
"influencer": None,
|
||||
"public": True,
|
||||
"chat": False,
|
||||
"notes": "",
|
||||
"source": "researcher",
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture(scope='package')
|
||||
@pytest.fixture(scope="package")
|
||||
def engine(tmpdir_factory):
|
||||
"""Initialize a SQLite database and SQLAlchemy engine to be used for all
|
||||
tests in the package"""
|
||||
|
||||
engine = create_engine(os.environ["TEST_DB"])
|
||||
|
||||
|
||||
return engine
|
||||
|
||||
@pytest.fixture(scope='package')
|
||||
|
||||
@pytest.fixture(scope="package")
|
||||
def session(engine):
|
||||
"""Initialize a SQLAlchemy session to be used for all tests in the package"""
|
||||
|
||||
@@ -179,7 +191,8 @@ def session(engine):
|
||||
sessionfactory.configure(bind=engine)
|
||||
return sessionfactory()
|
||||
|
||||
@pytest.fixture(scope='package')
|
||||
|
||||
@pytest.fixture(scope="package")
|
||||
def controller(engine):
|
||||
"""Initialize ScraperController to be used for all tests in the package."""
|
||||
|
||||
@@ -188,7 +201,8 @@ def controller(engine):
|
||||
|
||||
return scraper_controller
|
||||
|
||||
@pytest.fixture(scope='package')
|
||||
|
||||
@pytest.fixture(scope="package")
|
||||
def etl_controller(engine):
|
||||
"""Initialize ETLController to be used for all tests in the package."""
|
||||
|
||||
@@ -197,21 +211,23 @@ def etl_controller(engine):
|
||||
|
||||
return etl_controller
|
||||
|
||||
@pytest.fixture(scope='package')
|
||||
|
||||
@pytest.fixture(scope="package")
|
||||
def channel_kwargs():
|
||||
"""Define keyword arguments to use for defining test channels for each
|
||||
"""Define keyword arguments to use for defining test channels for each
|
||||
platform to be scraped.
|
||||
"""
|
||||
|
||||
return {
|
||||
'bitchute' : BITCHUTE_CHANNEL_KWARGS,
|
||||
'gab' : GAB_CHANNEL_KWARGS,
|
||||
'gab_group' : GAB_GROUP_KWARGS,
|
||||
'gettr' : GETTR_CHANNEL_KWARGS,
|
||||
'instagram' : INSTAGRAM_CHANNEL_KWARGS,
|
||||
'odysee' : ODYSEE_CHANNEL_KWARGS,
|
||||
'rumble' : RUMBLE_CHANNEL_KWARGS,
|
||||
'telegram' : TELEGRAM_CHANNEL_KWARGS,
|
||||
'twitter' : TWITTER_CHANNEL_KWARGS,
|
||||
'vkontakte' : VKONTAKTE_CHANNEL_KWARGS,
|
||||
'youtube' : YOUTUBE_CHANNEL_KWARGS}
|
||||
"bitchute": BITCHUTE_CHANNEL_KWARGS,
|
||||
"gab": GAB_CHANNEL_KWARGS,
|
||||
"gab_group": GAB_GROUP_KWARGS,
|
||||
"gettr": GETTR_CHANNEL_KWARGS,
|
||||
"instagram": INSTAGRAM_CHANNEL_KWARGS,
|
||||
"odysee": ODYSEE_CHANNEL_KWARGS,
|
||||
"rumble": RUMBLE_CHANNEL_KWARGS,
|
||||
"telegram": TELEGRAM_CHANNEL_KWARGS,
|
||||
"twitter": TWITTER_CHANNEL_KWARGS,
|
||||
"vkontakte": VKONTAKTE_CHANNEL_KWARGS,
|
||||
"youtube": YOUTUBE_CHANNEL_KWARGS,
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user