From 67d1abf024d3d498f9994f9a1a93f820014b8b53 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Mon, 28 Mar 2022 21:11:34 -0500 Subject: [PATCH] added methods for extracting channel profile metadata, and tests --- cisticola/scraper/bitchute.py | 25 +++++++++++++++++++++---- cisticola/scraper/rumble.py | 9 +++++---- cisticola/scraper/telegram_telethon.py | 2 +- pytest.ini | 3 +++ tests/scraper/gab.py | 9 +++++++++ tests/scraper/gettr.py | 9 +++++++++ tests/scraper/instagram.py | 9 +++++++++ tests/scraper/odysee.py | 9 +++++++++ tests/scraper/rumble.py | 9 +++++++++ tests/scraper/telegram_snscrape.py | 9 +++++++++ tests/scraper/telegram_telethon.py | 9 +++++++++ tests/scraper/twitter.py | 9 +++++++++ tests/scraper/vkontakte.py | 9 +++++++++ tests/scraper/youtube.py | 9 +++++++++ 14 files changed, 120 insertions(+), 9 deletions(-) diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index a3103a6..d8d3f0b 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -66,12 +66,13 @@ class BitchuteScraper(Scraper): def get_profile(self, channel: Channel) -> dict: - base_url = "https://www.bitchute.com/channel/%s/" % channel.url + base_url = channel.url session = requests.session() response = session.get(base_url) soup = BeautifulSoup(response.content, 'html.parser') + canonical_url = soup.find('link', {'id' : 'canonical'})['href'] csrftoken = session.cookies['csrftoken'] csrfmiddlewaretoken = soup.find('input', {'name' : 'csrfmiddlewaretoken'})['value'] @@ -84,7 +85,7 @@ class BitchuteScraper(Scraper): 'csrftoken': csrftoken, 'csrfmiddlewaretoken': csrfmiddlewaretoken} - response = session.post(base_url + 'counts/', data = data, headers = headers) + response = session.post(canonical_url + 'counts/', data = data, headers = headers) counts = json.loads(response.text) profile = { @@ -93,9 +94,9 @@ class BitchuteScraper(Scraper): 'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')), 'videos' : int(info_list[1].text.split('videos')[0].strip()), 'owner_url' : soup.find('p', {'class' : 'owner'}).find('a', href = True)['href'], - 'owner_name' : soup.find('p', {'class' : 'owner'}).text, + 'owner_name' : decode_cfemail(soup.find('p', {'class' : 'owner'}).find('span', {'class': "__cf_email__"})['data-cfemail']), 'category' : info_list[-1].text.split('Category')[1].strip(), - 'image' : about_soup.find('img', {'alt' : 'Channel Image'})['data-src'], + 'image' : about_soup.find('img', {'alt' : 'Channel Image'}).get('data-src'), 'subscribers': counts['subscriber_count'], 'views': int(counts['about_view_count'].split(' ')[0])} @@ -456,4 +457,20 @@ def get_videos_user(session, user, csrftoken, detail): # these need to be yielded *after* the video because else the result file will have the comments # before the video, which is weird yield comment + #-----------------------------------------------------------------------------# + +def decode_cfemail(cfemail): + + """https://stackoverflow.com/questions/36911296/scraping-of-protected-email + """ + + email = "" + k = int(cfemail[:2], 16) + + for i in range(2, len(cfemail)-1, 2): + email += chr(int(cfemail[i:i+2], 16)^k) + + return email + +#---------------------------------------------------------------------------# \ No newline at end of file diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py index 252239e..32e40e8 100644 --- a/cisticola/scraper/rumble.py +++ b/cisticola/scraper/rumble.py @@ -125,14 +125,15 @@ def get_channel_profile(username): soup = BeautifulSoup(r.content, features = 'lxml') verified_svg = soup.find('h1').find('svg', {'class' : 'listing-header--verified'}) + thumbnail_soup = soup.find('img', {'class' : 'listing-header--thumb'}) + cover_soup = soup.find('img', {'class' : 'listing-header--backsplash-img'}) profile = { 'name': soup.find('h1').text, 'verified': verified_svg is not None, - 'thumbnail': soup.find('img', {'class' : 'listing-header--thumb'})['src'], - 'cover': soup.find('img', {'class' : 'listing-header--backsplash-img'})['src'], - 'subscribers': soup.find('span', {'class' : 'subscribe-button-count'}).text} - + 'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None, + 'cover': cover_soup.get('src') if cover_soup else None, + 'subscribers': int(soup.find('span', {'class' : 'subscribe-button-count'}).text)} return profile #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file diff --git a/cisticola/scraper/telegram_telethon.py b/cisticola/scraper/telegram_telethon.py index e02ccde..b300551 100644 --- a/cisticola/scraper/telegram_telethon.py +++ b/cisticola/scraper/telegram_telethon.py @@ -138,7 +138,7 @@ class TelegramTelethonScraper(Scraper): date_archived=datetime.now(timezone.utc), raw_data=json.dumps(post.to_dict(), default=str), archived_urls=archived_urls, - media_archived=archive_media)) + media_archived=archive_media) def get_profile(self, channel: Channel) -> dict: diff --git a/pytest.ini b/pytest.ini index f3545f6..8d9973f 100644 --- a/pytest.ini +++ b/pytest.ini @@ -11,6 +11,9 @@ addopts = --cov-report html:reports/coverage --html='reports/tests.html' --self-contained-html +markers = + profile: marks tests for only extracting channel metadata (deselect with '-m + "not profile"') filterwarnings = ignore:the imp module is deprecated:DeprecationWarning ignore:The localize method is no longer necessary, as this time zone supports the fold attribute diff --git a/tests/scraper/gab.py b/tests/scraper/gab.py index c864c37..943f40f 100644 --- a/tests/scraper/gab.py +++ b/tests/scraper/gab.py @@ -1,3 +1,5 @@ +import pytest + from cisticola.base import Channel from cisticola.scraper import GabScraper @@ -14,3 +16,10 @@ def test_scrape_gab_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['gab'])] controller.register_scraper(scraper = GabScraper()) controller.scrape_channels(channels = channels, archive_media = True) + +@pytest.mark.profile +def test_scrape_gab_profile(channel_kwargs): + + scraper = GabScraper() + channel = Channel(**channel_kwargs['gab']) + scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/scraper/gettr.py b/tests/scraper/gettr.py index 7dd2f24..6a3b70e 100644 --- a/tests/scraper/gettr.py +++ b/tests/scraper/gettr.py @@ -1,3 +1,5 @@ +import pytest + from cisticola.base import Channel from cisticola.scraper import GettrScraper @@ -14,3 +16,10 @@ def test_scrape_gettr_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['gettr'])] controller.register_scraper(scraper = GettrScraper()) controller.scrape_channels(channels = channels, archive_media = True) + +@pytest.mark.profile +def test_scrape_gettr_profile(channel_kwargs): + + scraper = GettrScraper() + channel = Channel(**channel_kwargs['gettr']) + scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/scraper/instagram.py b/tests/scraper/instagram.py index 0beb546..840d6fa 100644 --- a/tests/scraper/instagram.py +++ b/tests/scraper/instagram.py @@ -1,3 +1,5 @@ +import pytest + from cisticola.base import Channel from cisticola.scraper import InstagramScraper @@ -14,3 +16,10 @@ def test_scrape_instagram_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['instagram'])] controller.register_scraper(scraper = InstagramScraper()) controller.scrape_channels(channels = channels, archive_media = True) + +@pytest.mark.profile +def test_scrape_instagram_profile(channel_kwargs): + + scraper = InstagramScraper() + channel = Channel(**channel_kwargs['instagram']) + scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/scraper/odysee.py b/tests/scraper/odysee.py index f97700e..8eba07d 100644 --- a/tests/scraper/odysee.py +++ b/tests/scraper/odysee.py @@ -1,3 +1,5 @@ +import pytest + from cisticola.base import Channel from cisticola.scraper import OdyseeScraper @@ -14,3 +16,10 @@ def test_scrape_odysee_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['odysee'])] controller.register_scraper(scraper = OdyseeScraper()) controller.scrape_channels(channels = channels, archive_media = True) + +@pytest.mark.profile +def test_scrape_odysee_profile(channel_kwargs): + + scraper = OdyseeScraper() + channel = Channel(**channel_kwargs['odysee']) + scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/scraper/rumble.py b/tests/scraper/rumble.py index 5f640e5..f64b24f 100644 --- a/tests/scraper/rumble.py +++ b/tests/scraper/rumble.py @@ -1,3 +1,5 @@ +import pytest + from cisticola.base import Channel from cisticola.scraper import RumbleScraper @@ -14,3 +16,10 @@ def test_scrape_rumble_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['rumble'])] controller.register_scraper(scraper = RumbleScraper()) controller.scrape_channels(channels = channels, archive_media = True) + +@pytest.mark.profile +def test_scrape_rumble_profile(channel_kwargs): + + scraper = RumbleScraper() + channel = Channel(**channel_kwargs['rumble']) + scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/scraper/telegram_snscrape.py b/tests/scraper/telegram_snscrape.py index 3848780..420b917 100644 --- a/tests/scraper/telegram_snscrape.py +++ b/tests/scraper/telegram_snscrape.py @@ -1,3 +1,5 @@ +import pytest + from cisticola.base import Channel from cisticola.scraper import TelegramSnscrapeScraper @@ -14,3 +16,10 @@ def test_scrape_telegram_snscrape_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['telegram'])] controller.register_scraper(scraper = TelegramSnscrapeScraper()) controller.scrape_channels(channels = channels, archive_media = True) + +@pytest.mark.profile +def test_scrape_telegram_snscrape_profile(channel_kwargs): + + scraper = TelegramSnscrapeScraper() + channel = Channel(**channel_kwargs['telegram']) + scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/scraper/telegram_telethon.py b/tests/scraper/telegram_telethon.py index c015631..1942fca 100644 --- a/tests/scraper/telegram_telethon.py +++ b/tests/scraper/telegram_telethon.py @@ -1,3 +1,5 @@ +import pytest + from cisticola.base import Channel from cisticola.scraper import TelegramTelethonScraper @@ -14,3 +16,10 @@ def test_scrape_telegram_telethon_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['telegram'])] controller.register_scraper(scraper = TelegramTelethonScraper()) controller.scrape_channels(channels = channels, archive_media = True) + +@pytest.mark.profile +def test_scrape_telegram_telethon_profile(channel_kwargs): + + scraper = TelegramTelethonScraper() + channel = Channel(**channel_kwargs['telegram']) + scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/scraper/twitter.py b/tests/scraper/twitter.py index bd79a6a..7512b6a 100644 --- a/tests/scraper/twitter.py +++ b/tests/scraper/twitter.py @@ -1,3 +1,5 @@ +import pytest + from cisticola.base import Channel from cisticola.scraper import TwitterScraper @@ -14,3 +16,10 @@ def test_scrape_twitter_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['twitter'])] controller.register_scraper(scraper = TwitterScraper()) controller.scrape_channels(channels = channels, archive_media = True) + +@pytest.mark.profile +def test_scrape_twitter_profile(channel_kwargs): + + scraper = TwitterScraper() + channel = Channel(**channel_kwargs['twitter']) + scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/scraper/vkontakte.py b/tests/scraper/vkontakte.py index ef7cfa1..8b0b757 100644 --- a/tests/scraper/vkontakte.py +++ b/tests/scraper/vkontakte.py @@ -1,3 +1,5 @@ +import pytest + from cisticola.base import Channel from cisticola.scraper import VkontakteScraper @@ -14,3 +16,10 @@ def test_scrape_vkontakte_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['vkontakte'])] controller.register_scraper(scraper = VkontakteScraper()) controller.scrape_channels(channels = channels, archive_media = True) + +@pytest.mark.profile +def test_scrape_vkontakte_profile(channel_kwargs): + + scraper = VkontakteScraper() + channel = Channel(**channel_kwargs['vkontakte']) + scraper.get_profile(channel=channel) \ No newline at end of file diff --git a/tests/scraper/youtube.py b/tests/scraper/youtube.py index 9d14760..e987cb8 100644 --- a/tests/scraper/youtube.py +++ b/tests/scraper/youtube.py @@ -1,3 +1,5 @@ +import pytest + from cisticola.base import Channel from cisticola.scraper import YoutubeScraper @@ -14,3 +16,10 @@ def test_scrape_youtube_channel(controller, channel_kwargs): channels = [Channel(**channel_kwargs['youtube'])] controller.register_scraper(scraper = YoutubeScraper()) controller.scrape_channels(channels = channels, archive_media = True) + +@pytest.mark.profile +def test_scrape_youtube_profile(channel_kwargs): + + scraper = YoutubeScraper() + channel = Channel(**channel_kwargs['youtube']) + scraper.get_profile(channel=channel) \ No newline at end of file