added methods for extracting channel profile metadata, and tests

2026-06-08 03:18:34 +03:00 · 2022-03-28 21:11:34 -05:00
parent ea40ea2640
commit 67d1abf024
14 changed files with 120 additions and 9 deletions
--- a/cisticola/scraper/bitchute.py
+++ b/cisticola/scraper/bitchute.py
@@ -66,12 +66,13 @@ class BitchuteScraper(Scraper):

    def get_profile(self, channel: Channel) -> dict:

-        base_url = "https://www.bitchute.com/channel/%s/" % channel.url
+        base_url = channel.url
        
        session = requests.session()
        response = session.get(base_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        
+        canonical_url = soup.find('link', {'id' : 'canonical'})['href']
        csrftoken = session.cookies['csrftoken']
        csrfmiddlewaretoken = soup.find('input', {'name' : 'csrfmiddlewaretoken'})['value']

@@ -84,7 +85,7 @@ class BitchuteScraper(Scraper):
            'csrftoken': csrftoken,
            'csrfmiddlewaretoken': csrfmiddlewaretoken}

-        response = session.post(base_url + 'counts/', data = data, headers = headers)
+        response = session.post(canonical_url + 'counts/', data = data, headers = headers)
        counts = json.loads(response.text)

        profile = {
@@ -93,9 +94,9 @@ class BitchuteScraper(Scraper):
            'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')),
            'videos' : int(info_list[1].text.split('videos')[0].strip()),
            'owner_url' : soup.find('p', {'class' : 'owner'}).find('a', href = True)['href'],
-            'owner_name' : soup.find('p', {'class' : 'owner'}).text,
+            'owner_name' : decode_cfemail(soup.find('p', {'class' : 'owner'}).find('span', {'class': "__cf_email__"})['data-cfemail']),
            'category' : info_list[-1].text.split('Category')[1].strip(),
-            'image' : about_soup.find('img', {'alt' : 'Channel Image'})['data-src'],
+            'image' : about_soup.find('img', {'alt' : 'Channel Image'}).get('data-src'),
            'subscribers': counts['subscriber_count'],
            'views': int(counts['about_view_count'].split(' ')[0])}
        
@@ -456,4 +457,20 @@ def get_videos_user(session, user, csrftoken, detail):
                # these need to be yielded *after* the video because else the result file will have the comments
                # before the video, which is weird
                yield comment
+
 #-----------------------------------------------------------------------------#
+
+def decode_cfemail(cfemail):
+    
+    """Decode a hex `data-cfemail` string: byte 0 is the XOR key for the rest.
+    See https://stackoverflow.com/questions/36911296/scraping-of-protected-email"""
+    
+    email = ""
+    k = int(cfemail[:2], 16)
+
+    for i in range(2, len(cfemail)-1, 2):
+        email += chr(int(cfemail[i:i+2], 16)^k)
+
+    return email
+
+#-----------------------------------------------------------------------------#
--- a/cisticola/scraper/rumble.py
+++ b/cisticola/scraper/rumble.py
@@ -125,14 +125,15 @@ def get_channel_profile(username):
    soup = BeautifulSoup(r.content, features = 'lxml')

    verified_svg = soup.find('h1').find('svg', {'class' : 'listing-header--verified'})
+    thumbnail_soup = soup.find('img', {'class' : 'listing-header--thumb'})
+    cover_soup = soup.find('img', {'class' : 'listing-header--backsplash-img'})

    profile = {
        'name': soup.find('h1').text,
        'verified': verified_svg is not None,
-        'thumbnail': soup.find('img', {'class' : 'listing-header--thumb'})['src'],
-        'cover':  soup.find('img', {'class' : 'listing-header--backsplash-img'})['src'],
-        'subscribers': soup.find('span', {'class' : 'subscribe-button-count'}).text}
-
+        'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None,
+        'cover':  cover_soup.get('src') if cover_soup else None,
+        'subscribers': int(soup.find('span', {'class' : 'subscribe-button-count'}).text)}
    return profile

 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
--- a/cisticola/scraper/telegram_telethon.py
+++ b/cisticola/scraper/telegram_telethon.py
@@ -138,7 +138,7 @@ class TelegramTelethonScraper(Scraper):
                    date_archived=datetime.now(timezone.utc),
                    raw_data=json.dumps(post.to_dict(), default=str),
                    archived_urls=archived_urls,
-                    media_archived=archive_media))
+                    media_archived=archive_media)

    def get_profile(self, channel: Channel) -> dict:

--- a/pytest.ini
+++ b/pytest.ini
@@ -11,6 +11,9 @@ addopts =
  --cov-report html:reports/coverage
  --html='reports/tests.html'
  --self-contained-html
+markers =
+    # deselect these tests with: -m "not profile"
+    profile: marks tests for only extracting channel metadata
 filterwarnings =
    ignore:the imp module is deprecated:DeprecationWarning
    ignore:The localize method is no longer necessary, as this time zone supports the fold attribute
--- a/tests/scraper/gab.py
+++ b/tests/scraper/gab.py
@@ -1,3 +1,5 @@
+import pytest
+
 from cisticola.base import Channel
 from cisticola.scraper import GabScraper

@@ -14,3 +16,10 @@ def test_scrape_gab_channel(controller, channel_kwargs):
    channels = [Channel(**channel_kwargs['gab'])]
    controller.register_scraper(scraper = GabScraper())
    controller.scrape_channels(channels = channels, archive_media = True)
+
+@pytest.mark.profile
+def test_scrape_gab_profile(channel_kwargs):
+
+    scraper = GabScraper()
+    channel = Channel(**channel_kwargs['gab'])
+    scraper.get_profile(channel=channel)
--- a/tests/scraper/gettr.py
+++ b/tests/scraper/gettr.py
@@ -1,3 +1,5 @@
+import pytest
+
 from cisticola.base import Channel
 from cisticola.scraper import GettrScraper

@@ -14,3 +16,10 @@ def test_scrape_gettr_channel(controller, channel_kwargs):
    channels = [Channel(**channel_kwargs['gettr'])]
    controller.register_scraper(scraper = GettrScraper())
    controller.scrape_channels(channels = channels, archive_media = True)
+
+@pytest.mark.profile
+def test_scrape_gettr_profile(channel_kwargs):
+
+    scraper = GettrScraper()
+    channel = Channel(**channel_kwargs['gettr'])
+    scraper.get_profile(channel=channel)
--- a/tests/scraper/instagram.py
+++ b/tests/scraper/instagram.py
@@ -1,3 +1,5 @@
+import pytest
+
 from cisticola.base import Channel
 from cisticola.scraper import InstagramScraper

@@ -14,3 +16,10 @@ def test_scrape_instagram_channel(controller, channel_kwargs):
    channels = [Channel(**channel_kwargs['instagram'])]
    controller.register_scraper(scraper = InstagramScraper())
    controller.scrape_channels(channels = channels, archive_media = True)
+
+@pytest.mark.profile
+def test_scrape_instagram_profile(channel_kwargs):
+
+    scraper = InstagramScraper()
+    channel = Channel(**channel_kwargs['instagram'])
+    scraper.get_profile(channel=channel)
--- a/tests/scraper/odysee.py
+++ b/tests/scraper/odysee.py
@@ -1,3 +1,5 @@
+import pytest
+
 from cisticola.base import Channel
 from cisticola.scraper import OdyseeScraper

@@ -14,3 +16,10 @@ def test_scrape_odysee_channel(controller, channel_kwargs):
    channels = [Channel(**channel_kwargs['odysee'])]
    controller.register_scraper(scraper = OdyseeScraper())
    controller.scrape_channels(channels = channels, archive_media = True)
+
+@pytest.mark.profile
+def test_scrape_odysee_profile(channel_kwargs):
+
+    scraper = OdyseeScraper()
+    channel = Channel(**channel_kwargs['odysee'])
+    scraper.get_profile(channel=channel)
--- a/tests/scraper/rumble.py
+++ b/tests/scraper/rumble.py
@@ -1,3 +1,5 @@
+import pytest
+
 from cisticola.base import Channel
 from cisticola.scraper import RumbleScraper

@@ -14,3 +16,10 @@ def test_scrape_rumble_channel(controller, channel_kwargs):
    channels = [Channel(**channel_kwargs['rumble'])]
    controller.register_scraper(scraper = RumbleScraper())
    controller.scrape_channels(channels = channels, archive_media = True)
+
+@pytest.mark.profile
+def test_scrape_rumble_profile(channel_kwargs):
+
+    scraper = RumbleScraper()
+    channel = Channel(**channel_kwargs['rumble'])
+    scraper.get_profile(channel=channel)
--- a/tests/scraper/telegram_snscrape.py
+++ b/tests/scraper/telegram_snscrape.py
@@ -1,3 +1,5 @@
+import pytest
+
 from cisticola.base import Channel
 from cisticola.scraper import TelegramSnscrapeScraper

@@ -14,3 +16,10 @@ def test_scrape_telegram_snscrape_channel(controller, channel_kwargs):
    channels = [Channel(**channel_kwargs['telegram'])]
    controller.register_scraper(scraper = TelegramSnscrapeScraper())
    controller.scrape_channels(channels = channels, archive_media = True)
+
+@pytest.mark.profile
+def test_scrape_telegram_snscrape_profile(channel_kwargs):
+
+    scraper = TelegramSnscrapeScraper()
+    channel = Channel(**channel_kwargs['telegram'])
+    scraper.get_profile(channel=channel)
--- a/tests/scraper/telegram_telethon.py
+++ b/tests/scraper/telegram_telethon.py
@@ -1,3 +1,5 @@
+import pytest
+
 from cisticola.base import Channel
 from cisticola.scraper import TelegramTelethonScraper

@@ -14,3 +16,10 @@ def test_scrape_telegram_telethon_channel(controller, channel_kwargs):
    channels = [Channel(**channel_kwargs['telegram'])]
    controller.register_scraper(scraper = TelegramTelethonScraper())
    controller.scrape_channels(channels = channels, archive_media = True)
+
+@pytest.mark.profile
+def test_scrape_telegram_telethon_profile(channel_kwargs):
+
+    scraper = TelegramTelethonScraper()
+    channel = Channel(**channel_kwargs['telegram'])
+    scraper.get_profile(channel=channel)
--- a/tests/scraper/twitter.py
+++ b/tests/scraper/twitter.py
@@ -1,3 +1,5 @@
+import pytest
+
 from cisticola.base import Channel
 from cisticola.scraper import TwitterScraper

@@ -14,3 +16,10 @@ def test_scrape_twitter_channel(controller, channel_kwargs):
    channels = [Channel(**channel_kwargs['twitter'])]
    controller.register_scraper(scraper = TwitterScraper())
    controller.scrape_channels(channels = channels, archive_media = True)
+
+@pytest.mark.profile
+def test_scrape_twitter_profile(channel_kwargs):
+
+    scraper = TwitterScraper()
+    channel = Channel(**channel_kwargs['twitter'])
+    scraper.get_profile(channel=channel)
--- a/tests/scraper/vkontakte.py
+++ b/tests/scraper/vkontakte.py
@@ -1,3 +1,5 @@
+import pytest
+
 from cisticola.base import Channel
 from cisticola.scraper import VkontakteScraper

@@ -14,3 +16,10 @@ def test_scrape_vkontakte_channel(controller, channel_kwargs):
    channels = [Channel(**channel_kwargs['vkontakte'])]
    controller.register_scraper(scraper = VkontakteScraper())
    controller.scrape_channels(channels = channels, archive_media = True)
+
+@pytest.mark.profile
+def test_scrape_vkontakte_profile(channel_kwargs):
+
+    scraper = VkontakteScraper()
+    channel = Channel(**channel_kwargs['vkontakte'])
+    scraper.get_profile(channel=channel)
--- a/tests/scraper/youtube.py
+++ b/tests/scraper/youtube.py
@@ -1,3 +1,5 @@
+import pytest
+
 from cisticola.base import Channel
 from cisticola.scraper import YoutubeScraper

@@ -14,3 +16,10 @@ def test_scrape_youtube_channel(controller, channel_kwargs):
    channels = [Channel(**channel_kwargs['youtube'])]
    controller.register_scraper(scraper = YoutubeScraper())
    controller.scrape_channels(channels = channels, archive_media = True)
+
+@pytest.mark.profile
+def test_scrape_youtube_profile(channel_kwargs):
+
+    scraper = YoutubeScraper()
+    channel = Channel(**channel_kwargs['youtube'])
+    scraper.get_profile(channel=channel)