From 67d1abf024d3d498f9994f9a1a93f820014b8b53 Mon Sep 17 00:00:00 2001
From: Tristan Lee <tristan@bellingcat.com>
Date: Mon, 28 Mar 2022 21:11:34 -0500
Subject: [PATCH] added methods for extracting channel profile metadata, and
 tests

---
 cisticola/scraper/bitchute.py          | 25 +++++++++++++++++++++----
 cisticola/scraper/rumble.py            |  9 +++++----
 cisticola/scraper/telegram_telethon.py |  2 +-
 pytest.ini                             |  3 +++
 tests/scraper/gab.py                   |  9 +++++++++
 tests/scraper/gettr.py                 |  9 +++++++++
 tests/scraper/instagram.py             |  9 +++++++++
 tests/scraper/odysee.py                |  9 +++++++++
 tests/scraper/rumble.py                |  9 +++++++++
 tests/scraper/telegram_snscrape.py     |  9 +++++++++
 tests/scraper/telegram_telethon.py     |  9 +++++++++
 tests/scraper/twitter.py               |  9 +++++++++
 tests/scraper/vkontakte.py             |  9 +++++++++
 tests/scraper/youtube.py               |  9 +++++++++
 14 files changed, 120 insertions(+), 9 deletions(-)

diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py
index a3103a6..d8d3f0b 100644
--- a/cisticola/scraper/bitchute.py
+++ b/cisticola/scraper/bitchute.py
@@ -66,12 +66,13 @@ class BitchuteScraper(Scraper):
 
     def get_profile(self, channel: Channel) -> dict:
 
-        base_url = "https://www.bitchute.com/channel/%s/" % channel.url
+        base_url = channel.url
         
         session = requests.session()
         response = session.get(base_url)
         soup = BeautifulSoup(response.content, 'html.parser')
         
+        canonical_url = soup.find('link', {'id' : 'canonical'})['href']
         csrftoken = session.cookies['csrftoken']
         csrfmiddlewaretoken = soup.find('input', {'name' : 'csrfmiddlewaretoken'})['value']
 
@@ -84,7 +85,7 @@ class BitchuteScraper(Scraper):
             'csrftoken': csrftoken,
             'csrfmiddlewaretoken': csrfmiddlewaretoken}
 
-        response = session.post(base_url + 'counts/', data = data, headers = headers)
+        response = session.post(canonical_url + 'counts/', data = data, headers = headers)
         counts = json.loads(response.text)
 
         profile = {
@@ -93,9 +94,9 @@ class BitchuteScraper(Scraper):
             'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')),
             'videos' : int(info_list[1].text.split('videos')[0].strip()),
             'owner_url' : soup.find('p', {'class' : 'owner'}).find('a', href = True)['href'],
-            'owner_name' : soup.find('p', {'class' : 'owner'}).text,
+            'owner_name' : decode_cfemail(soup.find('p', {'class' : 'owner'}).find('span', {'class': "__cf_email__"})['data-cfemail']),
             'category' : info_list[-1].text.split('Category')[1].strip(),
-            'image' : about_soup.find('img', {'alt' : 'Channel Image'})['data-src'],
+            'image' : about_soup.find('img', {'alt' : 'Channel Image'}).get('data-src'),
             'subscribers': counts['subscriber_count'],
             'views': int(counts['about_view_count'].split(' ')[0])}
         
@@ -456,4 +457,20 @@ def get_videos_user(session, user, csrftoken, detail):
                 # these need to be yielded *after* the video because else the result file will have the comments
                 # before the video, which is weird
                 yield comment
+
 #-----------------------------------------------------------------------------#
+
+def decode_cfemail(cfemail):
+    """Decode a Cloudflare-obfuscated ``data-cfemail`` hex string to text.
+
+    Byte 0 is the XOR key for the rest. Output is UTF-8, else Latin-1 (the old
+    per-byte result). https://stackoverflow.com/questions/36911296/scraping-of-protected-email
+    """
+    key = int(cfemail[:2], 16)
+    raw = bytes(int(cfemail[i:i+2], 16) ^ key for i in range(2, len(cfemail) - 1, 2))
+    try:
+        return raw.decode('utf-8')
+    except UnicodeDecodeError:
+        return raw.decode('latin-1')
+
+#---------------------------------------------------------------------------#
\ No newline at end of file
diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py
index 252239e..32e40e8 100644
--- a/cisticola/scraper/rumble.py
+++ b/cisticola/scraper/rumble.py
@@ -125,14 +125,15 @@ def get_channel_profile(username):
     soup = BeautifulSoup(r.content, features = 'lxml')
 
     verified_svg = soup.find('h1').find('svg', {'class' : 'listing-header--verified'})
+    thumbnail_soup = soup.find('img', {'class' : 'listing-header--thumb'})
+    cover_soup = soup.find('img', {'class' : 'listing-header--backsplash-img'})
 
     profile = {
         'name': soup.find('h1').text,
         'verified': verified_svg is not None,
-        'thumbnail': soup.find('img', {'class' : 'listing-header--thumb'})['src'],
-        'cover':  soup.find('img', {'class' : 'listing-header--backsplash-img'})['src'],
-        'subscribers': soup.find('span', {'class' : 'subscribe-button-count'}).text}
-
+        'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None,
+        'cover':  cover_soup.get('src') if cover_soup else None,
+        'subscribers': int(soup.find('span', {'class' : 'subscribe-button-count'}).text)}
     return profile
 
 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
\ No newline at end of file
diff --git a/cisticola/scraper/telegram_telethon.py b/cisticola/scraper/telegram_telethon.py
index e02ccde..b300551 100644
--- a/cisticola/scraper/telegram_telethon.py
+++ b/cisticola/scraper/telegram_telethon.py
@@ -138,7 +138,7 @@ class TelegramTelethonScraper(Scraper):
                     date_archived=datetime.now(timezone.utc),
                     raw_data=json.dumps(post.to_dict(), default=str),
                     archived_urls=archived_urls,
-                    media_archived=archive_media))
+                    media_archived=archive_media)
 
     def get_profile(self, channel: Channel) -> dict:
 
diff --git a/pytest.ini b/pytest.ini
index f3545f6..8d9973f 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -11,6 +11,9 @@ addopts =
   --cov-report html:reports/coverage
   --html='reports/tests.html'
   --self-contained-html
+markers = 
+    profile: marks tests for only extracting channel metadata
+    # deselect with: -m "not profile"
 filterwarnings =
     ignore:the imp module is deprecated:DeprecationWarning
     ignore:The localize method is no longer necessary, as this time zone supports the fold attribute
diff --git a/tests/scraper/gab.py b/tests/scraper/gab.py
index c864c37..943f40f 100644
--- a/tests/scraper/gab.py
+++ b/tests/scraper/gab.py
@@ -1,3 +1,5 @@
+import pytest
+
 from cisticola.base import Channel
 from cisticola.scraper import GabScraper
 
@@ -14,3 +16,10 @@ def test_scrape_gab_channel(controller, channel_kwargs):
     channels = [Channel(**channel_kwargs['gab'])]
     controller.register_scraper(scraper = GabScraper())
     controller.scrape_channels(channels = channels, archive_media = True)
+
+@pytest.mark.profile
+def test_scrape_gab_profile(channel_kwargs):
+    """Smoke test: call get_profile on the Gab test channel; passes if nothing raises."""
+    gab_scraper = GabScraper()
+    gab_channel = Channel(**channel_kwargs['gab'])
+    gab_scraper.get_profile(channel=gab_channel)
\ No newline at end of file
diff --git a/tests/scraper/gettr.py b/tests/scraper/gettr.py
index 7dd2f24..6a3b70e 100644
--- a/tests/scraper/gettr.py
+++ b/tests/scraper/gettr.py
@@ -1,3 +1,5 @@
+import pytest
+
 from cisticola.base import Channel
 from cisticola.scraper import GettrScraper
 
@@ -14,3 +16,10 @@ def test_scrape_gettr_channel(controller, channel_kwargs):
     channels = [Channel(**channel_kwargs['gettr'])]
     controller.register_scraper(scraper = GettrScraper())
     controller.scrape_channels(channels = channels, archive_media = True)
+
+@pytest.mark.profile
+def test_scrape_gettr_profile(channel_kwargs):
+    """Smoke test: call get_profile on the Gettr test channel; passes if nothing raises."""
+    gettr_scraper = GettrScraper()
+    gettr_channel = Channel(**channel_kwargs['gettr'])
+    gettr_scraper.get_profile(channel=gettr_channel)
\ No newline at end of file
diff --git a/tests/scraper/instagram.py b/tests/scraper/instagram.py
index 0beb546..840d6fa 100644
--- a/tests/scraper/instagram.py
+++ b/tests/scraper/instagram.py
@@ -1,3 +1,5 @@
+import pytest
+
 from cisticola.base import Channel
 from cisticola.scraper import InstagramScraper
 
@@ -14,3 +16,10 @@ def test_scrape_instagram_channel(controller, channel_kwargs):
     channels = [Channel(**channel_kwargs['instagram'])]
     controller.register_scraper(scraper = InstagramScraper())
     controller.scrape_channels(channels = channels, archive_media = True)
+
+@pytest.mark.profile
+def test_scrape_instagram_profile(channel_kwargs):
+    """Smoke test: call get_profile on the Instagram test channel; passes if nothing raises."""
+    instagram_scraper = InstagramScraper()
+    instagram_channel = Channel(**channel_kwargs['instagram'])
+    instagram_scraper.get_profile(channel=instagram_channel)
\ No newline at end of file
diff --git a/tests/scraper/odysee.py b/tests/scraper/odysee.py
index f97700e..8eba07d 100644
--- a/tests/scraper/odysee.py
+++ b/tests/scraper/odysee.py
@@ -1,3 +1,5 @@
+import pytest
+
 from cisticola.base import Channel
 from cisticola.scraper import OdyseeScraper
 
@@ -14,3 +16,10 @@ def test_scrape_odysee_channel(controller, channel_kwargs):
     channels = [Channel(**channel_kwargs['odysee'])]
     controller.register_scraper(scraper = OdyseeScraper())
     controller.scrape_channels(channels = channels, archive_media = True)
+
+@pytest.mark.profile
+def test_scrape_odysee_profile(channel_kwargs):
+    """Smoke test: call get_profile on the Odysee test channel; passes if nothing raises."""
+    odysee_scraper = OdyseeScraper()
+    odysee_channel = Channel(**channel_kwargs['odysee'])
+    odysee_scraper.get_profile(channel=odysee_channel)
\ No newline at end of file
diff --git a/tests/scraper/rumble.py b/tests/scraper/rumble.py
index 5f640e5..f64b24f 100644
--- a/tests/scraper/rumble.py
+++ b/tests/scraper/rumble.py
@@ -1,3 +1,5 @@
+import pytest
+
 from cisticola.base import Channel
 from cisticola.scraper import RumbleScraper
 
@@ -14,3 +16,10 @@ def test_scrape_rumble_channel(controller, channel_kwargs):
     channels = [Channel(**channel_kwargs['rumble'])]
     controller.register_scraper(scraper = RumbleScraper())
     controller.scrape_channels(channels = channels, archive_media = True)
+
+@pytest.mark.profile
+def test_scrape_rumble_profile(channel_kwargs):
+    """Smoke test: call get_profile on the Rumble test channel; passes if nothing raises."""
+    rumble_scraper = RumbleScraper()
+    rumble_channel = Channel(**channel_kwargs['rumble'])
+    rumble_scraper.get_profile(channel=rumble_channel)
\ No newline at end of file
diff --git a/tests/scraper/telegram_snscrape.py b/tests/scraper/telegram_snscrape.py
index 3848780..420b917 100644
--- a/tests/scraper/telegram_snscrape.py
+++ b/tests/scraper/telegram_snscrape.py
@@ -1,3 +1,5 @@
+import pytest
+
 from cisticola.base import Channel
 from cisticola.scraper import TelegramSnscrapeScraper
 
@@ -14,3 +16,10 @@ def test_scrape_telegram_snscrape_channel(controller, channel_kwargs):
     channels = [Channel(**channel_kwargs['telegram'])]
     controller.register_scraper(scraper = TelegramSnscrapeScraper())
     controller.scrape_channels(channels = channels, archive_media = True)
+
+@pytest.mark.profile
+def test_scrape_telegram_snscrape_profile(channel_kwargs):
+    """Smoke test: call get_profile (snscrape) on the Telegram test channel; passes if nothing raises."""
+    snscrape_scraper = TelegramSnscrapeScraper()
+    telegram_channel = Channel(**channel_kwargs['telegram'])
+    snscrape_scraper.get_profile(channel=telegram_channel)
\ No newline at end of file
diff --git a/tests/scraper/telegram_telethon.py b/tests/scraper/telegram_telethon.py
index c015631..1942fca 100644
--- a/tests/scraper/telegram_telethon.py
+++ b/tests/scraper/telegram_telethon.py
@@ -1,3 +1,5 @@
+import pytest
+
 from cisticola.base import Channel
 from cisticola.scraper import TelegramTelethonScraper
 
@@ -14,3 +16,10 @@ def test_scrape_telegram_telethon_channel(controller, channel_kwargs):
     channels = [Channel(**channel_kwargs['telegram'])]
     controller.register_scraper(scraper = TelegramTelethonScraper())
     controller.scrape_channels(channels = channels, archive_media = True)
+
+@pytest.mark.profile
+def test_scrape_telegram_telethon_profile(channel_kwargs):
+    """Smoke test: call get_profile (Telethon) on the Telegram test channel; passes if nothing raises."""
+    telethon_scraper = TelegramTelethonScraper()
+    telegram_channel = Channel(**channel_kwargs['telegram'])
+    telethon_scraper.get_profile(channel=telegram_channel)
\ No newline at end of file
diff --git a/tests/scraper/twitter.py b/tests/scraper/twitter.py
index bd79a6a..7512b6a 100644
--- a/tests/scraper/twitter.py
+++ b/tests/scraper/twitter.py
@@ -1,3 +1,5 @@
+import pytest
+
 from cisticola.base import Channel
 from cisticola.scraper import TwitterScraper
 
@@ -14,3 +16,10 @@ def test_scrape_twitter_channel(controller, channel_kwargs):
     channels = [Channel(**channel_kwargs['twitter'])]
     controller.register_scraper(scraper = TwitterScraper())
     controller.scrape_channels(channels = channels, archive_media = True)
+
+@pytest.mark.profile
+def test_scrape_twitter_profile(channel_kwargs):
+    """Smoke test: call get_profile on the Twitter test channel; passes if nothing raises."""
+    twitter_scraper = TwitterScraper()
+    twitter_channel = Channel(**channel_kwargs['twitter'])
+    twitter_scraper.get_profile(channel=twitter_channel)
\ No newline at end of file
diff --git a/tests/scraper/vkontakte.py b/tests/scraper/vkontakte.py
index ef7cfa1..8b0b757 100644
--- a/tests/scraper/vkontakte.py
+++ b/tests/scraper/vkontakte.py
@@ -1,3 +1,5 @@
+import pytest
+
 from cisticola.base import Channel
 from cisticola.scraper import VkontakteScraper
 
@@ -14,3 +16,10 @@ def test_scrape_vkontakte_channel(controller, channel_kwargs):
     channels = [Channel(**channel_kwargs['vkontakte'])]
     controller.register_scraper(scraper = VkontakteScraper())
     controller.scrape_channels(channels = channels, archive_media = True)
+
+@pytest.mark.profile
+def test_scrape_vkontakte_profile(channel_kwargs):
+    """Smoke test: call get_profile on the VKontakte test channel; passes if nothing raises."""
+    vkontakte_scraper = VkontakteScraper()
+    vkontakte_channel = Channel(**channel_kwargs['vkontakte'])
+    vkontakte_scraper.get_profile(channel=vkontakte_channel)
\ No newline at end of file
diff --git a/tests/scraper/youtube.py b/tests/scraper/youtube.py
index 9d14760..e987cb8 100644
--- a/tests/scraper/youtube.py
+++ b/tests/scraper/youtube.py
@@ -1,3 +1,5 @@
+import pytest
+
 from cisticola.base import Channel
 from cisticola.scraper import YoutubeScraper
 
@@ -14,3 +16,10 @@ def test_scrape_youtube_channel(controller, channel_kwargs):
     channels = [Channel(**channel_kwargs['youtube'])]
     controller.register_scraper(scraper = YoutubeScraper())
     controller.scrape_channels(channels = channels, archive_media = True)
+
+@pytest.mark.profile
+def test_scrape_youtube_profile(channel_kwargs):
+    """Smoke test: call get_profile on the YouTube test channel; passes if nothing raises."""
+    youtube_scraper = YoutubeScraper()
+    youtube_channel = Channel(**channel_kwargs['youtube'])
+    youtube_scraper.get_profile(channel=youtube_channel)
\ No newline at end of file