added methods for extracting channel profile metadata, and tests

This commit is contained in:
Tristan Lee
2022-03-28 21:11:34 -05:00
parent ea40ea2640
commit 67d1abf024
14 changed files with 120 additions and 9 deletions

View File

@@ -66,12 +66,13 @@ class BitchuteScraper(Scraper):
def get_profile(self, channel: Channel) -> dict:
base_url = "https://www.bitchute.com/channel/%s/" % channel.url
base_url = channel.url
session = requests.session()
response = session.get(base_url)
soup = BeautifulSoup(response.content, 'html.parser')
canonical_url = soup.find('link', {'id' : 'canonical'})['href']
csrftoken = session.cookies['csrftoken']
csrfmiddlewaretoken = soup.find('input', {'name' : 'csrfmiddlewaretoken'})['value']
@@ -84,7 +85,7 @@ class BitchuteScraper(Scraper):
'csrftoken': csrftoken,
'csrfmiddlewaretoken': csrfmiddlewaretoken}
response = session.post(base_url + 'counts/', data = data, headers = headers)
response = session.post(canonical_url + 'counts/', data = data, headers = headers)
counts = json.loads(response.text)
profile = {
@@ -93,9 +94,9 @@ class BitchuteScraper(Scraper):
'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')),
'videos' : int(info_list[1].text.split('videos')[0].strip()),
'owner_url' : soup.find('p', {'class' : 'owner'}).find('a', href = True)['href'],
'owner_name' : soup.find('p', {'class' : 'owner'}).text,
'owner_name' : decode_cfemail(soup.find('p', {'class' : 'owner'}).find('span', {'class': "__cf_email__"})['data-cfemail']),
'category' : info_list[-1].text.split('Category')[1].strip(),
'image' : about_soup.find('img', {'alt' : 'Channel Image'})['data-src'],
'image' : about_soup.find('img', {'alt' : 'Channel Image'}).get('data-src'),
'subscribers': counts['subscriber_count'],
'views': int(counts['about_view_count'].split(' ')[0])}
@@ -456,4 +457,20 @@ def get_videos_user(session, user, csrftoken, detail):
# these need to be yielded *after* the video because else the result file will have the comments
# before the video, which is weird
yield comment
#-----------------------------------------------------------------------------#
def decode_cfemail(cfemail):
    """Decode an XOR-obfuscated, hex-encoded email string.

    The input is the value of a ``data-cfemail`` attribute (presumably
    Cloudflare's email protection -- confirm against the scraped markup).
    Its first two hex digits are the XOR key; every following pair of hex
    digits is one character of the plain text XORed with that key.

    Reference:
    https://stackoverflow.com/questions/36911296/scraping-of-protected-email

    Args:
        cfemail: Hex string, key byte first, followed by the encoded bytes.

    Returns:
        The decoded string (empty when only the key byte is present).

    Raises:
        ValueError: If the key or any byte pair is not valid hexadecimal
            (this includes an empty input string).
    """
    key = int(cfemail[:2], 16)
    # The upper bound is len - 1 so that a dangling final hex digit (odd-length
    # input) is ignored instead of being decoded as a one-digit byte.
    return "".join(
        chr(int(cfemail[pos:pos + 2], 16) ^ key)
        for pos in range(2, len(cfemail) - 1, 2)
    )
#---------------------------------------------------------------------------#

View File

@@ -125,14 +125,15 @@ def get_channel_profile(username):
soup = BeautifulSoup(r.content, features = 'lxml')
verified_svg = soup.find('h1').find('svg', {'class' : 'listing-header--verified'})
thumbnail_soup = soup.find('img', {'class' : 'listing-header--thumb'})
cover_soup = soup.find('img', {'class' : 'listing-header--backsplash-img'})
profile = {
'name': soup.find('h1').text,
'verified': verified_svg is not None,
'thumbnail': soup.find('img', {'class' : 'listing-header--thumb'})['src'],
'cover': soup.find('img', {'class' : 'listing-header--backsplash-img'})['src'],
'subscribers': soup.find('span', {'class' : 'subscribe-button-count'}).text}
'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None,
'cover': cover_soup.get('src') if cover_soup else None,
'subscribers': int(soup.find('span', {'class' : 'subscribe-button-count'}).text)}
return profile
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -138,7 +138,7 @@ class TelegramTelethonScraper(Scraper):
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post.to_dict(), default=str),
archived_urls=archived_urls,
media_archived=archive_media))
media_archived=archive_media)
def get_profile(self, channel: Channel) -> dict:

View File

@@ -11,6 +11,9 @@ addopts =
--cov-report html:reports/coverage
--html='reports/tests.html'
--self-contained-html
markers =
profile: marks tests that only extract channel metadata (deselect with '-m "not profile"')
filterwarnings =
ignore:the imp module is deprecated:DeprecationWarning
ignore:The localize method is no longer necessary, as this time zone supports the fold attribute

View File

@@ -1,3 +1,5 @@
import pytest
from cisticola.base import Channel
from cisticola.scraper import GabScraper
@@ -14,3 +16,10 @@ def test_scrape_gab_channel(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['gab'])]
controller.register_scraper(scraper = GabScraper())
controller.scrape_channels(channels = channels, archive_media = True)
@pytest.mark.profile
def test_scrape_gab_profile(channel_kwargs):
    """Smoke test: `GabScraper.get_profile` runs on the `channel_kwargs['gab']` channel.

    No assertion is made on the returned profile; the test passes when the
    call completes without raising.
    """
    gab_scraper = GabScraper()
    gab_scraper.get_profile(channel=Channel(**channel_kwargs['gab']))

View File

@@ -1,3 +1,5 @@
import pytest
from cisticola.base import Channel
from cisticola.scraper import GettrScraper
@@ -14,3 +16,10 @@ def test_scrape_gettr_channel(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['gettr'])]
controller.register_scraper(scraper = GettrScraper())
controller.scrape_channels(channels = channels, archive_media = True)
@pytest.mark.profile
def test_scrape_gettr_profile(channel_kwargs):
    """Smoke test: `GettrScraper.get_profile` runs on the `channel_kwargs['gettr']` channel.

    No assertion is made on the returned profile; the test passes when the
    call completes without raising.
    """
    gettr_scraper = GettrScraper()
    gettr_scraper.get_profile(channel=Channel(**channel_kwargs['gettr']))

View File

@@ -1,3 +1,5 @@
import pytest
from cisticola.base import Channel
from cisticola.scraper import InstagramScraper
@@ -14,3 +16,10 @@ def test_scrape_instagram_channel(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['instagram'])]
controller.register_scraper(scraper = InstagramScraper())
controller.scrape_channels(channels = channels, archive_media = True)
@pytest.mark.profile
def test_scrape_instagram_profile(channel_kwargs):
    """Smoke test: `InstagramScraper.get_profile` runs on the `channel_kwargs['instagram']` channel.

    No assertion is made on the returned profile; the test passes when the
    call completes without raising.
    """
    instagram_scraper = InstagramScraper()
    instagram_scraper.get_profile(channel=Channel(**channel_kwargs['instagram']))

View File

@@ -1,3 +1,5 @@
import pytest
from cisticola.base import Channel
from cisticola.scraper import OdyseeScraper
@@ -14,3 +16,10 @@ def test_scrape_odysee_channel(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['odysee'])]
controller.register_scraper(scraper = OdyseeScraper())
controller.scrape_channels(channels = channels, archive_media = True)
@pytest.mark.profile
def test_scrape_odysee_profile(channel_kwargs):
    """Smoke test: `OdyseeScraper.get_profile` runs on the `channel_kwargs['odysee']` channel.

    No assertion is made on the returned profile; the test passes when the
    call completes without raising.
    """
    odysee_scraper = OdyseeScraper()
    odysee_scraper.get_profile(channel=Channel(**channel_kwargs['odysee']))

View File

@@ -1,3 +1,5 @@
import pytest
from cisticola.base import Channel
from cisticola.scraper import RumbleScraper
@@ -14,3 +16,10 @@ def test_scrape_rumble_channel(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['rumble'])]
controller.register_scraper(scraper = RumbleScraper())
controller.scrape_channels(channels = channels, archive_media = True)
@pytest.mark.profile
def test_scrape_rumble_profile(channel_kwargs):
    """Smoke test: `RumbleScraper.get_profile` runs on the `channel_kwargs['rumble']` channel.

    No assertion is made on the returned profile; the test passes when the
    call completes without raising.
    """
    rumble_scraper = RumbleScraper()
    rumble_scraper.get_profile(channel=Channel(**channel_kwargs['rumble']))

View File

@@ -1,3 +1,5 @@
import pytest
from cisticola.base import Channel
from cisticola.scraper import TelegramSnscrapeScraper
@@ -14,3 +16,10 @@ def test_scrape_telegram_snscrape_channel(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['telegram'])]
controller.register_scraper(scraper = TelegramSnscrapeScraper())
controller.scrape_channels(channels = channels, archive_media = True)
@pytest.mark.profile
def test_scrape_telegram_snscrape_profile(channel_kwargs):
    """Smoke test: `TelegramSnscrapeScraper.get_profile` runs on the `channel_kwargs['telegram']` channel.

    No assertion is made on the returned profile; the test passes when the
    call completes without raising.
    """
    snscrape_scraper = TelegramSnscrapeScraper()
    snscrape_scraper.get_profile(channel=Channel(**channel_kwargs['telegram']))

View File

@@ -1,3 +1,5 @@
import pytest
from cisticola.base import Channel
from cisticola.scraper import TelegramTelethonScraper
@@ -14,3 +16,10 @@ def test_scrape_telegram_telethon_channel(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['telegram'])]
controller.register_scraper(scraper = TelegramTelethonScraper())
controller.scrape_channels(channels = channels, archive_media = True)
@pytest.mark.profile
def test_scrape_telegram_telethon_profile(channel_kwargs):
    """Smoke test: `TelegramTelethonScraper.get_profile` runs on the `channel_kwargs['telegram']` channel.

    No assertion is made on the returned profile; the test passes when the
    call completes without raising.
    """
    telethon_scraper = TelegramTelethonScraper()
    telethon_scraper.get_profile(channel=Channel(**channel_kwargs['telegram']))

View File

@@ -1,3 +1,5 @@
import pytest
from cisticola.base import Channel
from cisticola.scraper import TwitterScraper
@@ -14,3 +16,10 @@ def test_scrape_twitter_channel(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['twitter'])]
controller.register_scraper(scraper = TwitterScraper())
controller.scrape_channels(channels = channels, archive_media = True)
@pytest.mark.profile
def test_scrape_twitter_profile(channel_kwargs):
    """Smoke test: `TwitterScraper.get_profile` runs on the `channel_kwargs['twitter']` channel.

    No assertion is made on the returned profile; the test passes when the
    call completes without raising.
    """
    twitter_scraper = TwitterScraper()
    twitter_scraper.get_profile(channel=Channel(**channel_kwargs['twitter']))

View File

@@ -1,3 +1,5 @@
import pytest
from cisticola.base import Channel
from cisticola.scraper import VkontakteScraper
@@ -14,3 +16,10 @@ def test_scrape_vkontakte_channel(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['vkontakte'])]
controller.register_scraper(scraper = VkontakteScraper())
controller.scrape_channels(channels = channels, archive_media = True)
@pytest.mark.profile
def test_scrape_vkontakte_profile(channel_kwargs):
    """Smoke test: `VkontakteScraper.get_profile` runs on the `channel_kwargs['vkontakte']` channel.

    No assertion is made on the returned profile; the test passes when the
    call completes without raising.
    """
    vkontakte_scraper = VkontakteScraper()
    vkontakte_scraper.get_profile(channel=Channel(**channel_kwargs['vkontakte']))

View File

@@ -1,3 +1,5 @@
import pytest
from cisticola.base import Channel
from cisticola.scraper import YoutubeScraper
@@ -14,3 +16,10 @@ def test_scrape_youtube_channel(controller, channel_kwargs):
channels = [Channel(**channel_kwargs['youtube'])]
controller.register_scraper(scraper = YoutubeScraper())
controller.scrape_channels(channels = channels, archive_media = True)
@pytest.mark.profile
def test_scrape_youtube_profile(channel_kwargs):
    """Smoke test: `YoutubeScraper.get_profile` runs on the `channel_kwargs['youtube']` channel.

    No assertion is made on the returned profile; the test passes when the
    call completes without raising.
    """
    youtube_scraper = YoutubeScraper()
    youtube_scraper.get_profile(channel=Channel(**channel_kwargs['youtube']))