mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
added methods for extracting channel profile metadata, and tests
This commit is contained in:
@@ -66,12 +66,13 @@ class BitchuteScraper(Scraper):
|
||||
|
||||
def get_profile(self, channel: Channel) -> dict:
|
||||
|
||||
base_url = "https://www.bitchute.com/channel/%s/" % channel.url
|
||||
base_url = channel.url
|
||||
|
||||
session = requests.session()
|
||||
response = session.get(base_url)
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
|
||||
canonical_url = soup.find('link', {'id' : 'canonical'})['href']
|
||||
csrftoken = session.cookies['csrftoken']
|
||||
csrfmiddlewaretoken = soup.find('input', {'name' : 'csrfmiddlewaretoken'})['value']
|
||||
|
||||
@@ -84,7 +85,7 @@ class BitchuteScraper(Scraper):
|
||||
'csrftoken': csrftoken,
|
||||
'csrfmiddlewaretoken': csrfmiddlewaretoken}
|
||||
|
||||
response = session.post(base_url + 'counts/', data = data, headers = headers)
|
||||
response = session.post(canonical_url + 'counts/', data = data, headers = headers)
|
||||
counts = json.loads(response.text)
|
||||
|
||||
profile = {
|
||||
@@ -93,9 +94,9 @@ class BitchuteScraper(Scraper):
|
||||
'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')),
|
||||
'videos' : int(info_list[1].text.split('videos')[0].strip()),
|
||||
'owner_url' : soup.find('p', {'class' : 'owner'}).find('a', href = True)['href'],
|
||||
'owner_name' : soup.find('p', {'class' : 'owner'}).text,
|
||||
'owner_name' : decode_cfemail(soup.find('p', {'class' : 'owner'}).find('span', {'class': "__cf_email__"})['data-cfemail']),
|
||||
'category' : info_list[-1].text.split('Category')[1].strip(),
|
||||
'image' : about_soup.find('img', {'alt' : 'Channel Image'})['data-src'],
|
||||
'image' : about_soup.find('img', {'alt' : 'Channel Image'}).get('data-src'),
|
||||
'subscribers': counts['subscriber_count'],
|
||||
'views': int(counts['about_view_count'].split(' ')[0])}
|
||||
|
||||
@@ -456,4 +457,20 @@ def get_videos_user(session, user, csrftoken, detail):
|
||||
# these need to be yielded *after* the video because else the result file will have the comments
|
||||
# before the video, which is weird
|
||||
yield comment
|
||||
|
||||
#-----------------------------------------------------------------------------#
|
||||
|
||||
def decode_cfemail(cfemail: str) -> str:
    """Decode a hex-encoded, XOR-obfuscated email string.

    The input is the value of a ``data-cfemail`` attribute (presumably
    produced by Cloudflare's email protection -- confirm against the page
    being scraped). The first two hex digits are the XOR key; every
    following pair of hex digits is one character XOR-ed with that key.

    Approach taken from
    https://stackoverflow.com/questions/36911296/scraping-of-protected-email

    Args:
        cfemail: Hex string; key byte first, then the encoded characters.

    Returns:
        The decoded text. A key-only input (two hex digits) decodes to an
        empty string.

    Raises:
        ValueError: If the key or any decoded pair is not valid hexadecimal
            (this includes an empty input string).
    """

    key = int(cfemail[:2], 16)

    # Stop at len - 1 so that a dangling single hex digit at the end of an
    # odd-length input is ignored rather than decoded as a character.
    return "".join(
        chr(int(cfemail[i:i + 2], 16) ^ key)
        for i in range(2, len(cfemail) - 1, 2)
    )
|
||||
|
||||
#---------------------------------------------------------------------------#
|
||||
@@ -125,14 +125,15 @@ def get_channel_profile(username):
|
||||
soup = BeautifulSoup(r.content, features = 'lxml')
|
||||
|
||||
verified_svg = soup.find('h1').find('svg', {'class' : 'listing-header--verified'})
|
||||
thumbnail_soup = soup.find('img', {'class' : 'listing-header--thumb'})
|
||||
cover_soup = soup.find('img', {'class' : 'listing-header--backsplash-img'})
|
||||
|
||||
profile = {
|
||||
'name': soup.find('h1').text,
|
||||
'verified': verified_svg is not None,
|
||||
'thumbnail': soup.find('img', {'class' : 'listing-header--thumb'})['src'],
|
||||
'cover': soup.find('img', {'class' : 'listing-header--backsplash-img'})['src'],
|
||||
'subscribers': soup.find('span', {'class' : 'subscribe-button-count'}).text}
|
||||
|
||||
'thumbnail': thumbnail_soup.get('src') if thumbnail_soup else None,
|
||||
'cover': cover_soup.get('src') if cover_soup else None,
|
||||
'subscribers': int(soup.find('span', {'class' : 'subscribe-button-count'}).text)}
|
||||
return profile
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
@@ -138,7 +138,7 @@ class TelegramTelethonScraper(Scraper):
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=json.dumps(post.to_dict(), default=str),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media))
|
||||
media_archived=archive_media)
|
||||
|
||||
def get_profile(self, channel: Channel) -> dict:
|
||||
|
||||
|
||||
@@ -11,6 +11,9 @@ addopts =
|
||||
--cov-report html:reports/coverage
|
||||
--html='reports/tests.html'
|
||||
--self-contained-html
|
||||
markers =
|
||||
profile: marks tests that only extract channel metadata (deselect with '-m "not profile"')
|
||||
filterwarnings =
|
||||
ignore:the imp module is deprecated:DeprecationWarning
|
||||
ignore:The localize method is no longer necessary, as this time zone supports the fold attribute
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
import pytest
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import GabScraper
|
||||
|
||||
@@ -14,3 +16,10 @@ def test_scrape_gab_channel(controller, channel_kwargs):
|
||||
channels = [Channel(**channel_kwargs['gab'])]
|
||||
controller.register_scraper(scraper = GabScraper())
|
||||
controller.scrape_channels(channels = channels, archive_media = True)
|
||||
|
||||
@pytest.mark.profile
def test_scrape_gab_profile(channel_kwargs):
    """Smoke-test Gab profile extraction.

    Builds a Channel from the 'gab' entry of ``channel_kwargs`` and calls
    ``GabScraper.get_profile`` on it. The returned value is not inspected;
    the test passes as long as no exception is raised.
    """
    GabScraper().get_profile(
        channel=Channel(**channel_kwargs['gab']),
    )
|
||||
@@ -1,3 +1,5 @@
|
||||
import pytest
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import GettrScraper
|
||||
|
||||
@@ -14,3 +16,10 @@ def test_scrape_gettr_channel(controller, channel_kwargs):
|
||||
channels = [Channel(**channel_kwargs['gettr'])]
|
||||
controller.register_scraper(scraper = GettrScraper())
|
||||
controller.scrape_channels(channels = channels, archive_media = True)
|
||||
|
||||
@pytest.mark.profile
def test_scrape_gettr_profile(channel_kwargs):
    """Smoke-test Gettr profile extraction.

    Builds a Channel from the 'gettr' entry of ``channel_kwargs`` and calls
    ``GettrScraper.get_profile`` on it. The returned value is not inspected;
    the test passes as long as no exception is raised.
    """
    GettrScraper().get_profile(
        channel=Channel(**channel_kwargs['gettr']),
    )
|
||||
@@ -1,3 +1,5 @@
|
||||
import pytest
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import InstagramScraper
|
||||
|
||||
@@ -14,3 +16,10 @@ def test_scrape_instagram_channel(controller, channel_kwargs):
|
||||
channels = [Channel(**channel_kwargs['instagram'])]
|
||||
controller.register_scraper(scraper = InstagramScraper())
|
||||
controller.scrape_channels(channels = channels, archive_media = True)
|
||||
|
||||
@pytest.mark.profile
def test_scrape_instagram_profile(channel_kwargs):
    """Smoke-test Instagram profile extraction.

    Builds a Channel from the 'instagram' entry of ``channel_kwargs`` and
    calls ``InstagramScraper.get_profile`` on it. The returned value is not
    inspected; the test passes as long as no exception is raised.
    """
    InstagramScraper().get_profile(
        channel=Channel(**channel_kwargs['instagram']),
    )
|
||||
@@ -1,3 +1,5 @@
|
||||
import pytest
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import OdyseeScraper
|
||||
|
||||
@@ -14,3 +16,10 @@ def test_scrape_odysee_channel(controller, channel_kwargs):
|
||||
channels = [Channel(**channel_kwargs['odysee'])]
|
||||
controller.register_scraper(scraper = OdyseeScraper())
|
||||
controller.scrape_channels(channels = channels, archive_media = True)
|
||||
|
||||
@pytest.mark.profile
def test_scrape_odysee_profile(channel_kwargs):
    """Smoke-test Odysee profile extraction.

    Builds a Channel from the 'odysee' entry of ``channel_kwargs`` and calls
    ``OdyseeScraper.get_profile`` on it. The returned value is not inspected;
    the test passes as long as no exception is raised.
    """
    OdyseeScraper().get_profile(
        channel=Channel(**channel_kwargs['odysee']),
    )
|
||||
@@ -1,3 +1,5 @@
|
||||
import pytest
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import RumbleScraper
|
||||
|
||||
@@ -14,3 +16,10 @@ def test_scrape_rumble_channel(controller, channel_kwargs):
|
||||
channels = [Channel(**channel_kwargs['rumble'])]
|
||||
controller.register_scraper(scraper = RumbleScraper())
|
||||
controller.scrape_channels(channels = channels, archive_media = True)
|
||||
|
||||
@pytest.mark.profile
def test_scrape_rumble_profile(channel_kwargs):
    """Smoke-test Rumble profile extraction.

    Builds a Channel from the 'rumble' entry of ``channel_kwargs`` and calls
    ``RumbleScraper.get_profile`` on it. The returned value is not inspected;
    the test passes as long as no exception is raised.
    """
    RumbleScraper().get_profile(
        channel=Channel(**channel_kwargs['rumble']),
    )
|
||||
@@ -1,3 +1,5 @@
|
||||
import pytest
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import TelegramSnscrapeScraper
|
||||
|
||||
@@ -14,3 +16,10 @@ def test_scrape_telegram_snscrape_channel(controller, channel_kwargs):
|
||||
channels = [Channel(**channel_kwargs['telegram'])]
|
||||
controller.register_scraper(scraper = TelegramSnscrapeScraper())
|
||||
controller.scrape_channels(channels = channels, archive_media = True)
|
||||
|
||||
@pytest.mark.profile
def test_scrape_telegram_snscrape_profile(channel_kwargs):
    """Smoke-test Telegram (snscrape) profile extraction.

    Builds a Channel from the 'telegram' entry of ``channel_kwargs`` and
    calls ``TelegramSnscrapeScraper.get_profile`` on it. The returned value
    is not inspected; the test passes as long as no exception is raised.
    """
    TelegramSnscrapeScraper().get_profile(
        channel=Channel(**channel_kwargs['telegram']),
    )
|
||||
@@ -1,3 +1,5 @@
|
||||
import pytest
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import TelegramTelethonScraper
|
||||
|
||||
@@ -14,3 +16,10 @@ def test_scrape_telegram_telethon_channel(controller, channel_kwargs):
|
||||
channels = [Channel(**channel_kwargs['telegram'])]
|
||||
controller.register_scraper(scraper = TelegramTelethonScraper())
|
||||
controller.scrape_channels(channels = channels, archive_media = True)
|
||||
|
||||
@pytest.mark.profile
def test_scrape_telegram_telethon_profile(channel_kwargs):
    """Smoke-test Telegram (Telethon) profile extraction.

    Builds a Channel from the 'telegram' entry of ``channel_kwargs`` and
    calls ``TelegramTelethonScraper.get_profile`` on it. The returned value
    is not inspected; the test passes as long as no exception is raised.
    """
    TelegramTelethonScraper().get_profile(
        channel=Channel(**channel_kwargs['telegram']),
    )
|
||||
@@ -1,3 +1,5 @@
|
||||
import pytest
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import TwitterScraper
|
||||
|
||||
@@ -14,3 +16,10 @@ def test_scrape_twitter_channel(controller, channel_kwargs):
|
||||
channels = [Channel(**channel_kwargs['twitter'])]
|
||||
controller.register_scraper(scraper = TwitterScraper())
|
||||
controller.scrape_channels(channels = channels, archive_media = True)
|
||||
|
||||
@pytest.mark.profile
def test_scrape_twitter_profile(channel_kwargs):
    """Smoke-test Twitter profile extraction.

    Builds a Channel from the 'twitter' entry of ``channel_kwargs`` and calls
    ``TwitterScraper.get_profile`` on it. The returned value is not
    inspected; the test passes as long as no exception is raised.
    """
    TwitterScraper().get_profile(
        channel=Channel(**channel_kwargs['twitter']),
    )
|
||||
@@ -1,3 +1,5 @@
|
||||
import pytest
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import VkontakteScraper
|
||||
|
||||
@@ -14,3 +16,10 @@ def test_scrape_vkontakte_channel(controller, channel_kwargs):
|
||||
channels = [Channel(**channel_kwargs['vkontakte'])]
|
||||
controller.register_scraper(scraper = VkontakteScraper())
|
||||
controller.scrape_channels(channels = channels, archive_media = True)
|
||||
|
||||
@pytest.mark.profile
def test_scrape_vkontakte_profile(channel_kwargs):
    """Smoke-test VKontakte profile extraction.

    Builds a Channel from the 'vkontakte' entry of ``channel_kwargs`` and
    calls ``VkontakteScraper.get_profile`` on it. The returned value is not
    inspected; the test passes as long as no exception is raised.
    """
    VkontakteScraper().get_profile(
        channel=Channel(**channel_kwargs['vkontakte']),
    )
|
||||
@@ -1,3 +1,5 @@
|
||||
import pytest
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import YoutubeScraper
|
||||
|
||||
@@ -14,3 +16,10 @@ def test_scrape_youtube_channel(controller, channel_kwargs):
|
||||
channels = [Channel(**channel_kwargs['youtube'])]
|
||||
controller.register_scraper(scraper = YoutubeScraper())
|
||||
controller.scrape_channels(channels = channels, archive_media = True)
|
||||
|
||||
@pytest.mark.profile
def test_scrape_youtube_profile(channel_kwargs):
    """Smoke-test YouTube profile extraction.

    Builds a Channel from the 'youtube' entry of ``channel_kwargs`` and calls
    ``YoutubeScraper.get_profile`` on it. The returned value is not
    inspected; the test passes as long as no exception is raised.
    """
    YoutubeScraper().get_profile(
        channel=Channel(**channel_kwargs['youtube']),
    )
|
||||
Reference in New Issue
Block a user