mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-07 19:08:35 +03:00
548 lines
19 KiB
Python
548 lines
19 KiB
Python
import json
|
|
import re
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from html.parser import HTMLParser
|
|
from typing import Generator, Optional
|
|
|
|
import dateparser
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from loguru import logger
|
|
|
|
from cisticola.base import Channel, RawChannelInfo, ScraperResult
|
|
from cisticola.scraper.base import Scraper
|
|
|
|
|
|
class BitchuteScraper(Scraper):
    """An implementation of a Scraper for Bitchute, using classes from the 4cat
    library"""

    __version__ = "BitchuteScraper 0.0.1"

    def get_username_from_url(self, url):
        """Return the channel name embedded in a Bitchute channel URL.

        Everything after ``bitchute.com/channel/`` is returned with leading
        and trailing slashes removed. If the marker is absent the whole URL
        (minus surrounding slashes) is returned unchanged.

        :param str url: Channel URL
        :return str: Channel name
        """
        return url.split("bitchute.com/channel/")[-1].strip("/")

    @logger.catch
    def get_posts(
        self, channel: Channel, since: Optional[ScraperResult] = None
    ) -> Generator[ScraperResult, None, None]:
        """Yield a ScraperResult for every item scraped from the channel.

        Items come from ``get_videos_user`` at detail level ``"comments"``, so
        both videos and their comments are yielded. Iteration stops at the
        first item whose timestamp is not newer than ``since.date``.

        :param channel: Channel to scrape; ``channel.url`` and ``channel.id``
            are used
        :param since: Most recent result already archived, or None to scrape
            everything the underlying generator returns
        :return: Generator of ScraperResult objects
        """
        # the context manager releases pooled connections once the generator
        # is exhausted, closed early, or fails
        with requests.Session() as session:
            session.headers.update(self.headers)

            # the search page is only fetched for its CSRF token, which the
            # channel listing endpoint requires
            search_page = session.get("https://www.bitchute.com/search")
            csrftoken = (
                BeautifulSoup(search_page.text, "html.parser")
                .find_all("input", {"name": "csrfmiddlewaretoken"})[0]
                .get("value")
            )
            time.sleep(0.25)

            username = self.get_username_from_url(channel.url)
            posts = get_videos_user(session, username, csrftoken, "comments")

            for post in posts:
                post_date = datetime.fromtimestamp(post["timestamp"])

                if since is not None and post_date <= since.date:
                    break

                archived_urls = {}

                # moderated videos and comments carry an empty (or no)
                # "video_url"; an empty string is not a URL worth archiving
                video_url = post.get("video_url")
                if video_url:
                    archived_urls[video_url] = None

                yield ScraperResult(
                    scraper=self.__version__,
                    platform="Bitchute",
                    channel=channel.id,
                    platform_id=post["id"],
                    date=post_date,
                    date_archived=datetime.now(timezone.utc),
                    raw_data=json.dumps(post),
                    archived_urls=archived_urls,
                    media_archived=None,
                )

    def can_handle(self, channel):
        """Return True if ``channel`` is a Bitchute channel, False otherwise.

        :param channel: Channel to check; ``channel.platform`` and
            ``channel.url`` are used
        :return bool: Whether this scraper handles the channel
        """
        return (
            channel.platform == "Bitchute"
            and self.get_username_from_url(channel.url) is not None
        )

    @logger.catch
    def get_profile(self, channel: Channel) -> RawChannelInfo:
        """Scrape the channel page and return its profile information.

        The channel page supplies the description, creation date, video count,
        owner and image; a POST to the channel's ``counts/`` endpoint supplies
        subscriber and view counts.

        :param channel: Channel to scrape; ``channel.url``,
            ``channel.platform`` and ``channel.id`` are used
        :return: RawChannelInfo whose ``raw_data`` is the JSON-encoded profile
        """
        base_url = channel.url

        with requests.session() as session:
            response = session.get(base_url)
            soup = BeautifulSoup(response.content, "html.parser")

            canonical_url = soup.find("link", {"id": "canonical"})["href"]
            csrftoken = session.cookies["csrftoken"]
            csrfmiddlewaretoken = soup.find(
                "input", {"name": "csrfmiddlewaretoken"}
            )["value"]

            about_soup = soup.find("div", {"id": "channel-about"})
            info_list = about_soup.find(
                "div", {"class": "channel-about-details"}
            ).find_all("p")
            description_soup = about_soup.find("div", {"id": "channel-description"})

            headers = {"Referer": base_url}
            data = {"csrftoken": csrftoken, "csrfmiddlewaretoken": csrfmiddlewaretoken}

            response = session.post(
                canonical_url + "counts/", data=data, headers=headers
            )
            counts = json.loads(response.text)

        owner_soup = soup.find("p", {"class": "owner"})
        if owner_soup.text == "[email\xa0protected]":
            # the owner name was replaced by an obfuscated e-mail placeholder;
            # recover it from the encoded attribute
            owner_name = decode_cfemail(
                owner_soup.find("span", {"class": "__cf_email__"})["data-cfemail"]
            )
        else:
            owner_name = owner_soup.text

        profile = {
            "description": description_soup.text.strip(),
            "description_links": [
                a["href"] for a in description_soup.find_all("a", href=True)
            ],
            "created": re.sub(
                r"\s", " ", info_list[0].text.split("Created")[1].strip(". ")
            ),
            "videos": int(info_list[1].text.split("videos")[0].strip()),
            "owner_url": owner_soup.find("a", href=True)["href"],
            "owner_name": owner_name,
            "image": about_soup.find("img", {"alt": "Channel Image"}).get("data-src"),
            "subscribers": counts["subscriber_count"],
            "views": int(counts["about_view_count"].split(" ")[0]),
        }

        return RawChannelInfo(
            scraper=self.__version__,
            platform=channel.platform,
            channel=channel.id,
            raw_data=json.dumps(profile, default=str),
            date_archived=datetime.now(timezone.utc),
        )
|
|
|
|
|
|
def strip_tags(html, convert_newlines=True):
    r"""
    Strip HTML from a string

    Character references (e.g. ``&amp;``) are converted to the characters
    they represent.

    :param html: HTML to strip
    :param convert_newlines: Convert <br> and </p> tags to \n before stripping
    :return: Stripped HTML; an empty string if `html` is empty or None
    """
    if not html:
        return ""

    if convert_newlines:
        html = html.replace("<br>", "\n").replace("</p>", "</p>\n")
        # collapse the runs of newlines the replacements above may create
        html = re.sub(r"\n+", "\n", html)

    class HTMLStripper(HTMLParser):
        """Parser that collects only the text nodes it encounters."""

        def __init__(self):
            super().__init__(convert_charrefs=True)
            self.fed = []

        def handle_data(self, data):
            self.fed.append(data)

        def get_data(self):
            return "".join(self.fed)

    stripper = HTMLStripper()
    stripper.feed(html)
    # close() flushes buffered input: without it, trailing text that contains
    # an unterminated "&" (e.g. "AT&T") is held back by the parser and lost
    stripper.close()
    return stripper.get_data()
|
|
|
|
|
|
def request_from_bitchute(session, method, url, headers=None, data=None):
    """
    Request something via the BitChute API (or non-API)

    To avoid having to write the same error-checking everywhere, this takes
    care of retrying on failure, et cetera

    Up to three attempts are made, with an increasing pause between them. An
    attempt fails on a connection error, on a response status of 300 or
    higher, or when the response body is not valid JSON.

    :param session:  Requests session
    :param str method:  GET or POST (case-insensitive)
    :param str url:  URL to fetch
    :param dict headers:  Headers to pass with the request
    :param dict data:  Data/params to send with the request; sent as form
    data for POST and as query parameters for GET

    :return:  Decoded JSON body of the response
    :raises NotImplementedError:  If `method` is neither GET nor POST
    :raises RuntimeError:  If all attempts failed
    """
    verb = method.lower()
    if verb not in ("get", "post"):
        # no point in retrying a request we cannot make at all
        raise NotImplementedError("Unsupported HTTP method %r" % method)

    max_attempts = 3
    last_error = None

    for attempt in range(1, max_attempts + 1):
        try:
            if verb == "post":
                request = session.post(url, headers=headers, data=data)
            else:
                request = session.get(url, headers=headers, params=data)

            if request.status_code >= 300:
                raise ValueError(
                    "Response %i from BitChute for URL %s, need to retry"
                    % (request.status_code, url)
                )

            return request.json()

        # JSON decoding errors are ValueError subclasses, so an undecodable
        # body is retried just like a connection problem
        except (ConnectionResetError, requests.RequestException, ValueError) as e:
            last_error = e
            if attempt < max_attempts:
                # back off a little longer after every failure
                time.sleep(attempt * 2)

    raise RuntimeError(
        "Request to %s failed %i times, giving up" % (url, max_attempts)
    ) from last_error
|
|
|
|
|
|
# Page titles BitChute shows instead of a video that has been taken down
_MODERATION_PAGE_TITLES = (
    '<h1 class="page-title">Video Restricted</h1>',
    '<h1 class="page-title">Video Blocked</h1>',
    '<h1 class="page-title">Channel Blocked</h1>',
    '<h1 class="page-title">Channel Restricted</h1>',
)

# (text found on the page, category to record), checked in this order
_MODERATION_REASONS = (
    (
        "This video is unavailable as the contents have been deemed potentially illegal",
        "moderated-illegal",
    ),
    (
        "Viewing of this video is restricted, as it has been marked as Not Safe For Life",
        "moderated-nsfl",
    ),
    ("Contains Incitement to Hatred", "moderated-incitement"),
    ("Platform Misuse", "moderated-misuse"),
    ("Terrorism & Violent Extremism", "moderated-terrorism-extremism"),
    ("Copyright</h4>", "moderated-copyright"),
)


def _moderation_category(page_text):
    """Return the 'moderated-*' category of a blocked/restricted page, else None."""
    if not any(title in page_text for title in _MODERATION_PAGE_TITLES):
        return None

    for marker, category in _MODERATION_REASONS:
        if marker in page_text:
            return category

    return "moderated-other"


def append_details(video, detail):
    """
    Append extra metadata to video data

    Fetches the BitChute video detail page to scrape extra data for the given video.

    :param dict video:  Video details as scraped so far; must contain "url"
    and "id"
    :param str detail:  Detail level. If 'comments', also scrape video comments.

    :return tuple:  Tuple, first item: updated video data (dict), second: list
    of comment dicts. Videos that are moderated, embedded from Rumble or not
    served with status 200 are returned with only a "category" describing the
    problem and an empty comment list. `(None, None)` is returned when a
    request to BitChute ultimately failed with a RuntimeError.
    """
    comments = []

    video = {
        **video,
        "likes": "",
        "dislikes": "",
        "channel_subscribers": "",
        "comments": "",
        "hashtags": "",
        "parent_id": "",
        "video_url": "",
    }

    # to get more details per video, we need to request the actual video detail page
    # start a new session, to not interfere with the CSRF token from the search session
    video_session = requests.session()

    try:
        video_page = video_session.get(video["url"])

        moderated_as = _moderation_category(video_page.text)
        if moderated_as is not None:
            video["category"] = moderated_as
            return (video, [])

        elif '<iframe class="rumble"' in video_page.text:
            # some videos are actually embeds from rumble?
            # these are iframes, so at the moment we cannot simply extract
            # their info from the page, so we skip them. In the future we
            # could add an extra request to get the relevant info, but so
            # far the only examples I've seen are actually 'video not found'
            video = {**video, "category": "error-embed-from-rumble"}
            return (video, [])

        elif video_page.status_code != 200:
            video = {**video, "category": "error-%i" % video_page.status_code}
            return (video, [])

        soup = BeautifulSoup(video_page.text, "html.parser")
        video_csfrtoken = soup.find_all("input", {"name": "csrfmiddlewaretoken"})[
            0
        ].get("value")

        video["video_url"] = soup.select_one("video#player source").get("src")
        video["thumbnail_image"] = soup.select_one("video#player").get("poster")
        video["subject"] = soup.select_one("h1#video-title").text
        video["author_id"] = soup.select_one("p.owner a").get("href").split("/")[2]
        video["author"] = (
            soup.select_one("div.channel-banner p.name a").get("href").split("/")[2]
        )
        video["body"] = (
            soup.select_one("div#video-description")
            .encode_contents()
            .decode("utf-8")
            .strip()
        )

        # we need *two more requests* to get the comment count and like/dislike counts
        # this seems to be because bitchute uses a third-party comment widget
        video_session.headers = {"Referer": video["url"], "Origin": video["url"]}
        counts = request_from_bitchute(
            video_session,
            "POST",
            "https://www.bitchute.com/video/%s/counts/" % video["id"],
            data={"csrfmiddlewaretoken": video_csfrtoken},
        )

        if detail == "comments":
            # if comments are also to be scraped, this is another request to make, which returns
            # a convenient JSON response with all the comments to the video
            # we need yet another token for this, which we can extract from a bit of inline
            # javascript on the page
            comment_script = None
            for line in video_page.text.split("\n"):
                if "initComments(" in line:
                    comment_script = line.split("initComments(")[1]
                    break

            if not comment_script:
                # no script to extract comments from, cannot load
                comment_count = -1
            else:
                # make the request
                comment_count = 0
                # the first two quoted arguments of initComments() are the
                # comment service URL and its auth token
                url = comment_script.split("'")[1]
                comment_csrf = comment_script.split("'")[3]
                comments_data = request_from_bitchute(
                    video_session,
                    "POST",
                    url + "/api/get_comments/",
                    data={"cf_auth": comment_csrf, "commentCount": 0},
                )

                for comment in comments_data:
                    comment_count += 1

                    if comment.get("profile_picture_url", None):
                        thumbnail_image = url + comment.get("profile_picture_url")
                    else:
                        thumbnail_image = ""

                    comments.append(
                        {
                            "id": comment["id"],
                            "thread_id": video["id"],
                            "subject": "",
                            "body": comment["content"],
                            "author": comment["fullname"],
                            "author_id": comment["creator"],
                            "timestamp": int(
                                dateparser.parse(comment["created"]).timestamp()
                            ),
                            "url": "",
                            "views": "",
                            "length": "",
                            "hashtags": "",
                            "thumbnail_image": thumbnail_image,
                            "likes": comment["upvote_count"],
                            "category": "comment",
                            "dislikes": "",
                            "channel_subscribers": "",
                            "comments": "",
                            # top-level comments have no parent and hang
                            # directly under the video
                            "parent_id": comment.get("parent", video["id"]),
                        }
                    )

        else:
            # if we don't need the full comments, we still need another request to get the *amount*
            # of comments,
            comment_count = request_from_bitchute(
                video_session,
                "POST",
                "https://commentfreely.bitchute.com/api/get_comment_count/",
                data={
                    "csrfmiddlewaretoken": video_csfrtoken,
                    "cf_thread": "bc_" + video["id"],
                },
            )["commentCount"]

    except RuntimeError:
        # we wrap this in one big try-catch because doing it for each request separately is tedious
        # hm... maybe this should be in a helper function
        return (None, None)

    finally:
        # the session is only needed for the requests above; release its
        # connections on every exit path, including the early returns
        video_session.close()

    # again, no structured info available for the publication date, but at least we can extract the
    # exact day it was uploaded
    try:
        published = dateparser.parse(
            soup.find(class_="video-publish-date")
            .text.split("published at")[1]
            .strip()[:-1]
        )
    except (AttributeError, IndexError):
        # publication date not on page, or not in the expected wording
        published = None

    # a page without a category link should not abort the whole scrape
    category_match = re.search(r'<td><a href="/category/([^/]+)/"', video_page.text)
    category = category_match.group(1) if category_match else ""

    # merge data
    video = {
        **video,
        "category": category,
        "likes": counts["like_count"],
        "dislikes": counts["dislike_count"],
        "channel_subscribers": counts["subscriber_count"],
        "comments": comment_count,
        "parent_id": "",
        "hashtags": ",".join([tag.text for tag in soup.select("#video-hashtags li a")]),
        "views": counts["view_count"],
    }

    if published:
        video["timestamp"] = int(published.timestamp())

    # may need to be increased? bitchute doesn't seem particularly strict
    time.sleep(0.25)
    return (video, comments)
|
|
|
|
|
|
def get_videos_user(session, user, csrftoken, detail):
    """
    Scrape videos for given BitChute user

    The channel page is fetched once for the author details; the video list
    is then paged through the channel's ``extend/`` endpoint, using the number
    of videos seen so far as the offset. At most 100 videos are scraped.

    :param session:  HTTP Session to use
    :param str user:  Username to scrape videos for
    :param str csrftoken:  CSRF token to use for requests
    :param str detail:  Detail level to scrape, basic/detail/comments

    :return:  Video data dictionaries, as a generator. Unless `detail` is
    'basic', each video is followed by the comment dictionaries scraped for
    it, and the generator ends early if scraping a video's details fails.
    """
    limit = 100
    scraped = 0
    offset = 0

    channel_url = "https://www.bitchute.com/channel/%s/" % user
    listing_url = channel_url + "extend/"

    channel_page = session.get(channel_url)
    container_soup = BeautifulSoup(channel_page.text, "html.parser")
    request_headers = {"Referer": channel_url, "Origin": "https://www.bitchute.com/"}

    while True:
        listing = request_from_bitchute(
            session,
            "POST",
            listing_url,
            headers=request_headers,
            data={
                "csrfmiddlewaretoken": csrftoken,
                "name": "",
                "offset": str(offset),
            },
        )

        page_soup = BeautifulSoup(listing["html"], "html.parser")
        video_elements = page_soup.select(".channel-videos-container")

        # stop when the listing is exhausted or enough videos were collected
        if not video_elements or scraped >= limit:
            break

        for element in video_elements:
            if scraped >= limit:
                break

            scraped += 1
            offset += 1

            link = element.select_one(".channel-videos-title a")
            video_id = link["href"].split("/")[-2]
            subject = link.text
            body = strip_tags(element.select_one(".channel-videos-text").text)

            # author details come from the channel page, not the listing
            author_link = container_soup.select_one(".details .name a")
            author = author_link.text
            author_id = author_link["href"].split("/")[2]

            published_text = element.select_one(
                ".channel-videos-details.text-right.hidden-xs"
            ).text
            timestamp = int(dateparser.parse(published_text).timestamp())

            video = {
                "id": video_id,
                "thread_id": video_id,
                "subject": subject,
                "body": body,
                "author": author,
                "author_id": author_id,
                "timestamp": timestamp,
                "url": "https://www.bitchute.com" + link["href"],
                "views": element.select_one(".video-views").text.strip(),
                "length": element.select_one(".video-duration").text.strip(),
                "thumbnail_image": element.select_one(".channel-videos-image img")[
                    "src"
                ],
            }

            comments = []
            if detail != "basic":
                video, comments = append_details(video, detail)
                if not video:
                    # unrecoverable error while scraping details
                    return

            yield video

            # these need to be yielded *after* the video because else the result file will have
            # the comments before the video, which is weird
            yield from comments
|
|
|
|
|
|
def decode_cfemail(cfemail):
    """Decode an obfuscated e-mail string taken from a ``data-cfemail`` attribute.

    The first two hex digits are the XOR key; every following pair of hex
    digits is one character of the result, XORed with that key. A trailing
    unpaired digit is ignored.

    See https://stackoverflow.com/questions/36911296/scraping-of-protected-email

    :param str cfemail: Hex-encoded, obfuscated string
    :return str: Decoded string
    """
    key = int(cfemail[:2], 16)
    pair_starts = range(2, len(cfemail) - 1, 2)

    return "".join(
        chr(int(cfemail[start : start + 2], 16) ^ key) for start in pair_starts
    )
|