implemented Bitchute scraper

This commit is contained in:
Tristan Lee
2022-02-18 12:45:10 -06:00
parent 4668d4df11
commit 139459e3b2
6 changed files with 614 additions and 19 deletions

View File

@@ -8,6 +8,9 @@ sqlalchemy = "*"
snscrape = "*"
loguru = "*"
gogettr = "*"
requests = "*"
bs4 = "*"
dateparser = "*"
[dev-packages]

148
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "e335358892de4b581de211099e214f370f8cfd1f86b2cd2b3f0ea6d2d43313bb"
"sha256": "ca7eea4b95394e06f8b74eac90d376097fd01231010b594cdcc588a3440f1231"
},
"pipfile-spec": 6,
"requires": {
@@ -24,6 +24,13 @@
"markers": "python_version >= '3.1'",
"version": "==4.10.0"
},
"bs4": {
"hashes": [
"sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"
],
"index": "pypi",
"version": "==0.0.1"
},
"certifi": {
"hashes": [
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
@@ -47,6 +54,14 @@
"markers": "python_version >= '3.6'",
"version": "==8.0.3"
},
"dateparser": {
"hashes": [
"sha256:faa2b97f51f3b5ff1ba2f17be90de2b733fb6191f89b4058787473e8202f3044",
"sha256:fec344db1f73d005182e214c0ff27313c748bbe0c1638ce9d48a809ddfdab2a0"
],
"index": "pypi",
"version": "==1.1.0"
},
"filelock": {
"hashes": [
"sha256:9cd540a9352e432c7246a48fe4e8712b10acb1df2ad1f30e8c070b82ae1fed85",
@@ -215,15 +230,124 @@
],
"version": "==1.7.1"
},
"python-dateutil": {
"hashes": [
"sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86",
"sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.8.2"
},
"pytz": {
"hashes": [
"sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c",
"sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326"
],
"version": "==2021.3"
},
"pytz-deprecation-shim": {
"hashes": [
"sha256:8314c9692a636c8eb3bda879b9f119e350e93223ae83e70e80c31675a0fdc1a6",
"sha256:af097bae1b616dde5c5744441e2ddc69e74dfdcb0c263129610d85b87445a59d"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
"version": "==0.1.0.post0"
},
"regex": {
"hashes": [
"sha256:04611cc0f627fc4a50bc4a9a2e6178a974c6a6a4aa9c1cca921635d2c47b9c87",
"sha256:0b5d6f9aed3153487252d00a18e53f19b7f52a1651bc1d0c4b5844bc286dfa52",
"sha256:0d2f5c3f7057530afd7b739ed42eb04f1011203bc5e4663e1e1d01bb50f813e3",
"sha256:11772be1eb1748e0e197a40ffb82fb8fd0d6914cd147d841d9703e2bef24d288",
"sha256:1333b3ce73269f986b1fa4d5d395643810074dc2de5b9d262eb258daf37dc98f",
"sha256:16f81025bb3556eccb0681d7946e2b35ff254f9f888cff7d2120e8826330315c",
"sha256:1a171eaac36a08964d023eeff740b18a415f79aeb212169080c170ec42dd5184",
"sha256:1d6301f5288e9bdca65fab3de6b7de17362c5016d6bf8ee4ba4cbe833b2eda0f",
"sha256:1e031899cb2bc92c0cf4d45389eff5b078d1936860a1be3aa8c94fa25fb46ed8",
"sha256:1f8c0ae0a0de4e19fddaaff036f508db175f6f03db318c80bbc239a1def62d02",
"sha256:2245441445099411b528379dee83e56eadf449db924648e5feb9b747473f42e3",
"sha256:22709d701e7037e64dae2a04855021b62efd64a66c3ceed99dfd684bfef09e38",
"sha256:24c89346734a4e4d60ecf9b27cac4c1fee3431a413f7aa00be7c4d7bbacc2c4d",
"sha256:25716aa70a0d153cd844fe861d4f3315a6ccafce22b39d8aadbf7fcadff2b633",
"sha256:2dacb3dae6b8cc579637a7b72f008bff50a94cde5e36e432352f4ca57b9e54c4",
"sha256:34316bf693b1d2d29c087ee7e4bb10cdfa39da5f9c50fa15b07489b4ab93a1b5",
"sha256:36b2d700a27e168fa96272b42d28c7ac3ff72030c67b32f37c05616ebd22a202",
"sha256:37978254d9d00cda01acc1997513f786b6b971e57b778fbe7c20e30ae81a97f3",
"sha256:38289f1690a7e27aacd049e420769b996826f3728756859420eeee21cc857118",
"sha256:385ccf6d011b97768a640e9d4de25412204fbe8d6b9ae39ff115d4ff03f6fe5d",
"sha256:3c7ea86b9ca83e30fa4d4cd0eaf01db3ebcc7b2726a25990966627e39577d729",
"sha256:49810f907dfe6de8da5da7d2b238d343e6add62f01a15d03e2195afc180059ed",
"sha256:519c0b3a6fbb68afaa0febf0d28f6c4b0a1074aefc484802ecb9709faf181607",
"sha256:51f02ca184518702975b56affde6c573ebad4e411599005ce4468b1014b4786c",
"sha256:552a39987ac6655dad4bf6f17dd2b55c7b0c6e949d933b8846d2e312ee80005a",
"sha256:596f5ae2eeddb79b595583c2e0285312b2783b0ec759930c272dbf02f851ff75",
"sha256:6014038f52b4b2ac1fa41a58d439a8a00f015b5c0735a0cd4b09afe344c94899",
"sha256:61ebbcd208d78658b09e19c78920f1ad38936a0aa0f9c459c46c197d11c580a0",
"sha256:6213713ac743b190ecbf3f316d6e41d099e774812d470422b3a0f137ea635832",
"sha256:637e27ea1ebe4a561db75a880ac659ff439dec7f55588212e71700bb1ddd5af9",
"sha256:6aa427c55a0abec450bca10b64446331b5ca8f79b648531138f357569705bc4a",
"sha256:6ca45359d7a21644793de0e29de497ef7f1ae7268e346c4faf87b421fea364e6",
"sha256:6db1b52c6f2c04fafc8da17ea506608e6be7086715dab498570c3e55e4f8fbd1",
"sha256:752e7ddfb743344d447367baa85bccd3629c2c3940f70506eb5f01abce98ee68",
"sha256:760c54ad1b8a9b81951030a7e8e7c3ec0964c1cb9fee585a03ff53d9e531bb8e",
"sha256:768632fd8172ae03852e3245f11c8a425d95f65ff444ce46b3e673ae5b057b74",
"sha256:7a0b9f6a1a15d494b35f25ed07abda03209fa76c33564c09c9e81d34f4b919d7",
"sha256:7e070d3aef50ac3856f2ef5ec7214798453da878bb5e5a16c16a61edf1817cc3",
"sha256:7e12949e5071c20ec49ef00c75121ed2b076972132fc1913ddf5f76cae8d10b4",
"sha256:7e26eac9e52e8ce86f915fd33380f1b6896a2b51994e40bb094841e5003429b4",
"sha256:85ffd6b1cb0dfb037ede50ff3bef80d9bf7fa60515d192403af6745524524f3b",
"sha256:8618d9213a863c468a865e9d2ec50221015f7abf52221bc927152ef26c484b4c",
"sha256:8acef4d8a4353f6678fd1035422a937c2170de58a2b29f7da045d5249e934101",
"sha256:8d2f355a951f60f0843f2368b39970e4667517e54e86b1508e76f92b44811a8a",
"sha256:90b6840b6448203228a9d8464a7a0d99aa8fa9f027ef95fe230579abaf8a6ee1",
"sha256:9187500d83fd0cef4669385cbb0961e227a41c0c9bc39219044e35810793edf7",
"sha256:93c20777a72cae8620203ac11c4010365706062aa13aaedd1a21bb07adbb9d5d",
"sha256:93cce7d422a0093cfb3606beae38a8e47a25232eea0f292c878af580a9dc7605",
"sha256:94c623c331a48a5ccc7d25271399aff29729fa202c737ae3b4b28b89d2b0976d",
"sha256:97f32dc03a8054a4c4a5ab5d761ed4861e828b2c200febd4e46857069a483916",
"sha256:9a2bf98ac92f58777c0fafc772bf0493e67fcf677302e0c0a630ee517a43b949",
"sha256:a602bdc8607c99eb5b391592d58c92618dcd1537fdd87df1813f03fed49957a6",
"sha256:a9d24b03daf7415f78abc2d25a208f234e2c585e5e6f92f0204d2ab7b9ab48e3",
"sha256:abfcb0ef78df0ee9df4ea81f03beea41849340ce33a4c4bd4dbb99e23ec781b6",
"sha256:b013f759cd69cb0a62de954d6d2096d648bc210034b79b1881406b07ed0a83f9",
"sha256:b02e3e72665cd02afafb933453b0c9f6c59ff6e3708bd28d0d8580450e7e88af",
"sha256:b52cc45e71657bc4743a5606d9023459de929b2a198d545868e11898ba1c3f59",
"sha256:ba37f11e1d020969e8a779c06b4af866ffb6b854d7229db63c5fdddfceaa917f",
"sha256:bb804c7d0bfbd7e3f33924ff49757de9106c44e27979e2492819c16972ec0da2",
"sha256:bf594cc7cc9d528338d66674c10a5b25e3cde7dd75c3e96784df8f371d77a298",
"sha256:c38baee6bdb7fe1b110b6b3aaa555e6e872d322206b7245aa39572d3fc991ee4",
"sha256:c73d2166e4b210b73d1429c4f1ca97cea9cc090e5302df2a7a0a96ce55373f1c",
"sha256:c9099bf89078675c372339011ccfc9ec310310bf6c292b413c013eb90ffdcafc",
"sha256:cf0db26a1f76aa6b3aa314a74b8facd586b7a5457d05b64f8082a62c9c49582a",
"sha256:d19a34f8a3429bd536996ad53597b805c10352a8561d8382e05830df389d2b43",
"sha256:da80047524eac2acf7c04c18ac7a7da05a9136241f642dd2ed94269ef0d0a45a",
"sha256:de2923886b5d3214be951bc2ce3f6b8ac0d6dfd4a0d0e2a4d2e5523d8046fdfb",
"sha256:defa0652696ff0ba48c8aff5a1fac1eef1ca6ac9c660b047fc8e7623c4eb5093",
"sha256:e54a1eb9fd38f2779e973d2f8958fd575b532fe26013405d1afb9ee2374e7ab8",
"sha256:e5c31d70a478b0ca22a9d2d76d520ae996214019d39ed7dd93af872c7f301e52",
"sha256:ebaeb93f90c0903233b11ce913a7cb8f6ee069158406e056f884854c737d2442",
"sha256:ecfe51abf7f045e0b9cdde71ca9e153d11238679ef7b5da6c82093874adf3338",
"sha256:f99112aed4fb7cee00c7f77e8b964a9b10f69488cdff626ffd797d02e2e4484f",
"sha256:fd914db437ec25bfa410f8aa0aa2f3ba87cdfc04d9919d608d02330947afaeab"
],
"version": "==2022.1.18"
},
"requests": {
"extras": [],
"hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
"index": "pypi",
"version": "==2.27.1"
},
"six": {
"hashes": [
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
"sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.16.0"
},
"snscrape": {
"hashes": [
"sha256:af30d12872da692ff9ccaf5651962edceb1fd4a28cf7cc92c8c898902f009ce3",
@@ -282,12 +406,28 @@
"index": "pypi",
"version": "==1.4.31"
},
"tzdata": {
"hashes": [
"sha256:3eee491e22ebfe1e5cfcc97a4137cd70f092ce59144d81f8924a844de05ba8f5",
"sha256:68dbe41afd01b867894bbdfd54fa03f468cfa4f0086bfb4adcd8de8f24f3ee21"
],
"markers": "python_version >= '3.6'",
"version": "==2021.5"
},
"tzlocal": {
"hashes": [
"sha256:0f28015ac68a5c067210400a9197fc5d36ba9bc3f8eaf1da3cbd59acdfed9e09",
"sha256:28ba8d9fcb6c9a782d6e0078b4f6627af1ea26aeaa32b4eab5324abc7df4149f"
],
"markers": "python_version >= '3.6'",
"version": "==4.1"
},
"urllib3": {
"hashes": [
"sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed",
"sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'",
"version": "==1.26.8"
}
},

View File

@@ -0,0 +1,445 @@
from datetime import datetime
import time
import re
from html.parser import HTMLParser
import dateparser
import json
from typing import List
import requests
from bs4 import BeautifulSoup
import cisticola.base
class BitchuteScraper(cisticola.scraper.Scraper):
    """An implementation of a Scraper for Bitchute, using classes from the 4cat
    library"""
    __version__ = "BitchuteScraper 0.0.1"

    @staticmethod
    def get_username_from_url(url):
        """Extract the channel name from a Bitchute channel URL.

        :param str url:  Channel URL containing ``bitchute.com/channel/``
        :return:  Everything after the last ``bitchute.com/channel/`` with
        leading/trailing slashes removed, or None when the URL is empty,
        does not contain that marker, or has nothing after it
        """
        marker = 'bitchute.com/channel/'
        if not url or marker not in url:
            return None

        username = url.split(marker)[-1].strip('/')
        return username or None

    def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]:
        """Scrape the videos of a Bitchute channel.

        :param channel:  Channel whose ``url`` identifies the Bitchute channel
        :param since:  If given, scraping stops at the first video whose
        timestamp is not later than ``since.date_archived``
        :return:  One ScraperResult per scraped video, in the order yielded
        by ``get_videos_user``
        :raises ValueError:  If no channel name can be extracted from
        ``channel.url``
        """
        username = BitchuteScraper.get_username_from_url(channel.url)
        if username is None:
            raise ValueError("Not a Bitchute channel URL: %s" % channel.url)

        # Don't scrape comment information
        # TODO implement framework for processing and storing comments
        detail = 'basic'
        posts = []

        # get_videos_user is a generator that keeps using the session, so the
        # whole iteration has to happen before the session is closed
        with requests.Session() as session:
            session.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"

            # the search page carries the CSRF token needed for the POST requests
            request = session.get("https://www.bitchute.com/search")
            csrftoken = BeautifulSoup(request.text, 'html.parser').find_all(
                "input", {"name": "csrfmiddlewaretoken"})[0].get("value")
            time.sleep(0.25)

            for post in get_videos_user(session, username, csrftoken, detail):
                if since is not None and post['timestamp'] <= since.date_archived.timestamp():
                    break

                posts.append(cisticola.base.ScraperResult(
                    scraper=self.__version__,
                    platform="Bitchute",
                    channel=channel.id,
                    platform_id=post['id'],
                    date=datetime.fromtimestamp(post['timestamp']),
                    date_archived=datetime.now(),
                    raw_data=json.dumps(post)))

        return posts

    def can_handle(self, channel):
        """Return True if ``channel`` is a Bitchute channel with a usable
        channel URL, False otherwise."""
        return (channel.platform == "Bitchute"
                and BitchuteScraper.get_username_from_url(channel.url) is not None)
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def strip_tags(html, convert_newlines=True):
    """
    Strip HTML from a string

    :param html:  HTML to strip; a falsy value (None, "") yields ""
    :param convert_newlines:  Convert <br> tags (also <br/>, <br /> and
    upper-case variants) and </p> tags to newlines before stripping, then
    collapse runs of newlines into a single one
    :return:  Stripped HTML
    """
    if not html:
        return ""

    if convert_newlines:
        html = re.sub(r"<br\s*/?>", "\n", html, flags=re.IGNORECASE)
        html = re.sub(r"(</p\s*>)", r"\1\n", html, flags=re.IGNORECASE)
        html = re.sub(r"\n+", "\n", html)

    class HTMLStripper(HTMLParser):
        """Parser that keeps only the text nodes it is fed."""

        def __init__(self):
            super().__init__(convert_charrefs=True)
            self.fed = []

        def handle_data(self, data):
            self.fed.append(data)

        def get_data(self):
            return "".join(self.fed)

    stripper = HTMLStripper()
    stripper.feed(html)
    # close() flushes text the parser is still holding back; without it a
    # trailing chunk containing an unterminated "&" (e.g. "AT&T") is lost
    stripper.close()
    return stripper.get_data()
#-----------------------------------------------------------------------------#
def request_from_bitchute(session, method, url, headers=None, data=None, max_retries=3):
    """
    Request something via the BitChute API (or non-API)

    To avoid having to write the same error-checking everywhere, this takes
    care of retrying on failure, et cetera

    :param session:  Requests session
    :param str method:  GET or POST (case-insensitive)
    :param str url:  URL to fetch
    :param dict headers:  Headers to pass with the request
    :param dict data:  Form data (POST) or query parameters (GET) to send
    :param int max_retries:  Number of attempts before giving up
    :return:  The decoded JSON body of the response
    :raises NotImplementedError:  If `method` is neither GET nor POST
    :raises RuntimeError:  If the response body is not valid JSON, or if no
    response with a status code below 300 was received in `max_retries`
    attempts
    """
    verb = method.lower()
    if verb == "post":
        def send():
            return session.post(url, headers=headers, data=data)
    elif verb == "get":
        def send():
            return session.get(url, headers=headers, params=data)
    else:
        raise NotImplementedError("Unsupported HTTP method for BitChute request: %s" % method)

    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            reply = send()
        except (ConnectionResetError, requests.RequestException) as e:
            last_error = e
        else:
            if reply.status_code < 300:
                try:
                    return reply.json()
                except ValueError as e:
                    # a body that is not JSON will not get better on retry
                    raise RuntimeError("Could not decode BitChute response for URL %s as JSON" % url) from e

            last_error = ValueError(
                "Response %i from BitChute for URL %s, need to retry" % (reply.status_code, url))

        # back off a little longer after every failure, but do not wait
        # when there is no further attempt to wait for
        if attempt < max_retries:
            time.sleep(attempt * 2)

    raise RuntimeError(
        "No usable response from BitChute for URL %s after %i attempts" % (url, max_retries)) from last_error
#-----------------------------------------------------------------------------#
# Headings BitChute shows in place of the player when a video or its channel
# has been moderated
_BLOCKED_PAGE_TITLES = (
    "<h1 class=\"page-title\">Video Restricted</h1>",
    "<h1 class=\"page-title\">Video Blocked</h1>",
    "<h1 class=\"page-title\">Channel Blocked</h1>",
    "<h1 class=\"page-title\">Channel Restricted</h1>",
)

# (text found on the page, category to assign); checked in order, first match wins
_MODERATION_REASONS = (
    ("This video is unavailable as the contents have been deemed potentially illegal", "moderated-illegal"),
    ("Viewing of this video is restricted, as it has been marked as Not Safe For Life", "moderated-nsfl"),
    ("Contains Incitement to Hatred", "moderated-incitement"),
    ("Platform Misuse", "moderated-misuse"),
    ("Terrorism &amp; Violent Extremism", "moderated-terrorism-extremism"),
    ("Copyright</h4>", "moderated-copyright"),
)


def _moderation_category(page_text):
    """Return the 'moderated-*' category for a blocked/restricted page, or None if the page is not one."""
    if not any(title in page_text for title in _BLOCKED_PAGE_TITLES):
        return None

    for marker, category in _MODERATION_REASONS:
        if marker in page_text:
            return category

    return "moderated-other"


def _scrape_comments(video_session, page_text, video_id):
    """Fetch the comments of a video; returns (comments, count), with count -1 if the page has no initComments() call."""
    # the comment endpoint and its token are only available from a bit of
    # inline javascript on the page
    comment_script = None
    for line in page_text.split("\n"):
        if "initComments(" in line:
            comment_script = line.split("initComments(")[1]
            break

    if not comment_script:
        # no script to extract comments from, cannot load
        return [], -1

    script_parts = comment_script.split("'")
    url = script_parts[1]
    comment_csrf = script_parts[3]
    comments_data = request_from_bitchute(video_session, "POST", url + "/api/get_comments/",
                                          data={"cf_auth": comment_csrf, "commentCount": 0})

    comments = []
    for comment in comments_data:
        if comment.get("profile_picture_url", None):
            thumbnail_image = url + comment.get("profile_picture_url")
        else:
            thumbnail_image = ""

        comments.append({
            "id": comment["id"],
            "thread_id": video_id,
            "subject": "",
            "body": comment["content"],
            "author": comment["fullname"],
            "author_id": comment["creator"],
            "timestamp": int(dateparser.parse(comment["created"]).timestamp()),
            "url": "",
            "views": "",
            "length": "",
            "hashtags": "",
            "thumbnail_image": thumbnail_image,
            "likes": comment["upvote_count"],
            "category": "comment",
            "dislikes": "",
            "channel_subscribers": "",
            "comments": "",
            "parent_id": comment.get("parent", "") if "parent" in comment else video_id,
        })

    return comments, len(comments)


def append_details(video, detail):
    """
    Append extra metadata to video data

    Fetches the BitChute video detail page to scrape extra data for the given video.

    :param dict video:  Video details as scraped so far; must contain "url"
    and "id". The dictionary passed in is not modified.
    :param str detail:  Detail level. If 'comments', also scrape video comments.
    :return tuple:  (updated video data, list of comments). For moderated
    pages, Rumble embeds and non-200 responses the video only gets a
    "category" describing the problem and the comment list is empty. If a
    BitChute request fails with a RuntimeError the result is (None, None).
    """
    video = {
        **video,
        "likes": "",
        "dislikes": "",
        "channel_subscribers": "",
        "comments": "",
        "hashtags": "",
        "parent_id": "",
        "video_url": ""
    }
    comments = []

    try:
        # to get more details per video, we need to request the actual video detail page
        # start a new session, to not interfere with the CSRF token from the search session
        with requests.Session() as video_session:
            video_page = video_session.get(video["url"])
            page_text = video_page.text

            moderated = _moderation_category(page_text)
            if moderated:
                video["category"] = moderated
                return (video, [])

            if "<iframe class=\"rumble\"" in page_text:
                # some videos are actually embeds from rumble?
                # these are iframes, so at the moment we cannot simply extract
                # their info from the page, so we skip them. In the future we
                # could add an extra request to get the relevant info, but so
                # far the only examples I've seen are actually 'video not found'
                video["category"] = "error-embed-from-rumble"
                return (video, [])

            if video_page.status_code != 200:
                video["category"] = "error-%i" % video_page.status_code
                return (video, [])

            soup = BeautifulSoup(page_text, 'html.parser')
            video_csfrtoken = soup.find_all("input", {"name": "csrfmiddlewaretoken"})[0].get("value")

            channel_link = soup.select_one("div.channel-banner p.name a")
            video["video_url"] = soup.select_one("video#player source").get("src")
            video["thumbnail_image"] = soup.select_one("video#player").get("poster")
            video["subject"] = soup.select_one("h1#video-title").text
            video["author"] = channel_link.text
            video["author_id"] = channel_link.get("href").split("/")[2]
            video["body"] = soup.select_one("div#video-description").encode_contents().decode("utf-8").strip()

            # we need *two more requests* to get the comment count and like/dislike counts
            # this seems to be because bitchute uses a third-party comment widget
            video_session.headers = {'Referer': video["url"], 'Origin': video["url"]}
            counts = request_from_bitchute(video_session, "POST",
                                           "https://www.bitchute.com/video/%s/counts/" % video["id"],
                                           data={"csrfmiddlewaretoken": video_csfrtoken})

            if detail == "comments":
                # if comments are also to be scraped, this is another request to make, which
                # returns a convenient JSON response with all the comments to the video
                comments, comment_count = _scrape_comments(video_session, page_text, video["id"])
            else:
                # if we don't need the full comments, we still need another request to get the
                # *amount* of comments
                comment_count = request_from_bitchute(video_session, "POST",
                                                      "https://commentfreely.bitchute.com/api/get_comment_count/",
                                                      data={"csrfmiddlewaretoken": video_csfrtoken,
                                                            "cf_thread": "bc_" + video["id"]})["commentCount"]

    except RuntimeError:
        # one big try-except because handling each request separately is tedious;
        # the caller treats (None, None) as an unrecoverable error
        return (None, None)

    # again, no structured info available for the publication date, but at least we can extract the
    # exact day it was uploaded
    try:
        published = dateparser.parse(
            soup.find(class_="video-publish-date").text.split("published at")[1].strip()[:-1])
    except (AttributeError, IndexError):
        # publication date not on page, or not in the expected format
        published = None

    # the category is only available as a link in the details table; leave it
    # empty, like the other unknown fields, when that link is missing
    category_matches = re.findall(r'<td><a href="/category/([^/]+)/"', page_text)

    # merge data
    video = {
        **video,
        "category": category_matches[0] if category_matches else "",
        "likes": counts["like_count"],
        "dislikes": counts["dislike_count"],
        "channel_subscribers": counts["subscriber_count"],
        "comments": comment_count,
        "parent_id": "",
        "hashtags": ",".join([tag.text for tag in soup.select("#video-hashtags li a")]),
        "views": counts["view_count"]
    }

    if published:
        video["timestamp"] = int(published.timestamp())

    # may need to be increased? bitchute doesn't seem particularly strict
    time.sleep(0.25)
    return (video, comments)
#-----------------------------------------------------------------------------#
def get_videos_user(session, user, csrftoken, detail, max_items=100):
    """
    Scrape videos for given BitChute user

    :param session:  HTTP Session to use
    :param str user:  Username to scrape videos for
    :param str csrftoken:  CSRF token to use for requests
    :param str detail:  Detail level to scrape, basic/detail/comments
    :param int max_items:  Maximum number of videos to scrape
    :return:  Video data dictionaries, as a generator. With any detail level
    other than 'basic', each video is followed by the comments returned for
    it by `append_details`; the generator ends early if `append_details`
    reports an unrecoverable error.
    :raises ValueError:  If a page of videos cannot be fetched or decoded
    """
    base_url = "https://www.bitchute.com/channel/%s/" % user
    url = base_url + "extend/"

    # the channel page itself is only needed for the author name and ID
    container = session.get(base_url)
    container_soup = BeautifulSoup(container.text, 'html.parser')
    author_link = container_soup.select_one(".details .name a")

    headers = {'Referer': base_url, 'Origin': "https://www.bitchute.com/"}

    # the number of videos seen so far doubles as the paging offset
    num_items = 0
    while num_items < max_items:
        post_data = {"csrfmiddlewaretoken": csrftoken, "name": "", "offset": str(num_items)}

        try:
            request = session.post(url, data=post_data, headers=headers)
            if request.status_code != 200:
                raise ConnectionError("HTTP status %i" % request.status_code)
            response = request.json()
        except (ValueError, requests.RequestException, ConnectionError) as e:
            raise ValueError("Could not load videos for BitChute channel %s at offset %i: %s"
                             % (user, num_items, e)) from e

        soup = BeautifulSoup(response["html"], 'html.parser')
        videos = soup.select(".channel-videos-container")
        if not videos:
            break

        for video_element in videos:
            if num_items >= max_items:
                break
            num_items += 1

            link = video_element.select_one(".channel-videos-title a")
            video_id = link["href"].split("/")[-2]
            video = {
                "id": video_id,
                "thread_id": video_id,
                "subject": link.text,
                "body": strip_tags(video_element.select_one(".channel-videos-text").text),
                "author": author_link.text,
                "author_id": author_link["href"].split("/")[2],
                "timestamp": int(
                    dateparser.parse(
                        video_element.select_one(".channel-videos-details.text-right.hidden-xs").text).timestamp()),
                "url": "https://www.bitchute.com" + link["href"],
                "views": video_element.select_one(".video-views").text.strip(),
                "length": video_element.select_one(".video-duration").text.strip(),
                "thumbnail_image": video_element.select_one(".channel-videos-image img")["src"],
            }

            comments = []
            if detail != "basic":
                video, comments = append_details(video, detail)
                if not video:
                    # unrecoverable error while scraping details
                    return

            yield video
            # comments are yielded *after* their video so that results keep
            # the video before its comments
            yield from comments
#-----------------------------------------------------------------------------#
def get_about(user):
    """
    Extract fields from channel's "About" tab

    :param str user:  Channel name, as used in the channel URL
    :return dict:  Dictionary with the keys
    - description: text of the channel description
    - description_links: href of every link in the description
    - created: text following "Created" in the first details paragraph,
      with each whitespace character replaced by a space
    - videos: number preceding "videos" in the second details paragraph
    - owner_url, owner_name: link target and text of the owner paragraph
    - category: text following "Category" in the last details paragraph
    - image: "data-src" of the channel image
    """
    base_url = "https://www.bitchute.com/channel/%s/" % user
    response = requests.get(base_url)
    soup = BeautifulSoup(response.content, 'html.parser')

    about_soup = soup.find('div', {'id': 'channel-about'})
    info_list = about_soup.find('div', {'class': 'channel-about-details'}).find_all('p')
    description_soup = about_soup.find('div', {'id': 'channel-description'})
    owner_soup = soup.find('p', {'class': 'owner'})

    return {
        'description': description_soup.text,
        'description_links': [a['href'] for a in description_soup.find_all('a', href=True)],
        'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')),
        'videos': int(info_list[1].text.split('videos')[0].strip()),
        'owner_url': owner_soup.find('a', href=True)['href'],
        'owner_name': owner_soup.text,
        'category': info_list[-1].text.split('Category')[1].strip(),
        'image': about_soup.find('img', {'alt': 'Channel Image'})['data-src']
    }

View File

@@ -25,13 +25,14 @@ class GettrScraper(cisticola.scraper.Scraper):
if since is not None and post['cdate'] <= int(since.date_archived.timestamp()):
break
posts.append(cisticola.base.ScraperResult(scraper=self.__version__,
platform="Gettr",
channel=username,
platform_id=post['_id'],
date=datetime.fromtimestamp(post['cdate']/1000.),
date_archived=datetime.now(),
raw_data=json.dumps(post)))
posts.append(cisticola.base.ScraperResult(
scraper=self.__version__,
platform="Gettr",
channel=username,
platform_id=post['_id'],
date=datetime.fromtimestamp(post['cdate']/1000.),
date_archived=datetime.now(),
raw_data=json.dumps(post)))
return posts

View File

@@ -23,16 +23,17 @@ class TwitterScraper(cisticola.scraper.Scraper):
TwitterScraper.get_username_from_url(channel.url))
for tweet in scraper.get_items():
if since is not None and tweet.id <= int(since.platform_id):
if since is not None and tweet.date.timestamp() <= since.date_archived.timestamp():
break
posts.append(cisticola.base.ScraperResult(scraper=self.__version__,
platform="Twitter",
channel=channel.id,
platform_id=tweet.id,
date=tweet.date,
date_archived=datetime.now(),
raw_data=tweet.json()))
posts.append(cisticola.base.ScraperResult(
scraper=self.__version__,
platform="Twitter",
channel=channel.id,
platform_id=tweet.id,
date=tweet.date,
date_archived=datetime.now(),
raw_data=tweet.json()))
return posts

View File

@@ -4,6 +4,7 @@
import cisticola
import cisticola.scraper.twitter
from sqlalchemy import create_engine
@@ -19,6 +20,10 @@ test_channels = [cisticola.base.Channel(id=0, name="Logan Williams (test)", plat
cisticola.base.Channel(id=2, name="LizardRepublic", platform_id='lizardrepublic',
category="qanon", followers=None, platform="Gettr",
url="https://www.gettr.com/user/lizardrepublic", country="US",
influencer=None, public=True, chat=False, notes=""),
cisticola.base.Channel(id=3, name="Patriot Front", platform_id='OVv9QZL4sEsC',
category="nazi", followers=None, platform="Bitchute",
url="https://www.bitchute.com/channel/OVv9QZL4sEsC/", country="US",
influencer=None, public=True, chat=False, notes=""),]