mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
implemented Bitchute scraper
This commit is contained in:
3
Pipfile
3
Pipfile
@@ -8,6 +8,9 @@ sqlalchemy = "*"
|
||||
snscrape = "*"
|
||||
loguru = "*"
|
||||
gogettr = "*"
|
||||
requests = "*"
|
||||
bs4 = "*"
|
||||
dateparser = "*"
|
||||
|
||||
[dev-packages]
|
||||
|
||||
|
||||
148
Pipfile.lock
generated
148
Pipfile.lock
generated
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "e335358892de4b581de211099e214f370f8cfd1f86b2cd2b3f0ea6d2d43313bb"
|
||||
"sha256": "ca7eea4b95394e06f8b74eac90d376097fd01231010b594cdcc588a3440f1231"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
@@ -24,6 +24,13 @@
|
||||
"markers": "python_version >= '3.1'",
|
||||
"version": "==4.10.0"
|
||||
},
|
||||
"bs4": {
|
||||
"hashes": [
|
||||
"sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.0.1"
|
||||
},
|
||||
"certifi": {
|
||||
"hashes": [
|
||||
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
|
||||
@@ -47,6 +54,14 @@
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==8.0.3"
|
||||
},
|
||||
"dateparser": {
|
||||
"hashes": [
|
||||
"sha256:faa2b97f51f3b5ff1ba2f17be90de2b733fb6191f89b4058787473e8202f3044",
|
||||
"sha256:fec344db1f73d005182e214c0ff27313c748bbe0c1638ce9d48a809ddfdab2a0"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.1.0"
|
||||
},
|
||||
"filelock": {
|
||||
"hashes": [
|
||||
"sha256:9cd540a9352e432c7246a48fe4e8712b10acb1df2ad1f30e8c070b82ae1fed85",
|
||||
@@ -215,15 +230,124 @@
|
||||
],
|
||||
"version": "==1.7.1"
|
||||
},
|
||||
"python-dateutil": {
|
||||
"hashes": [
|
||||
"sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86",
|
||||
"sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.8.2"
|
||||
},
|
||||
"pytz": {
|
||||
"hashes": [
|
||||
"sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c",
|
||||
"sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326"
|
||||
],
|
||||
"version": "==2021.3"
|
||||
},
|
||||
"pytz-deprecation-shim": {
|
||||
"hashes": [
|
||||
"sha256:8314c9692a636c8eb3bda879b9f119e350e93223ae83e70e80c31675a0fdc1a6",
|
||||
"sha256:af097bae1b616dde5c5744441e2ddc69e74dfdcb0c263129610d85b87445a59d"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
|
||||
"version": "==0.1.0.post0"
|
||||
},
|
||||
"regex": {
|
||||
"hashes": [
|
||||
"sha256:04611cc0f627fc4a50bc4a9a2e6178a974c6a6a4aa9c1cca921635d2c47b9c87",
|
||||
"sha256:0b5d6f9aed3153487252d00a18e53f19b7f52a1651bc1d0c4b5844bc286dfa52",
|
||||
"sha256:0d2f5c3f7057530afd7b739ed42eb04f1011203bc5e4663e1e1d01bb50f813e3",
|
||||
"sha256:11772be1eb1748e0e197a40ffb82fb8fd0d6914cd147d841d9703e2bef24d288",
|
||||
"sha256:1333b3ce73269f986b1fa4d5d395643810074dc2de5b9d262eb258daf37dc98f",
|
||||
"sha256:16f81025bb3556eccb0681d7946e2b35ff254f9f888cff7d2120e8826330315c",
|
||||
"sha256:1a171eaac36a08964d023eeff740b18a415f79aeb212169080c170ec42dd5184",
|
||||
"sha256:1d6301f5288e9bdca65fab3de6b7de17362c5016d6bf8ee4ba4cbe833b2eda0f",
|
||||
"sha256:1e031899cb2bc92c0cf4d45389eff5b078d1936860a1be3aa8c94fa25fb46ed8",
|
||||
"sha256:1f8c0ae0a0de4e19fddaaff036f508db175f6f03db318c80bbc239a1def62d02",
|
||||
"sha256:2245441445099411b528379dee83e56eadf449db924648e5feb9b747473f42e3",
|
||||
"sha256:22709d701e7037e64dae2a04855021b62efd64a66c3ceed99dfd684bfef09e38",
|
||||
"sha256:24c89346734a4e4d60ecf9b27cac4c1fee3431a413f7aa00be7c4d7bbacc2c4d",
|
||||
"sha256:25716aa70a0d153cd844fe861d4f3315a6ccafce22b39d8aadbf7fcadff2b633",
|
||||
"sha256:2dacb3dae6b8cc579637a7b72f008bff50a94cde5e36e432352f4ca57b9e54c4",
|
||||
"sha256:34316bf693b1d2d29c087ee7e4bb10cdfa39da5f9c50fa15b07489b4ab93a1b5",
|
||||
"sha256:36b2d700a27e168fa96272b42d28c7ac3ff72030c67b32f37c05616ebd22a202",
|
||||
"sha256:37978254d9d00cda01acc1997513f786b6b971e57b778fbe7c20e30ae81a97f3",
|
||||
"sha256:38289f1690a7e27aacd049e420769b996826f3728756859420eeee21cc857118",
|
||||
"sha256:385ccf6d011b97768a640e9d4de25412204fbe8d6b9ae39ff115d4ff03f6fe5d",
|
||||
"sha256:3c7ea86b9ca83e30fa4d4cd0eaf01db3ebcc7b2726a25990966627e39577d729",
|
||||
"sha256:49810f907dfe6de8da5da7d2b238d343e6add62f01a15d03e2195afc180059ed",
|
||||
"sha256:519c0b3a6fbb68afaa0febf0d28f6c4b0a1074aefc484802ecb9709faf181607",
|
||||
"sha256:51f02ca184518702975b56affde6c573ebad4e411599005ce4468b1014b4786c",
|
||||
"sha256:552a39987ac6655dad4bf6f17dd2b55c7b0c6e949d933b8846d2e312ee80005a",
|
||||
"sha256:596f5ae2eeddb79b595583c2e0285312b2783b0ec759930c272dbf02f851ff75",
|
||||
"sha256:6014038f52b4b2ac1fa41a58d439a8a00f015b5c0735a0cd4b09afe344c94899",
|
||||
"sha256:61ebbcd208d78658b09e19c78920f1ad38936a0aa0f9c459c46c197d11c580a0",
|
||||
"sha256:6213713ac743b190ecbf3f316d6e41d099e774812d470422b3a0f137ea635832",
|
||||
"sha256:637e27ea1ebe4a561db75a880ac659ff439dec7f55588212e71700bb1ddd5af9",
|
||||
"sha256:6aa427c55a0abec450bca10b64446331b5ca8f79b648531138f357569705bc4a",
|
||||
"sha256:6ca45359d7a21644793de0e29de497ef7f1ae7268e346c4faf87b421fea364e6",
|
||||
"sha256:6db1b52c6f2c04fafc8da17ea506608e6be7086715dab498570c3e55e4f8fbd1",
|
||||
"sha256:752e7ddfb743344d447367baa85bccd3629c2c3940f70506eb5f01abce98ee68",
|
||||
"sha256:760c54ad1b8a9b81951030a7e8e7c3ec0964c1cb9fee585a03ff53d9e531bb8e",
|
||||
"sha256:768632fd8172ae03852e3245f11c8a425d95f65ff444ce46b3e673ae5b057b74",
|
||||
"sha256:7a0b9f6a1a15d494b35f25ed07abda03209fa76c33564c09c9e81d34f4b919d7",
|
||||
"sha256:7e070d3aef50ac3856f2ef5ec7214798453da878bb5e5a16c16a61edf1817cc3",
|
||||
"sha256:7e12949e5071c20ec49ef00c75121ed2b076972132fc1913ddf5f76cae8d10b4",
|
||||
"sha256:7e26eac9e52e8ce86f915fd33380f1b6896a2b51994e40bb094841e5003429b4",
|
||||
"sha256:85ffd6b1cb0dfb037ede50ff3bef80d9bf7fa60515d192403af6745524524f3b",
|
||||
"sha256:8618d9213a863c468a865e9d2ec50221015f7abf52221bc927152ef26c484b4c",
|
||||
"sha256:8acef4d8a4353f6678fd1035422a937c2170de58a2b29f7da045d5249e934101",
|
||||
"sha256:8d2f355a951f60f0843f2368b39970e4667517e54e86b1508e76f92b44811a8a",
|
||||
"sha256:90b6840b6448203228a9d8464a7a0d99aa8fa9f027ef95fe230579abaf8a6ee1",
|
||||
"sha256:9187500d83fd0cef4669385cbb0961e227a41c0c9bc39219044e35810793edf7",
|
||||
"sha256:93c20777a72cae8620203ac11c4010365706062aa13aaedd1a21bb07adbb9d5d",
|
||||
"sha256:93cce7d422a0093cfb3606beae38a8e47a25232eea0f292c878af580a9dc7605",
|
||||
"sha256:94c623c331a48a5ccc7d25271399aff29729fa202c737ae3b4b28b89d2b0976d",
|
||||
"sha256:97f32dc03a8054a4c4a5ab5d761ed4861e828b2c200febd4e46857069a483916",
|
||||
"sha256:9a2bf98ac92f58777c0fafc772bf0493e67fcf677302e0c0a630ee517a43b949",
|
||||
"sha256:a602bdc8607c99eb5b391592d58c92618dcd1537fdd87df1813f03fed49957a6",
|
||||
"sha256:a9d24b03daf7415f78abc2d25a208f234e2c585e5e6f92f0204d2ab7b9ab48e3",
|
||||
"sha256:abfcb0ef78df0ee9df4ea81f03beea41849340ce33a4c4bd4dbb99e23ec781b6",
|
||||
"sha256:b013f759cd69cb0a62de954d6d2096d648bc210034b79b1881406b07ed0a83f9",
|
||||
"sha256:b02e3e72665cd02afafb933453b0c9f6c59ff6e3708bd28d0d8580450e7e88af",
|
||||
"sha256:b52cc45e71657bc4743a5606d9023459de929b2a198d545868e11898ba1c3f59",
|
||||
"sha256:ba37f11e1d020969e8a779c06b4af866ffb6b854d7229db63c5fdddfceaa917f",
|
||||
"sha256:bb804c7d0bfbd7e3f33924ff49757de9106c44e27979e2492819c16972ec0da2",
|
||||
"sha256:bf594cc7cc9d528338d66674c10a5b25e3cde7dd75c3e96784df8f371d77a298",
|
||||
"sha256:c38baee6bdb7fe1b110b6b3aaa555e6e872d322206b7245aa39572d3fc991ee4",
|
||||
"sha256:c73d2166e4b210b73d1429c4f1ca97cea9cc090e5302df2a7a0a96ce55373f1c",
|
||||
"sha256:c9099bf89078675c372339011ccfc9ec310310bf6c292b413c013eb90ffdcafc",
|
||||
"sha256:cf0db26a1f76aa6b3aa314a74b8facd586b7a5457d05b64f8082a62c9c49582a",
|
||||
"sha256:d19a34f8a3429bd536996ad53597b805c10352a8561d8382e05830df389d2b43",
|
||||
"sha256:da80047524eac2acf7c04c18ac7a7da05a9136241f642dd2ed94269ef0d0a45a",
|
||||
"sha256:de2923886b5d3214be951bc2ce3f6b8ac0d6dfd4a0d0e2a4d2e5523d8046fdfb",
|
||||
"sha256:defa0652696ff0ba48c8aff5a1fac1eef1ca6ac9c660b047fc8e7623c4eb5093",
|
||||
"sha256:e54a1eb9fd38f2779e973d2f8958fd575b532fe26013405d1afb9ee2374e7ab8",
|
||||
"sha256:e5c31d70a478b0ca22a9d2d76d520ae996214019d39ed7dd93af872c7f301e52",
|
||||
"sha256:ebaeb93f90c0903233b11ce913a7cb8f6ee069158406e056f884854c737d2442",
|
||||
"sha256:ecfe51abf7f045e0b9cdde71ca9e153d11238679ef7b5da6c82093874adf3338",
|
||||
"sha256:f99112aed4fb7cee00c7f77e8b964a9b10f69488cdff626ffd797d02e2e4484f",
|
||||
"sha256:fd914db437ec25bfa410f8aa0aa2f3ba87cdfc04d9919d608d02330947afaeab"
|
||||
],
|
||||
"version": "==2022.1.18"
|
||||
},
|
||||
"requests": {
|
||||
"extras": [],
|
||||
"hashes": [
|
||||
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
|
||||
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
|
||||
"index": "pypi",
|
||||
"version": "==2.27.1"
|
||||
},
|
||||
"six": {
|
||||
"hashes": [
|
||||
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
|
||||
"sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==1.16.0"
|
||||
},
|
||||
"snscrape": {
|
||||
"hashes": [
|
||||
"sha256:af30d12872da692ff9ccaf5651962edceb1fd4a28cf7cc92c8c898902f009ce3",
|
||||
@@ -282,12 +406,28 @@
|
||||
"index": "pypi",
|
||||
"version": "==1.4.31"
|
||||
},
|
||||
"tzdata": {
|
||||
"hashes": [
|
||||
"sha256:3eee491e22ebfe1e5cfcc97a4137cd70f092ce59144d81f8924a844de05ba8f5",
|
||||
"sha256:68dbe41afd01b867894bbdfd54fa03f468cfa4f0086bfb4adcd8de8f24f3ee21"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2021.5"
|
||||
},
|
||||
"tzlocal": {
|
||||
"hashes": [
|
||||
"sha256:0f28015ac68a5c067210400a9197fc5d36ba9bc3f8eaf1da3cbd59acdfed9e09",
|
||||
"sha256:28ba8d9fcb6c9a782d6e0078b4f6627af1ea26aeaa32b4eab5324abc7df4149f"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==4.1"
|
||||
},
|
||||
"urllib3": {
|
||||
"hashes": [
|
||||
"sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed",
|
||||
"sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'",
|
||||
"version": "==1.26.8"
|
||||
}
|
||||
},
|
||||
|
||||
445
cisticola/scraper/bitchute.py
Normal file
445
cisticola/scraper/bitchute.py
Normal file
@@ -0,0 +1,445 @@
|
||||
from datetime import datetime
|
||||
import time
|
||||
import re
|
||||
from html.parser import HTMLParser
|
||||
import dateparser
|
||||
import json
|
||||
from typing import List
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
import cisticola.base
|
||||
|
||||
class BitchuteScraper(cisticola.scraper.Scraper):
    """An implementation of a Scraper for Bitchute, using scraping logic
    adapted from the 4cat library"""
    __version__ = "BitchuteScraper 0.0.1"

    @staticmethod
    def get_username_from_url(url):
        """Extract the channel identifier from a Bitchute channel URL.

        :param str url: Channel URL, e.g.
            ``https://www.bitchute.com/channel/<name>/``
        :return: The channel identifier, or ``None`` when ``url`` is not a
            Bitchute channel URL (so ``can_handle`` can reject it).
        """
        marker = 'bitchute.com/channel/'
        if not url or marker not in url:
            return None

        username = url.split(marker)[-1].strip('/')

        return username or None

    def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]:
        """Scrape the videos of ``channel``, newest first.

        :param channel: Channel whose ``url`` points at a Bitchute channel.
        :param since: Optional previous result; scraping stops at the first
            video whose timestamp is not newer than ``since.date_archived``.
        :return: One ``ScraperResult`` per scraped video.
        :raises ValueError: If ``channel.url`` is not a Bitchute channel URL.
        """
        username = BitchuteScraper.get_username_from_url(channel.url)
        if username is None:
            raise ValueError("Not a Bitchute channel URL: %s" % channel.url)

        posts = []

        # The context manager releases the session's pooled connections even
        # if scraping fails part-way through.
        with requests.Session() as session:
            session.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"

            # The search page carries the CSRF token that the channel
            # pagination endpoint expects.
            request = session.get("https://www.bitchute.com/search")
            csrftoken = BeautifulSoup(request.text, 'html.parser').findAll(
                "input", {"name": "csrfmiddlewaretoken"})[0].get("value")
            time.sleep(0.25)

            # Don't scrape comment information
            # TODO implement framework for processing and storing comments
            detail = 'basic'

            scraper = get_videos_user(session, username, csrftoken, detail)

            for post in scraper:
                if since is not None and post['timestamp'] <= since.date_archived.timestamp():
                    break

                posts.append(cisticola.base.ScraperResult(
                    scraper=self.__version__,
                    platform="Bitchute",
                    channel=channel.id,
                    platform_id=post['id'],
                    date=datetime.fromtimestamp(post['timestamp']),
                    date_archived=datetime.now(),
                    raw_data=json.dumps(post)))

        return posts

    def can_handle(self, channel):
        """Return ``True`` when ``channel`` is a Bitchute channel whose URL
        contains a channel identifier, ``False`` otherwise."""
        return (channel.platform == "Bitchute"
                and BitchuteScraper.get_username_from_url(channel.url) is not None)
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
# Matches <br>, <br/> and <br /> in any letter case.
_BR_TAG = re.compile(r"<br\s*/?>", re.IGNORECASE)
# Matches runs of consecutive newlines so they can be collapsed to one.
_NEWLINE_RUN = re.compile(r"\n+")


class _HTMLStripper(HTMLParser):
    """HTML parser that keeps only the text nodes of what it is fed."""

    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.fed = []

    def handle_data(self, data):
        self.fed.append(data)

    def get_data(self):
        return "".join(self.fed)


def strip_tags(html, convert_newlines=True):
    """
    Strip HTML from a string

    :param html: HTML to strip
    :param convert_newlines: Convert <br> (including the self-closing forms)
        and </p> tags to newlines before stripping, collapsing runs of
        newlines into one
    :return: Stripped HTML; an empty string for empty or ``None`` input
    """
    if not html:
        return ""

    if convert_newlines:
        html = _BR_TAG.sub("\n", html).replace("</p>", "</p>\n")
        html = _NEWLINE_RUN.sub("\n", html)

    stripper = _HTMLStripper()
    stripper.feed(html)
    # close() flushes text the parser is still buffering; without it, trailing
    # text containing a bare "&" (e.g. "Q&A") is held back waiting for a
    # possible character reference and never reaches handle_data.
    stripper.close()
    return stripper.get_data()
|
||||
|
||||
#-----------------------------------------------------------------------------#
|
||||
|
||||
def request_from_bitchute(session, method, url, headers=None, data=None):
    """
    Request something via the BitChute API (or non-API)

    To avoid having to write the same error-checking everywhere, this takes
    care of retrying on failure, et cetera

    :param session: Requests session
    :param str method: GET or POST (case-insensitive)
    :param str url: URL to fetch
    :param dict headers: Headers to pass with the request
    :param dict data: Data/params to send with the request; sent as the form
        body for POST and as query parameters for GET

    :return: The decoded JSON body of the response
    :raises NotImplementedError: If ``method`` is neither GET nor POST
    :raises RuntimeError: If the response body is not valid JSON, or if the
        request still fails after three attempts
    """
    verb = method.lower()
    if verb not in ("get", "post"):
        raise NotImplementedError("Unsupported HTTP method: %s" % method)

    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            if verb == "post":
                request = session.post(url, headers=headers, data=data)
            else:
                request = session.get(url, headers=headers, params=data)

            if request.status_code >= 300:
                raise ValueError("Response %i from BitChute for URL %s, need to retry" % (request.status_code, url))

            return request.json()

        except json.JSONDecodeError as e:
            # Must be listed before the ValueError handler below:
            # JSONDecodeError subclasses ValueError and would otherwise be
            # retried instead of being reported as a hard failure.
            raise RuntimeError("Invalid JSON in BitChute response for URL %s" % url) from e

        except (ConnectionResetError, requests.RequestException, ValueError):
            # Transient failure: back off a little longer each time, but do
            # not sleep after the last attempt.
            if attempt < max_attempts:
                time.sleep(attempt * 2)

    raise RuntimeError("Request to %s failed after %i attempts" % (url, max_attempts))
|
||||
|
||||
#-----------------------------------------------------------------------------#
|
||||
|
||||
def append_details(video, detail):
    """
    Append extra metadata to video data

    Fetches the BitChute video detail page to scrape extra data for the given video.

    :param dict video: Video details as scraped so far; must contain "url" and "id"
    :param str detail: Detail level. If 'comments', also scrape video comments.

    :return tuple: First item: updated video data, second: list of comments.
        Moderated, embedded or non-200 pages yield the video with only its
        "category" set and an empty comment list. ``(None, None)`` is returned
        when a follow-up BitChute request fails with a ``RuntimeError``.
    """
    comments = []

    video = {
        **video,
        "likes": "",
        "dislikes": "",
        "channel_subscribers": "",
        "comments": "",
        "hashtags": "",
        "parent_id": "",
        "video_url": ""
    }

    # Page titles BitChute shows instead of the player for moderated content.
    blocked_titles = (
        "<h1 class=\"page-title\">Video Restricted</h1>",
        "<h1 class=\"page-title\">Video Blocked</h1>",
        "<h1 class=\"page-title\">Channel Blocked</h1>",
        "<h1 class=\"page-title\">Channel Restricted</h1>",
    )
    # (text on the page, resulting category); checked in order, first hit wins.
    moderation_reasons = (
        ("This video is unavailable as the contents have been deemed potentially illegal", "moderated-illegal"),
        ("Viewing of this video is restricted, as it has been marked as Not Safe For Life", "moderated-nsfl"),
        ("Contains Incitement to Hatred", "moderated-incitement"),
        ("Platform Misuse", "moderated-misuse"),
        ("Terrorism & Violent Extremism", "moderated-terrorism-extremism"),
        ("Copyright</h4>", "moderated-copyright"),
    )

    try:
        # to get more details per video, we need to request the actual video detail page
        # start a new session, to not interfere with the CSRF token from the search session;
        # the context manager closes it on every exit path
        with requests.session() as video_session:
            video_page = video_session.get(video["url"])
            page_text = video_page.text

            if any(title in page_text for title in blocked_titles):
                video["category"] = next(
                    (category for marker, category in moderation_reasons if marker in page_text),
                    "moderated-other")
                return (video, [])

            if "<iframe class=\"rumble\"" in page_text:
                # some videos are actually embeds from rumble?
                # these are iframes, so at the moment we cannot simply extract
                # their info from the page, so we skip them. In the future we
                # could add an extra request to get the relevant info, but so
                # far the only examples I've seen are actually 'video not found'
                video = {
                    **video,
                    "category": "error-embed-from-rumble"
                }
                return (video, [])

            if video_page.status_code != 200:
                video = {
                    **video,
                    "category": "error-%i" % video_page.status_code
                }
                return (video, [])

            soup = BeautifulSoup(page_text, 'html.parser')
            video_csfrtoken = soup.findAll("input", {"name": "csrfmiddlewaretoken"})[0].get("value")

            video["video_url"] = soup.select_one("video#player source").get("src")
            video["thumbnail_image"] = soup.select_one("video#player").get("poster")
            video["subject"] = soup.select_one("h1#video-title").text
            video["author"] = soup.select_one("div.channel-banner p.name a").text
            video["author_id"] = soup.select_one("div.channel-banner p.name a").get("href").split("/")[2]
            video["body"] = soup.select_one("div#video-description").encode_contents().decode("utf-8").strip()

            # we need *two more requests* to get the comment count and like/dislike counts
            # this seems to be because bitchute uses a third-party comment widget
            video_session.headers = {'Referer': video["url"], 'Origin': video["url"]}
            counts = request_from_bitchute(video_session, "POST", "https://www.bitchute.com/video/%s/counts/" % video["id"], data={"csrfmiddlewaretoken": video_csfrtoken})

            if detail == "comments":
                # if comments are also to be scraped, this is another request to make, which returns
                # a convenient JSON response with all the comments to the video
                # we need yet another token for this, which we can extract from a bit of inline
                # javascript on the page
                comment_script = None
                for line in page_text.split("\n"):
                    if "initComments(" in line:
                        comment_script = line.split("initComments(")[1]
                        break

                if not comment_script:
                    # no script to extract comments from, cannot load
                    comment_count = -1
                else:
                    # make the request
                    comment_count = 0
                    url = comment_script.split("'")[1]
                    comment_csrf = comment_script.split("'")[3]
                    comments_data = request_from_bitchute(video_session, "POST", url + "/api/get_comments/", data={"cf_auth": comment_csrf, "commentCount": 0})

                    for comment in comments_data:
                        comment_count += 1

                        if comment.get("profile_picture_url", None):
                            thumbnail_image = url + comment.get("profile_picture_url")
                        else:
                            thumbnail_image = ""

                        comments.append({
                            "id": comment["id"],
                            "thread_id": video["id"],
                            "subject": "",
                            "body": comment["content"],
                            "author": comment["fullname"],
                            "author_id": comment["creator"],
                            "timestamp": int(dateparser.parse(comment["created"]).timestamp()),
                            "url": "",
                            "views": "",
                            "length": "",
                            "hashtags": "",
                            "thumbnail_image": thumbnail_image,
                            "likes": comment["upvote_count"],
                            "category": "comment",
                            "dislikes": "",
                            "channel_subscribers": "",
                            "comments": "",
                            # top-level comments have no parent and hang off the video itself
                            "parent_id": comment.get("parent", video["id"]),
                        })

            else:
                # if we don't need the full comments, we still need another request to get the *amount*
                # of comments,
                comment_count = request_from_bitchute(video_session, "POST",
                                                      "https://commentfreely.bitchute.com/api/get_comment_count/",
                                                      data={"csrfmiddlewaretoken": video_csfrtoken,
                                                            "cf_thread": "bc_" + video["id"]})["commentCount"]

    except RuntimeError:
        # we wrap this in one big try-catch because doing it for each request separately is tedious
        # hm... maybe this should be in a helper function
        return (None, None)

    # again, no structured info available for the publication date, but at least we can extract the
    # exact day it was uploaded
    try:
        published = dateparser.parse(
            soup.find(class_="video-publish-date").text.split("published at")[1].strip()[:-1])
    except (AttributeError, IndexError):
        # publication date not on page, or not in the expected "published at ..." format
        published = None

    # fall back to whatever category was already known when the page has no category link
    category_match = re.search(r'<td><a href="/category/([^/]+)/"', page_text)
    category = category_match.group(1) if category_match else video.get("category", "")

    # merge data
    video = {
        **video,
        "category": category,
        "likes": counts["like_count"],
        "dislikes": counts["dislike_count"],
        "channel_subscribers": counts["subscriber_count"],
        "comments": comment_count,
        "parent_id": "",
        "hashtags": ",".join([tag.text for tag in soup.select("#video-hashtags li a")]),
        "views": counts["view_count"]
    }

    if published:
        video["timestamp"] = int(published.timestamp())

    # may need to be increased? bitchute doesn't seem particularly strict
    time.sleep(0.25)
    return (video, comments)
|
||||
|
||||
#-----------------------------------------------------------------------------#
|
||||
|
||||
def get_videos_user(session, user, csrftoken, detail, max_items=100):
    """
    Scrape videos for given BitChute user

    :param session: HTTP Session to use
    :param str user: Username to scrape videos for
    :param str csrftoken: CSRF token to use for requests
    :param str detail: Detail level to scrape, basic/detail/comments
    :param int max_items: Maximum number of videos to yield (comments
        yielded alongside a video do not count towards this limit)

    :return: Video data dictionaries, as a generator. With a detail level of
        'comments', each video is followed by its comment dictionaries. The
        generator ends early if fetching a video's details fails.
    :raises ValueError: If a page of the channel's video list cannot be
        fetched or decoded
    """
    num_items = 0
    offset = 0

    base_url = "https://www.bitchute.com/channel/%s/" % user
    url = base_url + "extend/"

    container = session.get(base_url)
    container_soup = BeautifulSoup(container.text, 'html.parser')
    headers = {'Referer': base_url, 'Origin': "https://www.bitchute.com/"}

    while True:
        post_data = {"csrfmiddlewaretoken": csrftoken, "name": "", "offset": str(offset)}

        try:
            request = session.post(url, data=post_data, headers=headers)
        except (requests.RequestException, ConnectionError) as e:
            raise ValueError("Could not fetch videos from %s: %s" % (url, e)) from e

        if request.status_code != 200:
            raise ValueError("Could not fetch videos from %s: HTTP %i" % (url, request.status_code))

        try:
            response = request.json()
        except (ValueError, requests.RequestException) as e:
            # ValueError covers json.JSONDecodeError
            raise ValueError("Invalid JSON in video list from %s" % url) from e

        soup = BeautifulSoup(response["html"], 'html.parser')
        videos = soup.select(".channel-videos-container")
        comments = []

        if len(videos) == 0 or num_items >= max_items:
            break

        # The channel header is the same for every video, so look it up once
        # per page rather than once per video.
        author_link = container_soup.select_one(".details .name a")
        author = author_link.text
        author_id = author_link["href"].split("/")[2]

        for video_element in videos:
            if num_items >= max_items:
                break
            num_items += 1

            # the next page request resumes after the videos seen so far
            offset += 1

            link = video_element.select_one(".channel-videos-title a")
            video_id = link["href"].split("/")[-2]
            video = {
                "id": video_id,
                "thread_id": video_id,
                "subject": link.text,
                "body": strip_tags(video_element.select_one(".channel-videos-text").text),
                "author": author,
                "author_id": author_id,
                "timestamp": int(
                    dateparser.parse(
                        video_element.select_one(".channel-videos-details.text-right.hidden-xs").text).timestamp()),
                "url": "https://www.bitchute.com" + link["href"],
                "views": video_element.select_one(".video-views").text.strip(),
                "length": video_element.select_one(".video-duration").text.strip(),
                "thumbnail_image": video_element.select_one(".channel-videos-image img")["src"],
            }

            if detail != "basic":
                video, comments = append_details(video, detail)
                if not video:
                    # unrecoverable error while scraping details
                    return

            yield video

            # these need to be yielded *after* the video because else the result file will have the comments
            # before the video, which is weird
            yield from comments
|
||||
#-----------------------------------------------------------------------------#
|
||||
|
||||
def get_about(user):
    """
    Extract fields from channel's "About" tab

    :param str user: Channel identifier as used in the channel URL
    :return dict: Channel description, links found in the description,
        creation date text, video count, owner URL and name, category and
        channel image URL, as scraped from the channel page
    """
    base_url = "https://www.bitchute.com/channel/%s/" % user

    response = requests.get(base_url)
    soup = BeautifulSoup(response.content, 'html.parser')

    about_soup = soup.find('div', {'id': 'channel-about'})
    info_list = about_soup.find('div', {'class': 'channel-about-details'}).find_all('p')
    description_soup = about_soup.find('div', {'id': 'channel-description'})
    owner_soup = soup.find('p', {'class': 'owner'})

    about = {
        'description': description_soup.text,
        'description_links': [a['href'] for a in description_soup.find_all('a', href=True)],
        # normalise any whitespace character in the creation date to a plain space
        'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')),
        'videos': int(info_list[1].text.split('videos')[0].strip()),
        'owner_url': owner_soup.find('a', href=True)['href'],
        'owner_name': owner_soup.text,
        'category': info_list[-1].text.split('Category')[1].strip(),
        'image': about_soup.find('img', {'alt': 'Channel Image'})['data-src']
    }

    return about
|
||||
@@ -25,13 +25,14 @@ class GettrScraper(cisticola.scraper.Scraper):
|
||||
if since is not None and post['cdate'] <= int(since.date_archived.timestamp()):
|
||||
break
|
||||
|
||||
posts.append(cisticola.base.ScraperResult(scraper=self.__version__,
|
||||
platform="Gettr",
|
||||
channel=username,
|
||||
platform_id=post['_id'],
|
||||
date=datetime.fromtimestamp(post['cdate']/1000.),
|
||||
date_archived=datetime.now(),
|
||||
raw_data=json.dumps(post)))
|
||||
posts.append(cisticola.base.ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="Gettr",
|
||||
channel=username,
|
||||
platform_id=post['_id'],
|
||||
date=datetime.fromtimestamp(post['cdate']/1000.),
|
||||
date_archived=datetime.now(),
|
||||
raw_data=json.dumps(post)))
|
||||
|
||||
return posts
|
||||
|
||||
|
||||
@@ -23,16 +23,17 @@ class TwitterScraper(cisticola.scraper.Scraper):
|
||||
TwitterScraper.get_username_from_url(channel.url))
|
||||
|
||||
for tweet in scraper.get_items():
|
||||
if since is not None and tweet.id <= int(since.platform_id):
|
||||
if since is not None and tweet.date.timestamp() <= since.date_archived.timestamp():
|
||||
break
|
||||
|
||||
posts.append(cisticola.base.ScraperResult(scraper=self.__version__,
|
||||
platform="Twitter",
|
||||
channel=channel.id,
|
||||
platform_id=tweet.id,
|
||||
date=tweet.date,
|
||||
date_archived=datetime.now(),
|
||||
raw_data=tweet.json()))
|
||||
posts.append(cisticola.base.ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="Twitter",
|
||||
channel=channel.id,
|
||||
platform_id=tweet.id,
|
||||
date=tweet.date,
|
||||
date_archived=datetime.now(),
|
||||
raw_data=tweet.json()))
|
||||
|
||||
return posts
|
||||
|
||||
|
||||
5
test.py
5
test.py
@@ -4,6 +4,7 @@
|
||||
|
||||
import cisticola
|
||||
import cisticola.scraper.twitter
|
||||
|
||||
from sqlalchemy import create_engine
|
||||
|
||||
|
||||
@@ -19,6 +20,10 @@ test_channels = [cisticola.base.Channel(id=0, name="Logan Williams (test)", plat
|
||||
cisticola.base.Channel(id=2, name="LizardRepublic", platform_id='lizardrepublic',
|
||||
category="qanon", followers=None, platform="Gettr",
|
||||
url="https://www.gettr.com/user/lizardrepublic", country="US",
|
||||
influencer=None, public=True, chat=False, notes=""),
|
||||
cisticola.base.Channel(id=3, name="Patriot Front", platform_id='OVv9QZL4sEsC',
|
||||
category="nazi", followers=None, platform="Bitchute",
|
||||
url="https://www.bitchute.com/channel/OVv9QZL4sEsC/", country="US",
|
||||
influencer=None, public=True, chat=False, notes=""),]
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user