From 139459e3b25ac4934d9d9562cceb1ed5258cde3c Mon Sep 17 00:00:00 2001
From: Tristan Lee
Date: Fri, 18 Feb 2022 12:45:10 -0600
Subject: [PATCH] implemented Bitchute scraper
---
Pipfile | 3 +
Pipfile.lock | 148 ++++++++++-
cisticola/scraper/bitchute.py | 445 ++++++++++++++++++++++++++++++++++
cisticola/scraper/gettr.py | 15 +-
cisticola/scraper/twitter.py | 17 +-
test.py | 5 +
6 files changed, 614 insertions(+), 19 deletions(-)
create mode 100644 cisticola/scraper/bitchute.py
diff --git a/Pipfile b/Pipfile
index 2e6227d..a1312b1 100644
--- a/Pipfile
+++ b/Pipfile
@@ -8,6 +8,9 @@ sqlalchemy = "*"
snscrape = "*"
loguru = "*"
gogettr = "*"
+requests = "*"
+bs4 = "*"
+dateparser = "*"
[dev-packages]
diff --git a/Pipfile.lock b/Pipfile.lock
index 2743f87..f0b8511 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "e335358892de4b581de211099e214f370f8cfd1f86b2cd2b3f0ea6d2d43313bb"
+ "sha256": "ca7eea4b95394e06f8b74eac90d376097fd01231010b594cdcc588a3440f1231"
},
"pipfile-spec": 6,
"requires": {
@@ -24,6 +24,13 @@
"markers": "python_version >= '3.1'",
"version": "==4.10.0"
},
+ "bs4": {
+ "hashes": [
+ "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"
+ ],
+ "index": "pypi",
+ "version": "==0.0.1"
+ },
"certifi": {
"hashes": [
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
@@ -47,6 +54,14 @@
"markers": "python_version >= '3.6'",
"version": "==8.0.3"
},
+ "dateparser": {
+ "hashes": [
+ "sha256:faa2b97f51f3b5ff1ba2f17be90de2b733fb6191f89b4058787473e8202f3044",
+ "sha256:fec344db1f73d005182e214c0ff27313c748bbe0c1638ce9d48a809ddfdab2a0"
+ ],
+ "index": "pypi",
+ "version": "==1.1.0"
+ },
"filelock": {
"hashes": [
"sha256:9cd540a9352e432c7246a48fe4e8712b10acb1df2ad1f30e8c070b82ae1fed85",
@@ -215,15 +230,124 @@
],
"version": "==1.7.1"
},
+ "python-dateutil": {
+ "hashes": [
+ "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86",
+ "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"
+ ],
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+ "version": "==2.8.2"
+ },
+ "pytz": {
+ "hashes": [
+ "sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c",
+ "sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326"
+ ],
+ "version": "==2021.3"
+ },
+ "pytz-deprecation-shim": {
+ "hashes": [
+ "sha256:8314c9692a636c8eb3bda879b9f119e350e93223ae83e70e80c31675a0fdc1a6",
+ "sha256:af097bae1b616dde5c5744441e2ddc69e74dfdcb0c263129610d85b87445a59d"
+ ],
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
+ "version": "==0.1.0.post0"
+ },
+ "regex": {
+ "hashes": [
+ "sha256:04611cc0f627fc4a50bc4a9a2e6178a974c6a6a4aa9c1cca921635d2c47b9c87",
+ "sha256:0b5d6f9aed3153487252d00a18e53f19b7f52a1651bc1d0c4b5844bc286dfa52",
+ "sha256:0d2f5c3f7057530afd7b739ed42eb04f1011203bc5e4663e1e1d01bb50f813e3",
+ "sha256:11772be1eb1748e0e197a40ffb82fb8fd0d6914cd147d841d9703e2bef24d288",
+ "sha256:1333b3ce73269f986b1fa4d5d395643810074dc2de5b9d262eb258daf37dc98f",
+ "sha256:16f81025bb3556eccb0681d7946e2b35ff254f9f888cff7d2120e8826330315c",
+ "sha256:1a171eaac36a08964d023eeff740b18a415f79aeb212169080c170ec42dd5184",
+ "sha256:1d6301f5288e9bdca65fab3de6b7de17362c5016d6bf8ee4ba4cbe833b2eda0f",
+ "sha256:1e031899cb2bc92c0cf4d45389eff5b078d1936860a1be3aa8c94fa25fb46ed8",
+ "sha256:1f8c0ae0a0de4e19fddaaff036f508db175f6f03db318c80bbc239a1def62d02",
+ "sha256:2245441445099411b528379dee83e56eadf449db924648e5feb9b747473f42e3",
+ "sha256:22709d701e7037e64dae2a04855021b62efd64a66c3ceed99dfd684bfef09e38",
+ "sha256:24c89346734a4e4d60ecf9b27cac4c1fee3431a413f7aa00be7c4d7bbacc2c4d",
+ "sha256:25716aa70a0d153cd844fe861d4f3315a6ccafce22b39d8aadbf7fcadff2b633",
+ "sha256:2dacb3dae6b8cc579637a7b72f008bff50a94cde5e36e432352f4ca57b9e54c4",
+ "sha256:34316bf693b1d2d29c087ee7e4bb10cdfa39da5f9c50fa15b07489b4ab93a1b5",
+ "sha256:36b2d700a27e168fa96272b42d28c7ac3ff72030c67b32f37c05616ebd22a202",
+ "sha256:37978254d9d00cda01acc1997513f786b6b971e57b778fbe7c20e30ae81a97f3",
+ "sha256:38289f1690a7e27aacd049e420769b996826f3728756859420eeee21cc857118",
+ "sha256:385ccf6d011b97768a640e9d4de25412204fbe8d6b9ae39ff115d4ff03f6fe5d",
+ "sha256:3c7ea86b9ca83e30fa4d4cd0eaf01db3ebcc7b2726a25990966627e39577d729",
+ "sha256:49810f907dfe6de8da5da7d2b238d343e6add62f01a15d03e2195afc180059ed",
+ "sha256:519c0b3a6fbb68afaa0febf0d28f6c4b0a1074aefc484802ecb9709faf181607",
+ "sha256:51f02ca184518702975b56affde6c573ebad4e411599005ce4468b1014b4786c",
+ "sha256:552a39987ac6655dad4bf6f17dd2b55c7b0c6e949d933b8846d2e312ee80005a",
+ "sha256:596f5ae2eeddb79b595583c2e0285312b2783b0ec759930c272dbf02f851ff75",
+ "sha256:6014038f52b4b2ac1fa41a58d439a8a00f015b5c0735a0cd4b09afe344c94899",
+ "sha256:61ebbcd208d78658b09e19c78920f1ad38936a0aa0f9c459c46c197d11c580a0",
+ "sha256:6213713ac743b190ecbf3f316d6e41d099e774812d470422b3a0f137ea635832",
+ "sha256:637e27ea1ebe4a561db75a880ac659ff439dec7f55588212e71700bb1ddd5af9",
+ "sha256:6aa427c55a0abec450bca10b64446331b5ca8f79b648531138f357569705bc4a",
+ "sha256:6ca45359d7a21644793de0e29de497ef7f1ae7268e346c4faf87b421fea364e6",
+ "sha256:6db1b52c6f2c04fafc8da17ea506608e6be7086715dab498570c3e55e4f8fbd1",
+ "sha256:752e7ddfb743344d447367baa85bccd3629c2c3940f70506eb5f01abce98ee68",
+ "sha256:760c54ad1b8a9b81951030a7e8e7c3ec0964c1cb9fee585a03ff53d9e531bb8e",
+ "sha256:768632fd8172ae03852e3245f11c8a425d95f65ff444ce46b3e673ae5b057b74",
+ "sha256:7a0b9f6a1a15d494b35f25ed07abda03209fa76c33564c09c9e81d34f4b919d7",
+ "sha256:7e070d3aef50ac3856f2ef5ec7214798453da878bb5e5a16c16a61edf1817cc3",
+ "sha256:7e12949e5071c20ec49ef00c75121ed2b076972132fc1913ddf5f76cae8d10b4",
+ "sha256:7e26eac9e52e8ce86f915fd33380f1b6896a2b51994e40bb094841e5003429b4",
+ "sha256:85ffd6b1cb0dfb037ede50ff3bef80d9bf7fa60515d192403af6745524524f3b",
+ "sha256:8618d9213a863c468a865e9d2ec50221015f7abf52221bc927152ef26c484b4c",
+ "sha256:8acef4d8a4353f6678fd1035422a937c2170de58a2b29f7da045d5249e934101",
+ "sha256:8d2f355a951f60f0843f2368b39970e4667517e54e86b1508e76f92b44811a8a",
+ "sha256:90b6840b6448203228a9d8464a7a0d99aa8fa9f027ef95fe230579abaf8a6ee1",
+ "sha256:9187500d83fd0cef4669385cbb0961e227a41c0c9bc39219044e35810793edf7",
+ "sha256:93c20777a72cae8620203ac11c4010365706062aa13aaedd1a21bb07adbb9d5d",
+ "sha256:93cce7d422a0093cfb3606beae38a8e47a25232eea0f292c878af580a9dc7605",
+ "sha256:94c623c331a48a5ccc7d25271399aff29729fa202c737ae3b4b28b89d2b0976d",
+ "sha256:97f32dc03a8054a4c4a5ab5d761ed4861e828b2c200febd4e46857069a483916",
+ "sha256:9a2bf98ac92f58777c0fafc772bf0493e67fcf677302e0c0a630ee517a43b949",
+ "sha256:a602bdc8607c99eb5b391592d58c92618dcd1537fdd87df1813f03fed49957a6",
+ "sha256:a9d24b03daf7415f78abc2d25a208f234e2c585e5e6f92f0204d2ab7b9ab48e3",
+ "sha256:abfcb0ef78df0ee9df4ea81f03beea41849340ce33a4c4bd4dbb99e23ec781b6",
+ "sha256:b013f759cd69cb0a62de954d6d2096d648bc210034b79b1881406b07ed0a83f9",
+ "sha256:b02e3e72665cd02afafb933453b0c9f6c59ff6e3708bd28d0d8580450e7e88af",
+ "sha256:b52cc45e71657bc4743a5606d9023459de929b2a198d545868e11898ba1c3f59",
+ "sha256:ba37f11e1d020969e8a779c06b4af866ffb6b854d7229db63c5fdddfceaa917f",
+ "sha256:bb804c7d0bfbd7e3f33924ff49757de9106c44e27979e2492819c16972ec0da2",
+ "sha256:bf594cc7cc9d528338d66674c10a5b25e3cde7dd75c3e96784df8f371d77a298",
+ "sha256:c38baee6bdb7fe1b110b6b3aaa555e6e872d322206b7245aa39572d3fc991ee4",
+ "sha256:c73d2166e4b210b73d1429c4f1ca97cea9cc090e5302df2a7a0a96ce55373f1c",
+ "sha256:c9099bf89078675c372339011ccfc9ec310310bf6c292b413c013eb90ffdcafc",
+ "sha256:cf0db26a1f76aa6b3aa314a74b8facd586b7a5457d05b64f8082a62c9c49582a",
+ "sha256:d19a34f8a3429bd536996ad53597b805c10352a8561d8382e05830df389d2b43",
+ "sha256:da80047524eac2acf7c04c18ac7a7da05a9136241f642dd2ed94269ef0d0a45a",
+ "sha256:de2923886b5d3214be951bc2ce3f6b8ac0d6dfd4a0d0e2a4d2e5523d8046fdfb",
+ "sha256:defa0652696ff0ba48c8aff5a1fac1eef1ca6ac9c660b047fc8e7623c4eb5093",
+ "sha256:e54a1eb9fd38f2779e973d2f8958fd575b532fe26013405d1afb9ee2374e7ab8",
+ "sha256:e5c31d70a478b0ca22a9d2d76d520ae996214019d39ed7dd93af872c7f301e52",
+ "sha256:ebaeb93f90c0903233b11ce913a7cb8f6ee069158406e056f884854c737d2442",
+ "sha256:ecfe51abf7f045e0b9cdde71ca9e153d11238679ef7b5da6c82093874adf3338",
+ "sha256:f99112aed4fb7cee00c7f77e8b964a9b10f69488cdff626ffd797d02e2e4484f",
+ "sha256:fd914db437ec25bfa410f8aa0aa2f3ba87cdfc04d9919d608d02330947afaeab"
+ ],
+ "version": "==2022.1.18"
+ },
"requests": {
- "extras": [],
"hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
],
- "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
+ "index": "pypi",
"version": "==2.27.1"
},
+ "six": {
+ "hashes": [
+ "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
+ "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
+ ],
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+ "version": "==1.16.0"
+ },
"snscrape": {
"hashes": [
"sha256:af30d12872da692ff9ccaf5651962edceb1fd4a28cf7cc92c8c898902f009ce3",
@@ -282,12 +406,28 @@
"index": "pypi",
"version": "==1.4.31"
},
+ "tzdata": {
+ "hashes": [
+ "sha256:3eee491e22ebfe1e5cfcc97a4137cd70f092ce59144d81f8924a844de05ba8f5",
+ "sha256:68dbe41afd01b867894bbdfd54fa03f468cfa4f0086bfb4adcd8de8f24f3ee21"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==2021.5"
+ },
+ "tzlocal": {
+ "hashes": [
+ "sha256:0f28015ac68a5c067210400a9197fc5d36ba9bc3f8eaf1da3cbd59acdfed9e09",
+ "sha256:28ba8d9fcb6c9a782d6e0078b4f6627af1ea26aeaa32b4eab5324abc7df4149f"
+ ],
+ "markers": "python_version >= '3.6'",
+ "version": "==4.1"
+ },
"urllib3": {
"hashes": [
"sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed",
"sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"
],
- "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
+ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'",
"version": "==1.26.8"
}
},
diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py
new file mode 100644
index 0000000..7eecb2b
--- /dev/null
+++ b/cisticola/scraper/bitchute.py
@@ -0,0 +1,445 @@
+from datetime import datetime
+import time
+import re
+from html.parser import HTMLParser
+import dateparser
+import json
+from typing import List
+
+import requests
+from bs4 import BeautifulSoup
+
+import cisticola.base
+
+class BitchuteScraper(cisticola.scraper.Scraper):
+    """An implementation of a Scraper for Bitchute, using classes from the 4cat
+    library"""
+    __version__ = "BitchuteScraper 0.0.1"
+
+    @staticmethod
+    def get_username_from_url(url):
+        """Return the channel name taken from a Bitchute channel URL.
+
+        Everything after the last ``bitchute.com/channel/`` marker is kept and
+        surrounding slashes are removed. If the marker is absent, the whole
+        URL (minus leading/trailing slashes) is returned, so the result is
+        never ``None``.
+
+        :param str url: Channel URL
+        :return str: Channel name
+        """
+        username = url.split('bitchute.com/channel/')[-1].strip('/')
+
+        return username
+
+    def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]:
+        """Collect the videos of ``channel`` as ``ScraperResult`` objects.
+
+        Iteration stops at the first video whose timestamp is not newer than
+        ``since.date_archived``.
+        NOTE(review): this presumes ``get_videos_user`` yields newest videos
+        first - confirm against that helper.
+
+        :param channel: Channel to scrape; its ``url`` and ``id`` are used
+        :param since: Previously archived result used as cut-off, or ``None``
+            to collect everything the helper yields
+        :return: List of results, in the order the helper yielded them
+        """
+
+        session = requests.Session()
+        session.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"
+        # the search page carries the CSRF token that is handed to the helper
+        request = session.get("https://www.bitchute.com/search")
+        csrftoken = BeautifulSoup(request.text, 'html.parser').findAll(
+            "input", {"name": "csrfmiddlewaretoken"})[0].get("value")
+        time.sleep(0.25)
+
+        # Don't scrape comment information
+        #TODO implement framework for processing and storing comments
+        detail = 'basic'
+
+        posts = []
+        username = BitchuteScraper.get_username_from_url(channel.url)
+        # get_videos_user is not defined in this class; it is looked up as a
+        # module-level name at call time
+        scraper = get_videos_user(session, username, csrftoken, detail)
+
+        for i, post in enumerate(scraper):
+
+            if since is not None and post['timestamp'] <= since.date_archived.timestamp():
+                print( f'\n\nBREAK ON VIDEO: {i}\n\n')
+                break
+
+            posts.append(cisticola.base.ScraperResult(
+                scraper=self.__version__,
+                platform="Bitchute",
+                channel=channel.id,
+                platform_id=post['id'],
+                date=datetime.fromtimestamp(post['timestamp']),
+                date_archived=datetime.now(),
+                raw_data=json.dumps(post)))
+
+        return posts
+
+    def can_handle(self, channel):
+        """Return ``True`` if ``channel`` is a Bitchute channel, else ``False``.
+
+        :param channel: Channel to check; ``platform`` and ``url`` are read
+        :return bool: Whether this scraper can process the channel
+        """
+        # always return a real bool instead of falling through to None
+        return (channel.platform == "Bitchute"
+                and BitchuteScraper.get_username_from_url(channel.url) is not None)
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
+def strip_tags(html, convert_newlines=True):
+    """
+    Strip HTML from a string
+
+    :param html: HTML to strip; a falsy value (None, "") yields ""
+    :param convert_newlines: Convert <br> and </p> tags to newlines before
+        stripping, collapsing runs of newlines into one
+    :return: Stripped HTML
+    """
+    if not html:
+        return ""
+
+    if convert_newlines:
+        # turn line-breaking tags into real newlines so the text structure
+        # survives once the markup is removed
+        html = html.replace("<br>", "\n").replace("</p>", "\n")
+        html = re.sub(r"\n+", "\n", html)
+
+    class HTMLStripper(HTMLParser):
+        """Parser that keeps only the text nodes it is fed."""
+
+        def __init__(self):
+            super().__init__(convert_charrefs=True)
+            self.fed = []
+
+        def handle_data(self, data):
+            self.fed.append(data)
+
+        def get_data(self):
+            return "".join(self.fed)
+
+    stripper = HTMLStripper()
+    stripper.feed(html)
+    # close() flushes text the parser is still buffering; without it a string
+    # ending in an unterminated "&..." sequence (e.g. "AT&T") lost its tail
+    stripper.close()
+    return stripper.get_data()
+
+#-----------------------------------------------------------------------------#
+
+def request_from_bitchute(session, method, url, headers=None, data=None):
+    """
+    Request something via the BitChute API (or non-API)
+
+    To avoid having to write the same error-checking everywhere, this takes
+    care of retrying on failure, et cetera
+
+    :param session: Requests session
+    :param str method: GET or POST (case-insensitive)
+    :param str url: URL to fetch
+    :param dict headers: Headers to pass with the request
+    :param dict data: Form data (POST) or query parameters (GET)
+
+    :return: Decoded JSON body of the response
+    :raises NotImplementedError: If ``method`` is neither GET nor POST
+    :raises RuntimeError: If no usable response was obtained after 3 attempts
+    """
+    verb = method.lower()
+    if verb not in ("get", "post"):
+        # NotImplemented is a constant, not an exception class; calling it
+        # raised a confusing TypeError instead of the intended error
+        raise NotImplementedError("Unsupported HTTP method: %s" % method)
+
+    retries = 0
+    while retries < 3:
+        try:
+            if verb == "post":
+                request = session.post(url, headers=headers, data=data)
+            else:
+                request = session.get(url, headers=headers, params=data)
+
+            if request.status_code >= 300:
+                raise ValueError("Response %i from BitChute for URL %s, need to retry" % (request.status_code, url))
+
+            return request.json()
+
+        except (ConnectionResetError, requests.RequestException, ValueError):
+            # json.JSONDecodeError is a ValueError subclass, so an undecodable
+            # body is retried here too; back off a little longer each time
+            retries += 1
+            time.sleep(retries * 2)
+
+    raise RuntimeError("No valid response from BitChute for URL %s after %i attempts" % (url, retries))
+
+#-----------------------------------------------------------------------------#
+
+def append_details(video, detail):
+ """
+ Append extra metadata to video data
+
+ Fetches the BitChute video detail page to scrape extra data for the given video.
+
+ :param dict video: Video details as scraped so far
+ :param str detail: Detail level. If 'comments', also scrape video comments.
+
+ :return dict: Tuple, first item: updated video data, second: list of comments
+ """
+ comments = []
+
+ video = {
+ **video,
+ "likes": "",
+ "dislikes": "",
+ "channel_subscribers": "",
+ "comments": "",
+ "hashtags": "",
+ "parent_id": "",
+ "video_url": ""
+ }
+
+ try:
+ # to get more details per video, we need to request the actual video detail page
+ # start a new session, to not interfere with the CSRF token from the search session
+ video_session = requests.session()
+ video_page = video_session.get(video["url"])
+
+ if "Video Restricted
" in video_page.text or \
+ "Video Blocked
" in video_page.text or \
+ "Channel Blocked
" in video_page.text or \
+ "Channel Restricted
" in video_page.text:
+ if "This video is unavailable as the contents have been deemed potentially illegal" in video_page.text:
+ video["category"] = "moderated-illegal"
+ return (video, [])
+
+ elif "Viewing of this video is restricted, as it has been marked as Not Safe For Life" in video_page.text:
+ video["category"] = "moderated-nsfl"
+ return (video, [])
+
+ elif "Contains Incitement to Hatred" in video_page.text:
+ video["category"] = "moderated-incitement"
+ return (video, [])
+
+ elif "Platform Misuse" in video_page.text:
+ video["category"] = "moderated-misuse"
+ return (video, [])
+
+ elif "Terrorism & Violent Extremism" in video_page.text:
+ video["category"] = "moderated-terrorism-extremism"
+ return (video, [])
+
+ elif "Copyright" in video_page.text:
+ video["category"] = "moderated-copyright"
+ return (video, [])
+
+ else:
+ video["category"] = "moderated-other"
+ return (video, [])
+
+ elif "