diff --git a/Pipfile b/Pipfile index 2e6227d..a1312b1 100644 --- a/Pipfile +++ b/Pipfile @@ -8,6 +8,9 @@ sqlalchemy = "*" snscrape = "*" loguru = "*" gogettr = "*" +requests = "*" +bs4 = "*" +dateparser = "*" [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index 2743f87..f0b8511 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "e335358892de4b581de211099e214f370f8cfd1f86b2cd2b3f0ea6d2d43313bb" + "sha256": "ca7eea4b95394e06f8b74eac90d376097fd01231010b594cdcc588a3440f1231" }, "pipfile-spec": 6, "requires": { @@ -24,6 +24,13 @@ "markers": "python_version >= '3.1'", "version": "==4.10.0" }, + "bs4": { + "hashes": [ + "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a" + ], + "index": "pypi", + "version": "==0.0.1" + }, "certifi": { "hashes": [ "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872", @@ -47,6 +54,14 @@ "markers": "python_version >= '3.6'", "version": "==8.0.3" }, + "dateparser": { + "hashes": [ + "sha256:faa2b97f51f3b5ff1ba2f17be90de2b733fb6191f89b4058787473e8202f3044", + "sha256:fec344db1f73d005182e214c0ff27313c748bbe0c1638ce9d48a809ddfdab2a0" + ], + "index": "pypi", + "version": "==1.1.0" + }, "filelock": { "hashes": [ "sha256:9cd540a9352e432c7246a48fe4e8712b10acb1df2ad1f30e8c070b82ae1fed85", @@ -215,15 +230,124 @@ ], "version": "==1.7.1" }, + "python-dateutil": { + "hashes": [ + "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86", + "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==2.8.2" + }, + "pytz": { + "hashes": [ + "sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c", + "sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326" + ], + "version": "==2021.3" + }, + "pytz-deprecation-shim": { + "hashes": [ + "sha256:8314c9692a636c8eb3bda879b9f119e350e93223ae83e70e80c31675a0fdc1a6", + "sha256:af097bae1b616dde5c5744441e2ddc69e74dfdcb0c263129610d85b87445a59d" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", + "version": "==0.1.0.post0" + }, + "regex": { + "hashes": [ + "sha256:04611cc0f627fc4a50bc4a9a2e6178a974c6a6a4aa9c1cca921635d2c47b9c87", + "sha256:0b5d6f9aed3153487252d00a18e53f19b7f52a1651bc1d0c4b5844bc286dfa52", + "sha256:0d2f5c3f7057530afd7b739ed42eb04f1011203bc5e4663e1e1d01bb50f813e3", + "sha256:11772be1eb1748e0e197a40ffb82fb8fd0d6914cd147d841d9703e2bef24d288", + "sha256:1333b3ce73269f986b1fa4d5d395643810074dc2de5b9d262eb258daf37dc98f", + "sha256:16f81025bb3556eccb0681d7946e2b35ff254f9f888cff7d2120e8826330315c", + "sha256:1a171eaac36a08964d023eeff740b18a415f79aeb212169080c170ec42dd5184", + "sha256:1d6301f5288e9bdca65fab3de6b7de17362c5016d6bf8ee4ba4cbe833b2eda0f", + "sha256:1e031899cb2bc92c0cf4d45389eff5b078d1936860a1be3aa8c94fa25fb46ed8", + "sha256:1f8c0ae0a0de4e19fddaaff036f508db175f6f03db318c80bbc239a1def62d02", + "sha256:2245441445099411b528379dee83e56eadf449db924648e5feb9b747473f42e3", + "sha256:22709d701e7037e64dae2a04855021b62efd64a66c3ceed99dfd684bfef09e38", + "sha256:24c89346734a4e4d60ecf9b27cac4c1fee3431a413f7aa00be7c4d7bbacc2c4d", + "sha256:25716aa70a0d153cd844fe861d4f3315a6ccafce22b39d8aadbf7fcadff2b633", + "sha256:2dacb3dae6b8cc579637a7b72f008bff50a94cde5e36e432352f4ca57b9e54c4", + "sha256:34316bf693b1d2d29c087ee7e4bb10cdfa39da5f9c50fa15b07489b4ab93a1b5", + "sha256:36b2d700a27e168fa96272b42d28c7ac3ff72030c67b32f37c05616ebd22a202", + "sha256:37978254d9d00cda01acc1997513f786b6b971e57b778fbe7c20e30ae81a97f3", + "sha256:38289f1690a7e27aacd049e420769b996826f3728756859420eeee21cc857118", + "sha256:385ccf6d011b97768a640e9d4de25412204fbe8d6b9ae39ff115d4ff03f6fe5d", + "sha256:3c7ea86b9ca83e30fa4d4cd0eaf01db3ebcc7b2726a25990966627e39577d729", + "sha256:49810f907dfe6de8da5da7d2b238d343e6add62f01a15d03e2195afc180059ed", + "sha256:519c0b3a6fbb68afaa0febf0d28f6c4b0a1074aefc484802ecb9709faf181607", + "sha256:51f02ca184518702975b56affde6c573ebad4e411599005ce4468b1014b4786c", + "sha256:552a39987ac6655dad4bf6f17dd2b55c7b0c6e949d933b8846d2e312ee80005a", + "sha256:596f5ae2eeddb79b595583c2e0285312b2783b0ec759930c272dbf02f851ff75", + "sha256:6014038f52b4b2ac1fa41a58d439a8a00f015b5c0735a0cd4b09afe344c94899", + "sha256:61ebbcd208d78658b09e19c78920f1ad38936a0aa0f9c459c46c197d11c580a0", + "sha256:6213713ac743b190ecbf3f316d6e41d099e774812d470422b3a0f137ea635832", + "sha256:637e27ea1ebe4a561db75a880ac659ff439dec7f55588212e71700bb1ddd5af9", + "sha256:6aa427c55a0abec450bca10b64446331b5ca8f79b648531138f357569705bc4a", + "sha256:6ca45359d7a21644793de0e29de497ef7f1ae7268e346c4faf87b421fea364e6", + "sha256:6db1b52c6f2c04fafc8da17ea506608e6be7086715dab498570c3e55e4f8fbd1", + "sha256:752e7ddfb743344d447367baa85bccd3629c2c3940f70506eb5f01abce98ee68", + "sha256:760c54ad1b8a9b81951030a7e8e7c3ec0964c1cb9fee585a03ff53d9e531bb8e", + "sha256:768632fd8172ae03852e3245f11c8a425d95f65ff444ce46b3e673ae5b057b74", + "sha256:7a0b9f6a1a15d494b35f25ed07abda03209fa76c33564c09c9e81d34f4b919d7", + "sha256:7e070d3aef50ac3856f2ef5ec7214798453da878bb5e5a16c16a61edf1817cc3", + "sha256:7e12949e5071c20ec49ef00c75121ed2b076972132fc1913ddf5f76cae8d10b4", + "sha256:7e26eac9e52e8ce86f915fd33380f1b6896a2b51994e40bb094841e5003429b4", + "sha256:85ffd6b1cb0dfb037ede50ff3bef80d9bf7fa60515d192403af6745524524f3b", + "sha256:8618d9213a863c468a865e9d2ec50221015f7abf52221bc927152ef26c484b4c", + "sha256:8acef4d8a4353f6678fd1035422a937c2170de58a2b29f7da045d5249e934101", + "sha256:8d2f355a951f60f0843f2368b39970e4667517e54e86b1508e76f92b44811a8a", + "sha256:90b6840b6448203228a9d8464a7a0d99aa8fa9f027ef95fe230579abaf8a6ee1", + "sha256:9187500d83fd0cef4669385cbb0961e227a41c0c9bc39219044e35810793edf7", + "sha256:93c20777a72cae8620203ac11c4010365706062aa13aaedd1a21bb07adbb9d5d", + "sha256:93cce7d422a0093cfb3606beae38a8e47a25232eea0f292c878af580a9dc7605", + "sha256:94c623c331a48a5ccc7d25271399aff29729fa202c737ae3b4b28b89d2b0976d", + "sha256:97f32dc03a8054a4c4a5ab5d761ed4861e828b2c200febd4e46857069a483916", + "sha256:9a2bf98ac92f58777c0fafc772bf0493e67fcf677302e0c0a630ee517a43b949", + "sha256:a602bdc8607c99eb5b391592d58c92618dcd1537fdd87df1813f03fed49957a6", + "sha256:a9d24b03daf7415f78abc2d25a208f234e2c585e5e6f92f0204d2ab7b9ab48e3", + "sha256:abfcb0ef78df0ee9df4ea81f03beea41849340ce33a4c4bd4dbb99e23ec781b6", + "sha256:b013f759cd69cb0a62de954d6d2096d648bc210034b79b1881406b07ed0a83f9", + "sha256:b02e3e72665cd02afafb933453b0c9f6c59ff6e3708bd28d0d8580450e7e88af", + "sha256:b52cc45e71657bc4743a5606d9023459de929b2a198d545868e11898ba1c3f59", + "sha256:ba37f11e1d020969e8a779c06b4af866ffb6b854d7229db63c5fdddfceaa917f", + "sha256:bb804c7d0bfbd7e3f33924ff49757de9106c44e27979e2492819c16972ec0da2", + "sha256:bf594cc7cc9d528338d66674c10a5b25e3cde7dd75c3e96784df8f371d77a298", + "sha256:c38baee6bdb7fe1b110b6b3aaa555e6e872d322206b7245aa39572d3fc991ee4", + "sha256:c73d2166e4b210b73d1429c4f1ca97cea9cc090e5302df2a7a0a96ce55373f1c", + "sha256:c9099bf89078675c372339011ccfc9ec310310bf6c292b413c013eb90ffdcafc", + "sha256:cf0db26a1f76aa6b3aa314a74b8facd586b7a5457d05b64f8082a62c9c49582a", + "sha256:d19a34f8a3429bd536996ad53597b805c10352a8561d8382e05830df389d2b43", + "sha256:da80047524eac2acf7c04c18ac7a7da05a9136241f642dd2ed94269ef0d0a45a", + "sha256:de2923886b5d3214be951bc2ce3f6b8ac0d6dfd4a0d0e2a4d2e5523d8046fdfb", + "sha256:defa0652696ff0ba48c8aff5a1fac1eef1ca6ac9c660b047fc8e7623c4eb5093", + "sha256:e54a1eb9fd38f2779e973d2f8958fd575b532fe26013405d1afb9ee2374e7ab8", + "sha256:e5c31d70a478b0ca22a9d2d76d520ae996214019d39ed7dd93af872c7f301e52", + "sha256:ebaeb93f90c0903233b11ce913a7cb8f6ee069158406e056f884854c737d2442", + "sha256:ecfe51abf7f045e0b9cdde71ca9e153d11238679ef7b5da6c82093874adf3338", + "sha256:f99112aed4fb7cee00c7f77e8b964a9b10f69488cdff626ffd797d02e2e4484f", + "sha256:fd914db437ec25bfa410f8aa0aa2f3ba87cdfc04d9919d608d02330947afaeab" + ], + "version": "==2022.1.18" + }, "requests": { - "extras": [], "hashes": [ "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61", "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", + "index": "pypi", "version": "==2.27.1" }, + "six": { + "hashes": [ + "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", + "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==1.16.0" + }, "snscrape": { "hashes": [ "sha256:af30d12872da692ff9ccaf5651962edceb1fd4a28cf7cc92c8c898902f009ce3", @@ -282,12 +406,28 @@ "index": "pypi", "version": "==1.4.31" }, + "tzdata": { + "hashes": [ + "sha256:3eee491e22ebfe1e5cfcc97a4137cd70f092ce59144d81f8924a844de05ba8f5", + "sha256:68dbe41afd01b867894bbdfd54fa03f468cfa4f0086bfb4adcd8de8f24f3ee21" + ], + "markers": "python_version >= '3.6'", + "version": "==2021.5" + }, + "tzlocal": { + "hashes": [ + "sha256:0f28015ac68a5c067210400a9197fc5d36ba9bc3f8eaf1da3cbd59acdfed9e09", + "sha256:28ba8d9fcb6c9a782d6e0078b4f6627af1ea26aeaa32b4eab5324abc7df4149f" + ], + "markers": "python_version >= '3.6'", + "version": "==4.1" + }, "urllib3": { "hashes": [ "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed", "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c" ], - "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'", "version": "==1.26.8" } }, diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py new file mode 100644 index 0000000..7eecb2b --- /dev/null +++ b/cisticola/scraper/bitchute.py @@ -0,0 +1,445 @@ +from datetime import datetime +import time +import re +from html.parser import HTMLParser +import dateparser +import json +from typing import List + +import requests +from bs4 import BeautifulSoup + +import cisticola.base + +class BitchuteScraper(cisticola.scraper.Scraper): + """An implementation of a Scraper for Bitchute, using classes from the 4cat + library""" + __version__ = "BitchuteScraper 0.0.1" + + # TODO snscrape should be able to scrape from user ID alone, but there is + # currently a bug/other issue, so it is extracting the username from URL + def get_username_from_url(url): + username = url.split('bitchute.com/channel/')[-1].strip('/') + + return username + + def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]: + + session = requests.Session() + session.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0" + request = session.get("https://www.bitchute.com/search") + csrftoken = BeautifulSoup(request.text, 'html.parser').findAll( + "input", {"name": "csrfmiddlewaretoken"})[0].get("value") + time.sleep(0.25) + + # Don't scrape comment information + #TODO implement framework for processing and storing comments + detail = 'basic' + + posts = [] + username = BitchuteScraper.get_username_from_url(channel.url) + scraper = get_videos_user(session, username, csrftoken, detail) + + for i, post in enumerate(scraper): + + if since is not None and post['timestamp'] <= since.date_archived.timestamp(): + print( f'\n\nBREAK ON VIDEO: {i}\n\n') + break + + posts.append(cisticola.base.ScraperResult( + scraper=self.__version__, + platform="Bitchute", + channel=channel.id, + platform_id=post['id'], + date=datetime.fromtimestamp(post['timestamp']), + date_archived=datetime.now(), + raw_data=json.dumps(post))) + + return posts + + def can_handle(self, channel): + if channel.platform == "Bitchute" and BitchuteScraper.get_username_from_url(channel.url) is not None: + return True + +#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# + +def strip_tags(html, convert_newlines=True): + """ + Strip HTML from a string + + :param html: HTML to strip + :param convert_newlines: Convert
and

tags to \n before stripping + :return: Stripped HTML + """ + if not html: + return "" + + deduplicate_newlines = re.compile(r"\n+") + + if convert_newlines: + html = html.replace("
", "\n").replace("

", "

\n") + html = deduplicate_newlines.sub("\n", html) + + class HTMLStripper(HTMLParser): + def __init__(self): + super().__init__() + self.reset() + self.strict = False + self.convert_charrefs = True + self.fed = [] + + def handle_data(self, data): + self.fed.append(data) + + def get_data(self): + return "".join(self.fed) + + stripper = HTMLStripper() + stripper.feed(html) + return stripper.get_data() + +#-----------------------------------------------------------------------------# + +def request_from_bitchute(session, method, url, headers=None, data=None): + """ + Request something via the BitChute API (or non-API) + + To avoid having to write the same error-checking everywhere, this takes + care of retrying on failure, et cetera + + :param session: Requests session + :param str method: GET or POST + :param str url: URL to fetch + :param dict header: Headers to pass with the request + :param dict data: Data/params to send with the request + + :return: Requests response + """ + retries = 0 + response = None + while retries < 3: + try: + if method.lower() == "post": + request = session.post(url, headers=headers, data=data) + elif method.lower() == "get": + request = session.get(url, headers=headers, params=data) + else: + raise NotImplemented() + + if request.status_code >= 300: + raise ValueError("Response %i from BitChut for URL %s, need to retry" % (request.status_code, url)) + + response = request.json() + return response + + except (ConnectionResetError, requests.RequestException, ValueError) as e: + retries += 1 + time.sleep(retries * 2) + + except json.JSONDecodeError as e: + raise RuntimeError() + + if not response: + raise RuntimeError() + + return response + +#-----------------------------------------------------------------------------# + +def append_details(video, detail): + """ + Append extra metadata to video data + + Fetches the BitChute video detail page to scrape extra data for the given video. + + :param dict video: Video details as scraped so far + :param str detail: Detail level. If 'comments', also scrape video comments. + + :return dict: Tuple, first item: updated video data, second: list of comments + """ + comments = [] + + video = { + **video, + "likes": "", + "dislikes": "", + "channel_subscribers": "", + "comments": "", + "hashtags": "", + "parent_id": "", + "video_url": "" + } + + try: + # to get more details per video, we need to request the actual video detail page + # start a new session, to not interfere with the CSRF token from the search session + video_session = requests.session() + video_page = video_session.get(video["url"]) + + if "

Video Restricted

" in video_page.text or \ + "

Video Blocked

" in video_page.text or \ + "

Channel Blocked

" in video_page.text or \ + "

Channel Restricted

" in video_page.text: + if "This video is unavailable as the contents have been deemed potentially illegal" in video_page.text: + video["category"] = "moderated-illegal" + return (video, []) + + elif "Viewing of this video is restricted, as it has been marked as Not Safe For Life" in video_page.text: + video["category"] = "moderated-nsfl" + return (video, []) + + elif "Contains Incitement to Hatred" in video_page.text: + video["category"] = "moderated-incitement" + return (video, []) + + elif "Platform Misuse" in video_page.text: + video["category"] = "moderated-misuse" + return (video, []) + + elif "Terrorism & Violent Extremism" in video_page.text: + video["category"] = "moderated-terrorism-extremism" + return (video, []) + + elif "Copyright" in video_page.text: + video["category"] = "moderated-copyright" + return (video, []) + + else: + video["category"] = "moderated-other" + return (video, []) + + elif "