implemented Bitchute scraper

2026-06-08 03:18:34 +03:00 · 2022-02-18 12:45:10 -06:00
parent 4668d4df11
commit 139459e3b2
6 changed files with 614 additions and 19 deletions
--- a/3
+++ b/3
@@ -8,6 +8,9 @@ sqlalchemy = "*"
 snscrape = "*"
 loguru = "*"
 gogettr = "*"
+requests = "*"
+bs4 = "*"
+dateparser = "*"

 [dev-packages]

--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
 {
    "_meta": {
        "hash": {
-            "sha256": "e335358892de4b581de211099e214f370f8cfd1f86b2cd2b3f0ea6d2d43313bb"
+            "sha256": "ca7eea4b95394e06f8b74eac90d376097fd01231010b594cdcc588a3440f1231"
        },
        "pipfile-spec": 6,
        "requires": {
@@ -24,6 +24,13 @@
            "markers": "python_version >= '3.1'",
            "version": "==4.10.0"
        },
+        "bs4": {
+            "hashes": [
+                "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"
+            ],
+            "index": "pypi",
+            "version": "==0.0.1"
+        },
        "certifi": {
            "hashes": [
                "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
@@ -47,6 +54,14 @@
            "markers": "python_version >= '3.6'",
            "version": "==8.0.3"
        },
+        "dateparser": {
+            "hashes": [
+                "sha256:faa2b97f51f3b5ff1ba2f17be90de2b733fb6191f89b4058787473e8202f3044",
+                "sha256:fec344db1f73d005182e214c0ff27313c748bbe0c1638ce9d48a809ddfdab2a0"
+            ],
+            "index": "pypi",
+            "version": "==1.1.0"
+        },
        "filelock": {
            "hashes": [
                "sha256:9cd540a9352e432c7246a48fe4e8712b10acb1df2ad1f30e8c070b82ae1fed85",
@@ -215,15 +230,124 @@
            ],
            "version": "==1.7.1"
        },
+        "python-dateutil": {
+            "hashes": [
+                "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86",
+                "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==2.8.2"
+        },
+        "pytz": {
+            "hashes": [
+                "sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c",
+                "sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326"
+            ],
+            "version": "==2021.3"
+        },
+        "pytz-deprecation-shim": {
+            "hashes": [
+                "sha256:8314c9692a636c8eb3bda879b9f119e350e93223ae83e70e80c31675a0fdc1a6",
+                "sha256:af097bae1b616dde5c5744441e2ddc69e74dfdcb0c263129610d85b87445a59d"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
+            "version": "==0.1.0.post0"
+        },
+        "regex": {
+            "hashes": [
+                "sha256:04611cc0f627fc4a50bc4a9a2e6178a974c6a6a4aa9c1cca921635d2c47b9c87",
+                "sha256:0b5d6f9aed3153487252d00a18e53f19b7f52a1651bc1d0c4b5844bc286dfa52",
+                "sha256:0d2f5c3f7057530afd7b739ed42eb04f1011203bc5e4663e1e1d01bb50f813e3",
+                "sha256:11772be1eb1748e0e197a40ffb82fb8fd0d6914cd147d841d9703e2bef24d288",
+                "sha256:1333b3ce73269f986b1fa4d5d395643810074dc2de5b9d262eb258daf37dc98f",
+                "sha256:16f81025bb3556eccb0681d7946e2b35ff254f9f888cff7d2120e8826330315c",
+                "sha256:1a171eaac36a08964d023eeff740b18a415f79aeb212169080c170ec42dd5184",
+                "sha256:1d6301f5288e9bdca65fab3de6b7de17362c5016d6bf8ee4ba4cbe833b2eda0f",
+                "sha256:1e031899cb2bc92c0cf4d45389eff5b078d1936860a1be3aa8c94fa25fb46ed8",
+                "sha256:1f8c0ae0a0de4e19fddaaff036f508db175f6f03db318c80bbc239a1def62d02",
+                "sha256:2245441445099411b528379dee83e56eadf449db924648e5feb9b747473f42e3",
+                "sha256:22709d701e7037e64dae2a04855021b62efd64a66c3ceed99dfd684bfef09e38",
+                "sha256:24c89346734a4e4d60ecf9b27cac4c1fee3431a413f7aa00be7c4d7bbacc2c4d",
+                "sha256:25716aa70a0d153cd844fe861d4f3315a6ccafce22b39d8aadbf7fcadff2b633",
+                "sha256:2dacb3dae6b8cc579637a7b72f008bff50a94cde5e36e432352f4ca57b9e54c4",
+                "sha256:34316bf693b1d2d29c087ee7e4bb10cdfa39da5f9c50fa15b07489b4ab93a1b5",
+                "sha256:36b2d700a27e168fa96272b42d28c7ac3ff72030c67b32f37c05616ebd22a202",
+                "sha256:37978254d9d00cda01acc1997513f786b6b971e57b778fbe7c20e30ae81a97f3",
+                "sha256:38289f1690a7e27aacd049e420769b996826f3728756859420eeee21cc857118",
+                "sha256:385ccf6d011b97768a640e9d4de25412204fbe8d6b9ae39ff115d4ff03f6fe5d",
+                "sha256:3c7ea86b9ca83e30fa4d4cd0eaf01db3ebcc7b2726a25990966627e39577d729",
+                "sha256:49810f907dfe6de8da5da7d2b238d343e6add62f01a15d03e2195afc180059ed",
+                "sha256:519c0b3a6fbb68afaa0febf0d28f6c4b0a1074aefc484802ecb9709faf181607",
+                "sha256:51f02ca184518702975b56affde6c573ebad4e411599005ce4468b1014b4786c",
+                "sha256:552a39987ac6655dad4bf6f17dd2b55c7b0c6e949d933b8846d2e312ee80005a",
+                "sha256:596f5ae2eeddb79b595583c2e0285312b2783b0ec759930c272dbf02f851ff75",
+                "sha256:6014038f52b4b2ac1fa41a58d439a8a00f015b5c0735a0cd4b09afe344c94899",
+                "sha256:61ebbcd208d78658b09e19c78920f1ad38936a0aa0f9c459c46c197d11c580a0",
+                "sha256:6213713ac743b190ecbf3f316d6e41d099e774812d470422b3a0f137ea635832",
+                "sha256:637e27ea1ebe4a561db75a880ac659ff439dec7f55588212e71700bb1ddd5af9",
+                "sha256:6aa427c55a0abec450bca10b64446331b5ca8f79b648531138f357569705bc4a",
+                "sha256:6ca45359d7a21644793de0e29de497ef7f1ae7268e346c4faf87b421fea364e6",
+                "sha256:6db1b52c6f2c04fafc8da17ea506608e6be7086715dab498570c3e55e4f8fbd1",
+                "sha256:752e7ddfb743344d447367baa85bccd3629c2c3940f70506eb5f01abce98ee68",
+                "sha256:760c54ad1b8a9b81951030a7e8e7c3ec0964c1cb9fee585a03ff53d9e531bb8e",
+                "sha256:768632fd8172ae03852e3245f11c8a425d95f65ff444ce46b3e673ae5b057b74",
+                "sha256:7a0b9f6a1a15d494b35f25ed07abda03209fa76c33564c09c9e81d34f4b919d7",
+                "sha256:7e070d3aef50ac3856f2ef5ec7214798453da878bb5e5a16c16a61edf1817cc3",
+                "sha256:7e12949e5071c20ec49ef00c75121ed2b076972132fc1913ddf5f76cae8d10b4",
+                "sha256:7e26eac9e52e8ce86f915fd33380f1b6896a2b51994e40bb094841e5003429b4",
+                "sha256:85ffd6b1cb0dfb037ede50ff3bef80d9bf7fa60515d192403af6745524524f3b",
+                "sha256:8618d9213a863c468a865e9d2ec50221015f7abf52221bc927152ef26c484b4c",
+                "sha256:8acef4d8a4353f6678fd1035422a937c2170de58a2b29f7da045d5249e934101",
+                "sha256:8d2f355a951f60f0843f2368b39970e4667517e54e86b1508e76f92b44811a8a",
+                "sha256:90b6840b6448203228a9d8464a7a0d99aa8fa9f027ef95fe230579abaf8a6ee1",
+                "sha256:9187500d83fd0cef4669385cbb0961e227a41c0c9bc39219044e35810793edf7",
+                "sha256:93c20777a72cae8620203ac11c4010365706062aa13aaedd1a21bb07adbb9d5d",
+                "sha256:93cce7d422a0093cfb3606beae38a8e47a25232eea0f292c878af580a9dc7605",
+                "sha256:94c623c331a48a5ccc7d25271399aff29729fa202c737ae3b4b28b89d2b0976d",
+                "sha256:97f32dc03a8054a4c4a5ab5d761ed4861e828b2c200febd4e46857069a483916",
+                "sha256:9a2bf98ac92f58777c0fafc772bf0493e67fcf677302e0c0a630ee517a43b949",
+                "sha256:a602bdc8607c99eb5b391592d58c92618dcd1537fdd87df1813f03fed49957a6",
+                "sha256:a9d24b03daf7415f78abc2d25a208f234e2c585e5e6f92f0204d2ab7b9ab48e3",
+                "sha256:abfcb0ef78df0ee9df4ea81f03beea41849340ce33a4c4bd4dbb99e23ec781b6",
+                "sha256:b013f759cd69cb0a62de954d6d2096d648bc210034b79b1881406b07ed0a83f9",
+                "sha256:b02e3e72665cd02afafb933453b0c9f6c59ff6e3708bd28d0d8580450e7e88af",
+                "sha256:b52cc45e71657bc4743a5606d9023459de929b2a198d545868e11898ba1c3f59",
+                "sha256:ba37f11e1d020969e8a779c06b4af866ffb6b854d7229db63c5fdddfceaa917f",
+                "sha256:bb804c7d0bfbd7e3f33924ff49757de9106c44e27979e2492819c16972ec0da2",
+                "sha256:bf594cc7cc9d528338d66674c10a5b25e3cde7dd75c3e96784df8f371d77a298",
+                "sha256:c38baee6bdb7fe1b110b6b3aaa555e6e872d322206b7245aa39572d3fc991ee4",
+                "sha256:c73d2166e4b210b73d1429c4f1ca97cea9cc090e5302df2a7a0a96ce55373f1c",
+                "sha256:c9099bf89078675c372339011ccfc9ec310310bf6c292b413c013eb90ffdcafc",
+                "sha256:cf0db26a1f76aa6b3aa314a74b8facd586b7a5457d05b64f8082a62c9c49582a",
+                "sha256:d19a34f8a3429bd536996ad53597b805c10352a8561d8382e05830df389d2b43",
+                "sha256:da80047524eac2acf7c04c18ac7a7da05a9136241f642dd2ed94269ef0d0a45a",
+                "sha256:de2923886b5d3214be951bc2ce3f6b8ac0d6dfd4a0d0e2a4d2e5523d8046fdfb",
+                "sha256:defa0652696ff0ba48c8aff5a1fac1eef1ca6ac9c660b047fc8e7623c4eb5093",
+                "sha256:e54a1eb9fd38f2779e973d2f8958fd575b532fe26013405d1afb9ee2374e7ab8",
+                "sha256:e5c31d70a478b0ca22a9d2d76d520ae996214019d39ed7dd93af872c7f301e52",
+                "sha256:ebaeb93f90c0903233b11ce913a7cb8f6ee069158406e056f884854c737d2442",
+                "sha256:ecfe51abf7f045e0b9cdde71ca9e153d11238679ef7b5da6c82093874adf3338",
+                "sha256:f99112aed4fb7cee00c7f77e8b964a9b10f69488cdff626ffd797d02e2e4484f",
+                "sha256:fd914db437ec25bfa410f8aa0aa2f3ba87cdfc04d9919d608d02330947afaeab"
+            ],
+            "version": "==2022.1.18"
+        },
        "requests": {
-            "extras": [],
            "hashes": [
                "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
                "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
+            "index": "pypi",
            "version": "==2.27.1"
        },
+        "six": {
+            "hashes": [
+                "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
+                "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==1.16.0"
+        },
        "snscrape": {
            "hashes": [
                "sha256:af30d12872da692ff9ccaf5651962edceb1fd4a28cf7cc92c8c898902f009ce3",
@@ -282,12 +406,28 @@
            "index": "pypi",
            "version": "==1.4.31"
        },
+        "tzdata": {
+            "hashes": [
+                "sha256:3eee491e22ebfe1e5cfcc97a4137cd70f092ce59144d81f8924a844de05ba8f5",
+                "sha256:68dbe41afd01b867894bbdfd54fa03f468cfa4f0086bfb4adcd8de8f24f3ee21"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==2021.5"
+        },
+        "tzlocal": {
+            "hashes": [
+                "sha256:0f28015ac68a5c067210400a9197fc5d36ba9bc3f8eaf1da3cbd59acdfed9e09",
+                "sha256:28ba8d9fcb6c9a782d6e0078b4f6627af1ea26aeaa32b4eab5324abc7df4149f"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==4.1"
+        },
        "urllib3": {
            "hashes": [
                "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed",
                "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"
            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0'",
            "version": "==1.26.8"
        }
    },
--- a/cisticola/scraper/bitchute.py
+++ b/cisticola/scraper/bitchute.py
@@ -0,0 +1,445 @@
+from datetime import datetime
+import time
+import re 
+from html.parser import HTMLParser
+import dateparser
+import json
+from typing import List
+
+import requests
+from bs4 import BeautifulSoup
+
+import cisticola.base
+
+class BitchuteScraper(cisticola.scraper.Scraper):
+    """An implementation of a Scraper for Bitchute, using classes from the 4cat
+    library"""
+    __version__ = "BitchuteScraper 0.0.1"
+
+    @staticmethod
+    def get_username_from_url(url):
+        """Return the part of `url` that follows 'bitchute.com/channel/',
+        without leading or trailing slashes."""
+        return url.split('bitchute.com/channel/')[-1].strip('/')
+
+    def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]:
+        """Scrape the videos of `channel` and stop at the first one whose
+        timestamp is not later than `since.date_archived`."""
+        session = requests.Session()
+        session.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"
+        request = session.get("https://www.bitchute.com/search")
+        csrftoken = BeautifulSoup(request.text, 'html.parser').findAll(
+            "input", {"name": "csrfmiddlewaretoken"})[0].get("value")
+        time.sleep(0.25)
+
+        # Don't scrape comment information
+        # TODO implement framework for processing and storing comments
+        detail = 'basic'
+
+        posts = []
+        username = BitchuteScraper.get_username_from_url(channel.url)
+        scraper = get_videos_user(session, username, csrftoken, detail)
+
+        for post in scraper:
+            if since is not None and post['timestamp'] <= since.date_archived.timestamp():
+                break
+
+            posts.append(cisticola.base.ScraperResult(
+                scraper=self.__version__,
+                platform="Bitchute",
+                channel=channel.id,
+                platform_id=post['id'],
+                date=datetime.fromtimestamp(post['timestamp']),
+                date_archived=datetime.now(),
+                raw_data=json.dumps(post)))
+
+        return posts
+
+    def can_handle(self, channel):
+        """Return True if `channel` is a Bitchute channel, False otherwise."""
+        if channel.platform != "Bitchute":
+            return False
+        return BitchuteScraper.get_username_from_url(channel.url) is not None
+
+#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
+
+def strip_tags(html, convert_newlines=True):
+    """
+    Strip HTML from a string
+
+    :param html: HTML to strip; falsy input yields an empty string
+    :param convert_newlines: Convert <br> and </p> tags to \n before stripping
+    :return: Text content of the input with all tags removed
+    """
+    if not html:
+        return ""
+
+    if convert_newlines:
+        html = html.replace("<br>", "\n").replace("</p>", "</p>\n")
+        html = re.sub(r"\n+", "\n", html)
+
+    class HTMLStripper(HTMLParser):
+        def __init__(self):
+            # HTMLParser.__init__ already calls reset()
+            super().__init__(convert_charrefs=True)
+            self.fed = []
+
+        def handle_data(self, data):
+            self.fed.append(data)
+
+        def get_data(self):
+            return "".join(self.fed)
+
+    stripper = HTMLStripper()
+    stripper.feed(html)
+    # feed() holds back trailing text that might still be an unfinished
+    # character reference (e.g. "Q&A"); close() flushes that remainder so
+    # it is not silently dropped
+    stripper.close()
+    return stripper.get_data()
+
+#-----------------------------------------------------------------------------#
+
+def request_from_bitchute(session, method, url, headers=None, data=None):
+    """
+    Request something via the BitChute API (or non-API)
+
+    To avoid having to write the same error-checking everywhere, this takes
+    care of retrying on failure, et cetera
+
+    :param session:  Requests session
+    :param str method: GET or POST (case-insensitive)
+    :param str url:  URL to fetch
+    :param dict headers:  Headers to pass with the request
+    :param dict data:  Form data (POST) or query parameters (GET)
+
+    :return:  Decoded JSON body of the response
+    :raises NotImplementedError:  If method is neither GET nor POST
+    :raises RuntimeError:  If all three attempts failed
+    """
+    method = method.lower()
+    if method not in ("get", "post"):
+        # NotImplemented is a constant, not an exception class; calling it is a TypeError
+        raise NotImplementedError("Unsupported HTTP method: %s" % method)
+
+    last_error = None
+    for attempt in range(1, 4):
+        try:
+            if method == "post":
+                request = session.post(url, headers=headers, data=data)
+            else:
+                request = session.get(url, headers=headers, params=data)
+
+            if request.status_code >= 300:
+                raise ValueError("Response %i from BitChute for URL %s, need to retry" % (request.status_code, url))
+
+            return request.json()
+
+        # json.JSONDecodeError is a ValueError, so an invalid JSON body is retried as well
+        except (ConnectionResetError, requests.RequestException, ValueError) as e:
+            last_error = e
+            if attempt < 3:
+                # back off a little longer after every failed attempt
+                time.sleep(attempt * 2)
+
+    raise RuntimeError("No valid response from %s after 3 attempts" % url) from last_error
+
+#-----------------------------------------------------------------------------#
+
+def append_details(video, detail):
+    """
+    Append extra metadata to video data
+
+    Fetches the BitChute video detail page to scrape extra data for the given video.
+
+    :param dict video:  Video details as scraped so far; needs "url" and "id"
+    :param str detail:  Detail level. If 'comments', also scrape video comments.
+    :return tuple:  First item: updated video data, second: list of comments.
+        Both items are None if a request to BitChute failed (RuntimeError).
+    """
+    comments = []
+
+    video = {
+        **video,
+        "likes": "",
+        "dislikes": "",
+        "channel_subscribers": "",
+        "comments": "",
+        "hashtags": "",
+        "parent_id": "",
+        "video_url": ""
+    }
+
+    try:
+        # to get more details per video, we need to request the actual video detail page
+        # start a new session, to not interfere with the CSRF token from the search session
+        video_session = requests.session()
+        video_page = video_session.get(video["url"])
+
+        if "<h1 class=\"page-title\">Video Restricted</h1>" in video_page.text or \
+                "<h1 class=\"page-title\">Video Blocked</h1>" in video_page.text or \
+                "<h1 class=\"page-title\">Channel Blocked</h1>" in video_page.text or \
+                "<h1 class=\"page-title\">Channel Restricted</h1>" in video_page.text:
+            if "This video is unavailable as the contents have been deemed potentially illegal" in video_page.text:
+                video["category"] = "moderated-illegal"
+                return (video, [])
+
+            elif "Viewing of this video is restricted, as it has been marked as Not Safe For Life" in video_page.text:
+                video["category"] = "moderated-nsfl"
+                return (video, [])
+
+            elif "Contains Incitement to Hatred" in video_page.text:
+                video["category"] = "moderated-incitement"
+                return (video, [])
+
+            elif "Platform Misuse" in video_page.text:
+                video["category"] = "moderated-misuse"
+                return (video, [])
+
+            elif "Terrorism &amp; Violent Extremism" in video_page.text:
+                video["category"] = "moderated-terrorism-extremism"
+                return (video, [])
+
+            elif "Copyright</h4>" in video_page.text:
+                video["category"] = "moderated-copyright"
+                return (video, [])
+
+            else:
+                video["category"] = "moderated-other"
+                return (video, [])
+
+        elif "<iframe class=\"rumble\"" in video_page.text:
+            # some videos are actually embeds from rumble?
+            # these are iframes, so at the moment we cannot simply extract
+            # their info from the page, so we skip them. In the future we
+            # could add an extra request to get the relevant info, but so
+            # far the only examples I've seen are actually 'video not found'
+            video = {
+                **video,
+                "category": "error-embed-from-rumble"
+            }
+            return (video, [])
+
+        elif video_page.status_code != 200:
+            video = {
+                **video,
+                "category": "error-%i" % video_page.status_code
+            }
+            return (video, [])
+
+        soup = BeautifulSoup(video_page.text, 'html.parser')
+        video_csfrtoken = soup.findAll("input", {"name": "csrfmiddlewaretoken"})[0].get("value")
+
+        video["video_url"] = soup.select_one("video#player source").get("src")
+        video["thumbnail_image"] = soup.select_one("video#player").get("poster")
+        video["subject"] = soup.select_one("h1#video-title").text
+        video["author"] = soup.select_one("div.channel-banner p.name a").text
+        video["author_id"] = soup.select_one("div.channel-banner p.name a").get("href").split("/")[2]
+        video["body"] = soup.select_one("div#video-description").encode_contents().decode("utf-8").strip()
+
+        # we need *two more requests* to get the comment count and like/dislike counts
+        # this seems to be because bitchute uses a third-party comment widget
+        video_session.headers = {'Referer': video["url"], 'Origin': video["url"]}
+        counts = request_from_bitchute(video_session, "POST", "https://www.bitchute.com/video/%s/counts/" % video["id"], data={"csrfmiddlewaretoken": video_csfrtoken})
+
+        if detail == "comments":
+            # if comments are also to be scraped, this is another request to make, which returns
+            # a convenient JSON response with all the comments to the video
+            # we need yet another token for this, which we can extract from a bit of inline
+            # javascript on the page
+            comment_script = None
+            for line in video_page.text.split("\n"):
+                if "initComments(" in line:
+                    comment_script = line.split("initComments(")[1]
+                    break
+
+            if not comment_script:
+                # no script to extract comments from, cannot load
+                comment_count = -1
+            else:
+                # make the request
+                comment_count = 0
+                url = comment_script.split("'")[1]
+                comment_csrf = comment_script.split("'")[3]
+                comments_data = request_from_bitchute(video_session, "POST", url + "/api/get_comments/", data={"cf_auth": comment_csrf, "commentCount": 0})
+
+                for comment in comments_data:
+                    comment_count += 1
+
+                    if comment.get("profile_picture_url", None):
+                        thumbnail_image = url + comment.get("profile_picture_url")
+                    else:
+                        thumbnail_image = ""
+
+                    comments.append({
+                        "id": comment["id"],
+                        "thread_id": video["id"],
+                        "subject": "",
+                        "body": comment["content"],
+                        "author": comment["fullname"],
+                        "author_id": comment["creator"],
+                        "timestamp": int(dateparser.parse(comment["created"]).timestamp()),
+                        "url": "",
+                        "views": "",
+                        "length": "",
+                        "hashtags": "",
+                        "thumbnail_image": thumbnail_image,
+                        "likes": comment["upvote_count"],
+                        "category": "comment",
+                        "dislikes": "",
+                        "channel_subscribers": "",
+                        "comments": "",
+                        "parent_id": comment.get("parent", "") if "parent" in comment else video["id"],
+                    })
+
+        else:
+            # if we don't need the full comments, we still need another request to get the *amount*
+            # of comments,
+            comment_count = request_from_bitchute(video_session, "POST",
+                "https://commentfreely.bitchute.com/api/get_comment_count/",
+                data={"csrfmiddlewaretoken": video_csfrtoken,
+                      "cf_thread": "bc_" + video["id"]})["commentCount"]
+
+    except RuntimeError as e:
+        # we wrap this in one big try-catch because doing it for each request separately is tedious
+        # hm... maybe this should be in a helper function
+        # request_from_bitchute() raises RuntimeError once its retries are used up; the
+        # error is not reported anywhere, callers only see the (None, None) result
+        return (None, None)
+
+    # again, no structured info available for the publication date, but at least we can extract the
+    # exact day it was uploaded
+    try:
+        published = dateparser.parse(
+            soup.find(class_="video-publish-date").text.split("published at")[1].strip()[:-1])
+    except AttributeError as e:
+        # publication date not on page?
+        published = None
+
+    # merge data
+    video = {
+        **video,
+        "category": re.findall(r'<td><a href="/category/([^/]+)/"', video_page.text)[0],
+        "likes": counts["like_count"],
+        "dislikes": counts["dislike_count"],
+        "channel_subscribers": counts["subscriber_count"],
+        "comments": comment_count,
+        "parent_id": "",
+        "hashtags": ",".join([tag.text for tag in soup.select("#video-hashtags li a")]),
+        "views": counts["view_count"]
+    }
+
+    if published:
+        video["timestamp"] = int(published.timestamp())
+
+    # may need to be increased? bitchute doesn't seem particularly strict
+    time.sleep(0.25)
+    return (video, comments)
+
+#-----------------------------------------------------------------------------#
+
+def get_videos_user(session, user, csrftoken, detail, max_items=100):
+    """
+    Scrape videos for given BitChute user
+
+    :param session:  HTTP Session to use
+    :param str user:  Username to scrape videos for
+    :param str csrftoken:  CSRF token to use for requests
+    :param str detail:  Detail level to scrape, basic/detail/comments
+    :param int max_items:  Maximum number of videos to yield
+
+    :return:  Video data dictionaries, as a generator
+    :raises ValueError:  If a page of the video listing cannot be fetched
+    """
+    num_items = 0
+    offset = 0
+
+    base_url = "https://www.bitchute.com/channel/%s/" % user
+    url = base_url + "extend/"
+
+    container = session.get(base_url)
+    container_soup = BeautifulSoup(container.text, 'html.parser')
+    headers = {'Referer': base_url, 'Origin': "https://www.bitchute.com/"}
+
+    # checking the limit here avoids requesting a page that would be discarded
+    while num_items < max_items:
+        post_data = {"csrfmiddlewaretoken": csrftoken, "name": "", "offset": str(offset)}
+
+        try:
+            request = session.post(url, data=post_data, headers=headers)
+            if request.status_code != 200:
+                raise ConnectionError("HTTP status %i" % request.status_code)
+            response = request.json()
+
+        except (json.JSONDecodeError, requests.RequestException, ConnectionError) as e:
+            raise ValueError("Could not fetch videos from %s at offset %i" % (url, offset)) from e
+
+        soup = BeautifulSoup(response["html"], 'html.parser')
+        videos = soup.select(".channel-videos-container")
+        if not videos:
+            # the listing returned no videos for this offset
+            break
+
+        for video_element in videos:
+            if num_items >= max_items:
+                break
+
+            num_items += 1
+            offset += 1
+            # stays empty unless append_details() returns comments for this video
+            comments = []
+
+            link = video_element.select_one(".channel-videos-title a")
+            author_link = container_soup.select_one(".details .name a")
+            video = {
+                "id": link["href"].split("/")[-2],
+                "thread_id": link["href"].split("/")[-2],
+                "subject": link.text,
+                "body": strip_tags(video_element.select_one(".channel-videos-text").text),
+                "author": author_link.text,
+                "author_id": author_link["href"].split("/")[2],
+                "timestamp": int(
+                    dateparser.parse(
+                        video_element.select_one(".channel-videos-details.text-right.hidden-xs").text).timestamp()),
+                "url": "https://www.bitchute.com" + link["href"],
+                "views": video_element.select_one(".video-views").text.strip(),
+                "length": video_element.select_one(".video-duration").text.strip(),
+                "thumbnail_image": video_element.select_one(".channel-videos-image img")["src"],
+            }
+
+            if detail != "basic":
+                video, comments = append_details(video, detail)
+                if not video:
+                    # unrecoverable error while scraping details
+                    return
+
+            yield video
+            # comments are yielded *after* their video so they follow it in the output
+            yield from comments
+#-----------------------------------------------------------------------------#
+
+def get_about(user):
+    """
+    Extract fields from channel's "About" tab
+
+    :param str user:  Channel identifier as used in the channel URL
+    :return dict:  Description, links, creation date, video count, owner, category, image
+    """
+    base_url = "https://www.bitchute.com/channel/%s/" % user
+    soup = BeautifulSoup(requests.get(base_url).content, 'html.parser')
+
+    about_soup = soup.find('div', {'id' : 'channel-about'})
+    info_list = about_soup.find('div', {'class' : 'channel-about-details'}).find_all('p')
+    description_soup = about_soup.find('div', {'id' : 'channel-description'})
+    owner_soup = soup.find('p', {'class' : 'owner'})
+    return {
+        'description' : description_soup.text,
+        'description_links' : [a['href'] for a in description_soup.find_all('a', href = True)],
+        # raw string: '\s' in a plain literal is an invalid escape sequence
+        'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')),
+        'videos' : int(info_list[1].text.split('videos')[0].strip()),
+        'owner_url' : owner_soup.find('a', href = True)['href'],
+        'owner_name' : owner_soup.text,
+        'category' : info_list[-1].text.split('Category')[1].strip(),
+        'image' : about_soup.find('img', {'alt' : 'Channel Image'})['data-src']
+    }
--- a/cisticola/scraper/gettr.py
+++ b/cisticola/scraper/gettr.py
@@ -25,13 +25,14 @@ class GettrScraper(cisticola.scraper.Scraper):
            if since is not None and post['cdate'] <= int(since.date_archived.timestamp()):
                break

-            posts.append(cisticola.base.ScraperResult(scraper=self.__version__,
-                                                      platform="Gettr",
-                                                      channel=username,
-                                                      platform_id=post['_id'],
-                                                      date=datetime.fromtimestamp(post['cdate']/1000.),
-                                                      date_archived=datetime.now(),
-                                                      raw_data=json.dumps(post)))
+            posts.append(cisticola.base.ScraperResult(
+                scraper=self.__version__,
+                platform="Gettr",
+                channel=username,
+                platform_id=post['_id'],
+                date=datetime.fromtimestamp(post['cdate']/1000.),
+                date_archived=datetime.now(),
+                raw_data=json.dumps(post)))

        return posts

--- a/cisticola/scraper/twitter.py
+++ b/cisticola/scraper/twitter.py
@@ -23,16 +23,17 @@ class TwitterScraper(cisticola.scraper.Scraper):
            TwitterScraper.get_username_from_url(channel.url))

        for tweet in scraper.get_items():
-            if since is not None and tweet.id <= int(since.platform_id):
+            if since is not None and tweet.date.timestamp() <= since.date_archived.timestamp():
                break

-            posts.append(cisticola.base.ScraperResult(scraper=self.__version__,
-                                                      platform="Twitter",
-                                                      channel=channel.id,
-                                                      platform_id=tweet.id,
-                                                      date=tweet.date,
-                                                      date_archived=datetime.now(),
-                                                      raw_data=tweet.json()))
+            posts.append(cisticola.base.ScraperResult(
+                scraper=self.__version__,
+                platform="Twitter",
+                channel=channel.id,
+                platform_id=tweet.id,
+                date=tweet.date,
+                date_archived=datetime.now(),
+                raw_data=tweet.json()))

        return posts

--- a/test.py
+++ b/test.py
@@ -4,6 +4,7 @@

 import cisticola
 import cisticola.scraper.twitter
+
 from sqlalchemy import create_engine


@@ -19,6 +20,10 @@ test_channels = [cisticola.base.Channel(id=0, name="Logan Williams (test)", plat
                 cisticola.base.Channel(id=2, name="LizardRepublic", platform_id='lizardrepublic',
                                   category="qanon", followers=None, platform="Gettr",
                                   url="https://www.gettr.com/user/lizardrepublic", country="US",
+                                   influencer=None, public=True, chat=False, notes=""),
+                 cisticola.base.Channel(id=3, name="Patriot Front", platform_id='OVv9QZL4sEsC',
+                                   category="nazi", followers=None, platform="Bitchute",
+                                   url="https://www.bitchute.com/channel/OVv9QZL4sEsC/", country="US",
                                   influencer=None, public=True, chat=False, notes=""),]