Convert the Bitchute scraper to yield results; get video archiving working for Bitchute and Gettr; add a url_to_blob method that downloads a media blob (bytes) from a URL; change archive_media to take the media blob instead of the media URL.

2026-06-08 03:18:34 +03:00 · 2022-02-25 13:43:30 -06:00
parent 8ab56ff5ba
commit ef83cc4b0a
8 changed files with 101 additions and 42 deletions
--- a/Pipfile
+++ b/Pipfile
@@ -13,6 +13,7 @@ dateparser = "*"
 sphinx = "*"
 boto3 = "*"
 snscrape = {git = "https://github.com/bellingcat/snscrape.git"}
+ffmpeg-python = "*"

 [dev-packages]

--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
 {
    "_meta": {
        "hash": {
-            "sha256": "d3ee112521273c2b0b9df074b4eb9a20649a2854bfffa433171749019acf8561"
+            "sha256": "f4f00b78a16b39eeb122566ec4cc6bf2dfeae044ae95a281e352e00850c74cc6"
        },
        "pipfile-spec": 6,
        "requires": {
@@ -41,19 +41,19 @@
        },
        "boto3": {
            "hashes": [
-                "sha256:0e8d4d814f94031947035a4c2bb2c23832d5de941a6a492fb85794a02bafc44d",
-                "sha256:95d9b5b6fe3383fbf8f33d58f62258d3b3ea138d4369017031339b60fd5b8887"
+                "sha256:8f59383fe578ac9107466a464d7198933e5332d85a4790f2e01cf24a4a7f635b",
+                "sha256:af92931f65e33e7450c3389c6cc6ab6b3e2f619697ea5566aacc0f16f3b21f61"
            ],
            "index": "pypi",
-            "version": "==1.21.6"
+            "version": "==1.21.7"
        },
        "botocore": {
            "hashes": [
-                "sha256:359b9ea3870a1f8264113cb0b1216baa94bf1e8cee5d5d8af63a2e7ca6e7b33c",
-                "sha256:69aaa5a78ac7371f573e463be51fb962213c42fab08ef82380e84b9a87386949"
+                "sha256:5d1a2a2ac72461bbaa79317b3e4cb72c7ebb315aef184d90f72ec1f6dba0ca6c",
+                "sha256:a34118bfadc02903ab404148822fe5a6de7a3bb58943f1a6a19cc8b0446d2a50"
            ],
            "markers": "python_version >= '3.6'",
-            "version": "==1.24.6"
+            "version": "==1.24.7"
        },
        "bs4": {
            "hashes": [
@@ -101,6 +101,14 @@
            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
            "version": "==0.17.1"
        },
+        "ffmpeg-python": {
+            "hashes": [
+                "sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127",
+                "sha256:ac441a0404e053f8b6a1113a77c0f452f1cfc62f6344a769475ffdc0f56c23c5"
+            ],
+            "index": "pypi",
+            "version": "==0.2.0"
+        },
        "filelock": {
            "hashes": [
                "sha256:9cd540a9352e432c7246a48fe4e8712b10acb1df2ad1f30e8c070b82ae1fed85",
@@ -109,6 +117,13 @@
            "markers": "python_version >= '3.7'",
            "version": "==3.6.0"
        },
+        "future": {
+            "hashes": [
+                "sha256:b1bead90b70cf6ec3f0710ae53a525360fa360d306a86583adc6bf83a4db537d"
+            ],
+            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==0.18.2"
+        },
        "gogettr": {
            "hashes": [
                "sha256:9f5c90e3b1befe6eb561d4bca9ca124faddbe5787d8b429f02703c68dd51d255",
@@ -175,7 +190,7 @@
                "sha256:fa877ca7f6b48054f847b61d6fa7bed5cebb663ebc55e018fda12db09dcc664c",
                "sha256:fdcec0b8399108577ec290f55551d926d9a1fa6cad45882093a7a07ac5ec147b"
            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+            "markers": "python_version >= '3' and platform_machine == 'aarch64' or (platform_machine == 'ppc64le' or (platform_machine == 'x86_64' or (platform_machine == 'amd64' or (platform_machine == 'AMD64' or (platform_machine == 'win32' or platform_machine == 'WIN32')))))",
            "version": "==1.1.2"
        },
        "idna": {
@@ -474,9 +489,7 @@
            "version": "==2022.1.18"
        },
        "requests": {
-            "extras": [
-                "socks"
-            ],
+            "extras": [],
            "hashes": [
                "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
                "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
@@ -486,11 +499,11 @@
        },
        "s3transfer": {
            "hashes": [
-                "sha256:25c140f5c66aa79e1ac60be50dcd45ddc59e83895f062a3aab263b870102911f",
-                "sha256:69d264d3e760e569b78aaa0f22c97e955891cd22e32b10c51f784eeda4d9d10a"
+                "sha256:7a6f4c4d1fdb9a2b640244008e142cbc2cd3ae34b386584ef044dd0f27101971",
+                "sha256:95c58c194ce657a5f4fb0b9e60a84968c808888aed628cd98ab8771fe1db98ed"
            ],
            "markers": "python_version >= '3.6'",
-            "version": "==0.5.1"
+            "version": "==0.5.2"
        },
        "six": {
            "hashes": [
--- a/cisticola/scraper/base.py
+++ b/cisticola/scraper/base.py
@@ -1,4 +1,4 @@
-from typing import Generator
+from typing import Generator, Tuple
 import cisticola.base
 import requests
 import os
@@ -24,7 +24,8 @@ class Scraper:
    def __str__(self):
        return self.__version__

-    def archive_media(self, url: str, key: str = None) -> str:
+    def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
+
        n_retries = 0
        r = requests.get(url)

@@ -38,13 +39,16 @@ class Scraper:
            return url

        blob = r.content
-        
        content_type = r.headers.get('Content-Type')

        if key is None:
            key = url.split('/')[-1]
            key = key.split('?')[0]

+        return blob, content_type, key
+
+    def archive_media(self, blob: bytes, content_type: str, key: str) -> str:
+
        filename = self.__version__.replace(' ', '_') + '/' + key

        self.s3_client.upload_fileobj(BytesIO(blob), Bucket=os.getenv(
--- a/cisticola/scraper/bitchute.py
+++ b/cisticola/scraper/bitchute.py
@@ -11,7 +11,7 @@ from bs4 import BeautifulSoup

 import cisticola.base

-class BitchuteScraper(cisticola.scraper.Scraper):
+class BitchuteScraper(cisticola.scraper.base.Scraper):
    """An implementation of a Scraper for Bitchute, using classes from the 4cat
    library"""
    __version__ = "BitchuteScraper 0.0.1"
@@ -34,28 +34,33 @@ class BitchuteScraper(cisticola.scraper.Scraper):

        # Don't scrape comment information 
        #TODO implement framework for processing and storing comments
-        detail = 'basic'
+        detail = 'comments'

-        posts = []
        username = BitchuteScraper.get_username_from_url(channel.url)
        scraper = get_videos_user(session, username, csrftoken, detail)

-        for i, post in enumerate(scraper):
+        for post in scraper:

-            if since is not None and post['timestamp'] <= since.date_archived.timestamp():
-                print( f'\n\nBREAK ON VIDEO: {i}\n\n')
+            if since is not None and datetime.fromtimestamp(post['timestamp']) <= since.date:
                break

-            posts.append(cisticola.base.ScraperResult(
+            archived_urls = {}
+
+            if 'video_url' in post:
+                url = post['video_url']
+                media_blob, content_type, key = self.url_to_blob(url)
+                archived_url = self.archive_media(media_blob, content_type, key)
+                archived_urls[url] = archived_url
+
+            yield cisticola.base.ScraperResult(
                scraper=self.__version__,
                platform="Bitchute",
                channel=channel.id,
                platform_id=post['id'],
                date=datetime.fromtimestamp(post['timestamp']),
                date_archived=datetime.now(),
-                raw_data=json.dumps(post)))
-
-        return posts
+                raw_data=json.dumps(post),
+                archived_urls=archived_urls)

    def can_handle(self, channel):
        if channel.platform == "Bitchute" and BitchuteScraper.get_username_from_url(channel.url) is not None:
--- a/cisticola/scraper/gettr.py
+++ b/cisticola/scraper/gettr.py
@@ -2,9 +2,10 @@ import cisticola.base
 import cisticola.scraper.base
 from datetime import datetime
 import json
-from typing import Generator
+from typing import Generator, Tuple
 from gogettr import PublicClient
-
+import ffmpeg
+import tempfile
 class GettrScraper(cisticola.scraper.base.Scraper):
    """An implementation of a Scraper for Gettr, using gogettr library"""
    __version__ = "GettrScraper 0.0.1"
@@ -30,16 +31,20 @@ class GettrScraper(cisticola.scraper.base.Scraper):
            if 'imgs' in post:
                for img in post['imgs']:
                    url = "https://media.gettr.com/" + img
-                    archived_url = self.archive_media(url)
+                    media_blob, content_type, key = self.url_to_blob(url)
+                    archived_url = self.archive_media(media_blob, content_type, key)
                    archived_urls[img] = archived_url

            if 'main' in post:
-                archived_url = self.archive_media("https://media.gettr.com/" + post['main'])
+                url = "https://media.gettr.com/" + post['main']
+                media_blob, content_type, key = self.url_to_blob(url)
+                archived_url = self.archive_media(media_blob, content_type, key)
                archived_urls[post['main']] = archived_url

-            # TODO this is just archiving the playlist file, not the actual video
            if 'vid' in post:
-                archived_url = self.archive_media("https://media.gettr.com/" + post['vid'])
+                url = "https://media.gettr.com/" + post['vid']
+                media_blob, content_type, key = self.m3u8_url_to_blob(url)
+                archived_url = self.archive_media(media_blob, content_type, key)
                archived_urls[post['vid']] = archived_url

            yield cisticola.base.ScraperResult(
@@ -55,3 +60,26 @@ class GettrScraper(cisticola.scraper.base.Scraper):
    def can_handle(self, channel):
        if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None:
            return True
+
+    def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
+        """Run ffmpeg on the playlist at `url` and return (file bytes, 'video/mp4', key)."""
+        # Using mkv might be more robust: https://stackoverflow.com/a/42871067
+        content_type = 'video/mp4'
+        ext = '.' + content_type.split('/')[-1]
+
+        with tempfile.NamedTemporaryFile(suffix = ext) as temp_file:
+            # ffmpeg writes its output to the temp file's path; vcodec='copy' copies the video stream as-is
+            (
+                ffmpeg
+                .input(url)
+                .output(temp_file.name, vcodec='copy')
+                .global_args('-loglevel', 'error')
+                .run(overwrite_output=True))
+            # Rewind, then read the temporary file's contents back into memory
+            temp_file.seek(0)
+            blob = temp_file.read()
+
+        if key is None:
+            key = url.split('/')[-2] + ext  # default key: second-to-last URL path segment + extension
+
+        return blob, content_type, key
--- a/cisticola/scraper/telegram_snscrape.py
+++ b/cisticola/scraper/telegram_snscrape.py
@@ -25,12 +25,14 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper):
            archived_urls = {}

            for image_url in post.images:
-                archive_url = self.archive_media(image_url)
-                archived_urls[image_url] = archive_url
+                media_blob, content_type, key = self.url_to_blob(image_url)
+                archived_url = self.archive_media(media_blob, content_type, key)
+                archived_urls[image_url] = archived_url

            if post.video:
-                video_archive_url = self.archive_media(post.video)
-                archived_urls[post.video] = video_archive_url
+                media_blob, content_type, key = self.url_to_blob(post.video)
+                archived_url = self.archive_media(media_blob, content_type, key)
+                archived_urls[post.video] = archived_url

            yield cisticola.base.ScraperResult(
                scraper=self.__version__,
--- a/cisticola/scraper/twitter.py
+++ b/cisticola/scraper/twitter.py
@@ -41,7 +41,8 @@ class TwitterScraper(cisticola.scraper.base.Scraper):
                        url = None

                    if url is not None:
-                        archived_url = self.archive_media(url)
+                        media_blob, content_type, key = self.url_to_blob(url)
+                        archived_url = self.archive_media(media_blob, content_type, key)
                        archived_urls[url] = archived_url

            yield cisticola.base.ScraperResult(
--- a/test.py
+++ b/test.py
@@ -2,6 +2,7 @@ import cisticola
 import cisticola.scraper.telegram_snscrape
 import cisticola.scraper.twitter
 import cisticola.scraper.gettr
+import cisticola.scraper.bitchute

 from sqlalchemy import create_engine

@@ -20,10 +21,11 @@ test_channels = [
                                   category="qanon", followers=None, platform="Gettr",
                                   url="https://www.gettr.com/user/lizardrepublic", screenname="lizardrepublic", country="US",
                                   influencer=None, public=True, chat=False, notes=""),
-                 cisticola.base.Channel(id=3, name="Patriot Front", platform_id='OVv9QZL4sEsC',
-                                   category="nazi", followers=None, platform="Bitchute",
-                                   url="https://www.bitchute.com/channel/OVv9QZL4sEsC/", screenname=None, country="US",
-                                   influencer=None, public=True, chat=False, notes=""),]
+                cisticola.base.Channel(
+                                    id=4, name="bestonlinejewelrystoresusa@gmail.com", platform_id='bestonlinejewelrystoresusagmailcom',
+                                    category="spam", followers=None, platform="Bitchute",
+                                    url="https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/", screenname=None, country="US",
+                                    influencer=None, public=True, chat=False, notes=""),]


 controller = cisticola.ScraperController()
@@ -37,6 +39,9 @@ controller.register_scraper(telegram)
 gettr = cisticola.scraper.gettr.GettrScraper()
 controller.register_scraper(gettr)

+bitchute = cisticola.scraper.bitchute.BitchuteScraper()
+controller.register_scraper(bitchute)
+
 engine = create_engine('sqlite:///test3.db')
 controller.connect_to_db(engine)