Converted the Bitchute scraper to yield results, got video archiving working on Bitchute and Gettr, added a url_to_blob method that downloads a media bytes blob from a URL, and changed archive_media to take the media bytes blob instead of the media URL.

This commit is contained in:
Tristan Lee
2022-02-25 13:43:30 -06:00
parent 8ab56ff5ba
commit ef83cc4b0a
8 changed files with 101 additions and 42 deletions

View File

@@ -13,6 +13,7 @@ dateparser = "*"
sphinx = "*"
boto3 = "*"
snscrape = {git = "https://github.com/bellingcat/snscrape.git"}
ffmpeg-python = "*"
[dev-packages]

41
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "d3ee112521273c2b0b9df074b4eb9a20649a2854bfffa433171749019acf8561"
"sha256": "f4f00b78a16b39eeb122566ec4cc6bf2dfeae044ae95a281e352e00850c74cc6"
},
"pipfile-spec": 6,
"requires": {
@@ -41,19 +41,19 @@
},
"boto3": {
"hashes": [
"sha256:0e8d4d814f94031947035a4c2bb2c23832d5de941a6a492fb85794a02bafc44d",
"sha256:95d9b5b6fe3383fbf8f33d58f62258d3b3ea138d4369017031339b60fd5b8887"
"sha256:8f59383fe578ac9107466a464d7198933e5332d85a4790f2e01cf24a4a7f635b",
"sha256:af92931f65e33e7450c3389c6cc6ab6b3e2f619697ea5566aacc0f16f3b21f61"
],
"index": "pypi",
"version": "==1.21.6"
"version": "==1.21.7"
},
"botocore": {
"hashes": [
"sha256:359b9ea3870a1f8264113cb0b1216baa94bf1e8cee5d5d8af63a2e7ca6e7b33c",
"sha256:69aaa5a78ac7371f573e463be51fb962213c42fab08ef82380e84b9a87386949"
"sha256:5d1a2a2ac72461bbaa79317b3e4cb72c7ebb315aef184d90f72ec1f6dba0ca6c",
"sha256:a34118bfadc02903ab404148822fe5a6de7a3bb58943f1a6a19cc8b0446d2a50"
],
"markers": "python_version >= '3.6'",
"version": "==1.24.6"
"version": "==1.24.7"
},
"bs4": {
"hashes": [
@@ -101,6 +101,14 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==0.17.1"
},
"ffmpeg-python": {
"hashes": [
"sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127",
"sha256:ac441a0404e053f8b6a1113a77c0f452f1cfc62f6344a769475ffdc0f56c23c5"
],
"index": "pypi",
"version": "==0.2.0"
},
"filelock": {
"hashes": [
"sha256:9cd540a9352e432c7246a48fe4e8712b10acb1df2ad1f30e8c070b82ae1fed85",
@@ -109,6 +117,13 @@
"markers": "python_version >= '3.7'",
"version": "==3.6.0"
},
"future": {
"hashes": [
"sha256:b1bead90b70cf6ec3f0710ae53a525360fa360d306a86583adc6bf83a4db537d"
],
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.18.2"
},
"gogettr": {
"hashes": [
"sha256:9f5c90e3b1befe6eb561d4bca9ca124faddbe5787d8b429f02703c68dd51d255",
@@ -175,7 +190,7 @@
"sha256:fa877ca7f6b48054f847b61d6fa7bed5cebb663ebc55e018fda12db09dcc664c",
"sha256:fdcec0b8399108577ec290f55551d926d9a1fa6cad45882093a7a07ac5ec147b"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"markers": "python_version >= '3' and platform_machine == 'aarch64' or (platform_machine == 'ppc64le' or (platform_machine == 'x86_64' or (platform_machine == 'amd64' or (platform_machine == 'AMD64' or (platform_machine == 'win32' or platform_machine == 'WIN32')))))",
"version": "==1.1.2"
},
"idna": {
@@ -474,9 +489,7 @@
"version": "==2022.1.18"
},
"requests": {
"extras": [
"socks"
],
"extras": [],
"hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
@@ -486,11 +499,11 @@
},
"s3transfer": {
"hashes": [
"sha256:25c140f5c66aa79e1ac60be50dcd45ddc59e83895f062a3aab263b870102911f",
"sha256:69d264d3e760e569b78aaa0f22c97e955891cd22e32b10c51f784eeda4d9d10a"
"sha256:7a6f4c4d1fdb9a2b640244008e142cbc2cd3ae34b386584ef044dd0f27101971",
"sha256:95c58c194ce657a5f4fb0b9e60a84968c808888aed628cd98ab8771fe1db98ed"
],
"markers": "python_version >= '3.6'",
"version": "==0.5.1"
"version": "==0.5.2"
},
"six": {
"hashes": [

View File

@@ -1,4 +1,4 @@
from typing import Generator
from typing import Generator, Tuple
import cisticola.base
import requests
import os
@@ -24,7 +24,8 @@ class Scraper:
def __str__(self):
return self.__version__
def archive_media(self, url: str, key: str = None) -> str:
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
n_retries = 0
r = requests.get(url)
@@ -38,13 +39,16 @@ class Scraper:
return url
blob = r.content
content_type = r.headers.get('Content-Type')
if key is None:
key = url.split('/')[-1]
key = key.split('?')[0]
return blob, content_type, key
def archive_media(self, blob: bytes, content_type: str, key: str) -> str:
filename = self.__version__.replace(' ', '_') + '/' + key
self.s3_client.upload_fileobj(BytesIO(blob), Bucket=os.getenv(

View File

@@ -11,7 +11,7 @@ from bs4 import BeautifulSoup
import cisticola.base
class BitchuteScraper(cisticola.scraper.Scraper):
class BitchuteScraper(cisticola.scraper.base.Scraper):
"""An implementation of a Scraper for Bitchute, using classes from the 4cat
library"""
__version__ = "BitchuteScraper 0.0.1"
@@ -34,28 +34,33 @@ class BitchuteScraper(cisticola.scraper.Scraper):
# Don't scrape comment information
#TODO implement framework for processing and storing comments
detail = 'basic'
detail = 'comments'
posts = []
username = BitchuteScraper.get_username_from_url(channel.url)
scraper = get_videos_user(session, username, csrftoken, detail)
for i, post in enumerate(scraper):
for post in scraper:
if since is not None and post['timestamp'] <= since.date_archived.timestamp():
print( f'\n\nBREAK ON VIDEO: {i}\n\n')
if since is not None and datetime.fromtimestamp(post['timestamp']) <= since.date:
break
posts.append(cisticola.base.ScraperResult(
archived_urls = {}
if 'video_url' in post:
url = post['video_url']
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[url] = archived_url
yield cisticola.base.ScraperResult(
scraper=self.__version__,
platform="Bitchute",
channel=channel.id,
platform_id=post['id'],
date=datetime.fromtimestamp(post['timestamp']),
date_archived=datetime.now(),
raw_data=json.dumps(post)))
return posts
raw_data=json.dumps(post),
archived_urls=archived_urls)
def can_handle(self, channel):
if channel.platform == "Bitchute" and BitchuteScraper.get_username_from_url(channel.url) is not None:

View File

@@ -2,9 +2,10 @@ import cisticola.base
import cisticola.scraper.base
from datetime import datetime
import json
from typing import Generator
from typing import Generator, Tuple
from gogettr import PublicClient
import ffmpeg
import tempfile
class GettrScraper(cisticola.scraper.base.Scraper):
"""An implementation of a Scraper for Gettr, using gogettr library"""
__version__ = "GettrScraper 0.0.1"
@@ -30,16 +31,20 @@ class GettrScraper(cisticola.scraper.base.Scraper):
if 'imgs' in post:
for img in post['imgs']:
url = "https://media.gettr.com/" + img
archived_url = self.archive_media(url)
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[img] = archived_url
if 'main' in post:
archived_url = self.archive_media("https://media.gettr.com/" + post['main'])
url = "https://media.gettr.com/" + post['main']
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[post['main']] = archived_url
# TODO this is just archiving the playlist file, not the actual video
if 'vid' in post:
archived_url = self.archive_media("https://media.gettr.com/" + post['vid'])
url = "https://media.gettr.com/" + post['vid']
media_blob, content_type, key = self.m3u8_url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[post['vid']] = archived_url
yield cisticola.base.ScraperResult(
@@ -55,3 +60,26 @@ class GettrScraper(cisticola.scraper.base.Scraper):
def can_handle(self, channel):
if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None:
return True
def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
    """Download a streamed video with ffmpeg and return it as one MP4 blob.

    ffmpeg reads the stream at ``url`` (presumably an HLS ``.m3u8``
    playlist, going by the method name -- confirm against callers), copies
    the video stream without re-encoding it (``vcodec='copy'``) and writes
    the result to a temporary ``.mp4`` file, which is then read into memory.

    Args:
        url: Location of the stream to download.
        key: Storage key to return for the blob. When ``None``, the
            second-to-last path segment of ``url`` plus ``.mp4`` is used.

    Returns:
        A ``(blob, content_type, key)`` tuple, where ``content_type`` is
        always ``'video/mp4'``.

    Raises:
        ffmpeg.Error: presumably raised by ``run()`` when the ffmpeg process
            fails -- see the ffmpeg-python documentation.
    """
    # Local import: the file-level import block is not known to provide `os`.
    import os

    # Using mkv might be more robust: https://stackoverflow.com/a/42871067
    content_type = 'video/mp4'
    ext = '.' + content_type.split('/')[-1]

    # Write into a temporary directory rather than an open NamedTemporaryFile:
    # ffmpeg has to open the output path itself, and a second open of a
    # still-open NamedTemporaryFile is not possible on Windows.
    with tempfile.TemporaryDirectory() as temp_dir:
        output_path = os.path.join(temp_dir, 'video' + ext)
        (
            ffmpeg
            .input(url)
            .output(output_path, vcodec='copy')
            .global_args('-loglevel', 'error')
            .run(overwrite_output=True))

        with open(output_path, 'rb') as video_file:
            blob = video_file.read()

    if key is None:
        # e.g. ".../<video-id>/out.m3u8" -> "<video-id>.mp4"
        key = url.split('/')[-2] + ext

    return blob, content_type, key

View File

@@ -25,12 +25,14 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper):
archived_urls = {}
for image_url in post.images:
archive_url = self.archive_media(image_url)
archived_urls[image_url] = archive_url
media_blob, content_type, key = self.url_to_blob(image_url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[image_url] = archived_url
if post.video:
video_archive_url = self.archive_media(post.video)
archived_urls[post.video] = video_archive_url
media_blob, content_type, key = self.url_to_blob(post.video)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[post.video] = archived_url
yield cisticola.base.ScraperResult(
scraper=self.__version__,

View File

@@ -41,7 +41,8 @@ class TwitterScraper(cisticola.scraper.base.Scraper):
url = None
if url is not None:
archived_url = self.archive_media(url)
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_media(media_blob, content_type, key)
archived_urls[url] = archived_url
yield cisticola.base.ScraperResult(

13
test.py
View File

@@ -2,6 +2,7 @@ import cisticola
import cisticola.scraper.telegram_snscrape
import cisticola.scraper.twitter
import cisticola.scraper.gettr
import cisticola.scraper.bitchute
from sqlalchemy import create_engine
@@ -20,10 +21,11 @@ test_channels = [
category="qanon", followers=None, platform="Gettr",
url="https://www.gettr.com/user/lizardrepublic", screenname="lizardrepublic", country="US",
influencer=None, public=True, chat=False, notes=""),
cisticola.base.Channel(id=3, name="Patriot Front", platform_id='OVv9QZL4sEsC',
category="nazi", followers=None, platform="Bitchute",
url="https://www.bitchute.com/channel/OVv9QZL4sEsC/", screenname=None, country="US",
influencer=None, public=True, chat=False, notes=""),]
cisticola.base.Channel(
id=4, name="bestonlinejewelrystoresusa@gmail.com", platform_id='bestonlinejewelrystoresusagmailcom',
category="spam", followers=None, platform="Bitchute",
url="https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/", screenname=None, country="US",
influencer=None, public=True, chat=False, notes=""),]
controller = cisticola.ScraperController()
@@ -37,6 +39,9 @@ controller.register_scraper(telegram)
gettr = cisticola.scraper.gettr.GettrScraper()
controller.register_scraper(gettr)
# Register the Bitchute scraper itself; the original registered `gettr` a
# second time, leaving the Bitchute test channel without a scraper.
bitchute = cisticola.scraper.bitchute.BitchuteScraper()
controller.register_scraper(bitchute)
engine = create_engine('sqlite:///test3.db')
controller.connect_to_db(engine)