mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
Converted the Bitchute scraper to yield results; got video archiving working on Bitchute and Gettr; added a url_to_blob method that downloads a media blob (bytes) from a URL; and changed archive_media to take the media blob instead of the media URL.
This commit is contained in:
1
Pipfile
1
Pipfile
@@ -13,6 +13,7 @@ dateparser = "*"
|
||||
sphinx = "*"
|
||||
boto3 = "*"
|
||||
snscrape = {git = "https://github.com/bellingcat/snscrape.git"}
|
||||
ffmpeg-python = "*"
|
||||
|
||||
[dev-packages]
|
||||
|
||||
|
||||
41
Pipfile.lock
generated
41
Pipfile.lock
generated
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "d3ee112521273c2b0b9df074b4eb9a20649a2854bfffa433171749019acf8561"
|
||||
"sha256": "f4f00b78a16b39eeb122566ec4cc6bf2dfeae044ae95a281e352e00850c74cc6"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
@@ -41,19 +41,19 @@
|
||||
},
|
||||
"boto3": {
|
||||
"hashes": [
|
||||
"sha256:0e8d4d814f94031947035a4c2bb2c23832d5de941a6a492fb85794a02bafc44d",
|
||||
"sha256:95d9b5b6fe3383fbf8f33d58f62258d3b3ea138d4369017031339b60fd5b8887"
|
||||
"sha256:8f59383fe578ac9107466a464d7198933e5332d85a4790f2e01cf24a4a7f635b",
|
||||
"sha256:af92931f65e33e7450c3389c6cc6ab6b3e2f619697ea5566aacc0f16f3b21f61"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.21.6"
|
||||
"version": "==1.21.7"
|
||||
},
|
||||
"botocore": {
|
||||
"hashes": [
|
||||
"sha256:359b9ea3870a1f8264113cb0b1216baa94bf1e8cee5d5d8af63a2e7ca6e7b33c",
|
||||
"sha256:69aaa5a78ac7371f573e463be51fb962213c42fab08ef82380e84b9a87386949"
|
||||
"sha256:5d1a2a2ac72461bbaa79317b3e4cb72c7ebb315aef184d90f72ec1f6dba0ca6c",
|
||||
"sha256:a34118bfadc02903ab404148822fe5a6de7a3bb58943f1a6a19cc8b0446d2a50"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==1.24.6"
|
||||
"version": "==1.24.7"
|
||||
},
|
||||
"bs4": {
|
||||
"hashes": [
|
||||
@@ -101,6 +101,14 @@
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==0.17.1"
|
||||
},
|
||||
"ffmpeg-python": {
|
||||
"hashes": [
|
||||
"sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127",
|
||||
"sha256:ac441a0404e053f8b6a1113a77c0f452f1cfc62f6344a769475ffdc0f56c23c5"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.2.0"
|
||||
},
|
||||
"filelock": {
|
||||
"hashes": [
|
||||
"sha256:9cd540a9352e432c7246a48fe4e8712b10acb1df2ad1f30e8c070b82ae1fed85",
|
||||
@@ -109,6 +117,13 @@
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==3.6.0"
|
||||
},
|
||||
"future": {
|
||||
"hashes": [
|
||||
"sha256:b1bead90b70cf6ec3f0710ae53a525360fa360d306a86583adc6bf83a4db537d"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.18.2"
|
||||
},
|
||||
"gogettr": {
|
||||
"hashes": [
|
||||
"sha256:9f5c90e3b1befe6eb561d4bca9ca124faddbe5787d8b429f02703c68dd51d255",
|
||||
@@ -175,7 +190,7 @@
|
||||
"sha256:fa877ca7f6b48054f847b61d6fa7bed5cebb663ebc55e018fda12db09dcc664c",
|
||||
"sha256:fdcec0b8399108577ec290f55551d926d9a1fa6cad45882093a7a07ac5ec147b"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"markers": "python_version >= '3' and platform_machine == 'aarch64' or (platform_machine == 'ppc64le' or (platform_machine == 'x86_64' or (platform_machine == 'amd64' or (platform_machine == 'AMD64' or (platform_machine == 'win32' or platform_machine == 'WIN32')))))",
|
||||
"version": "==1.1.2"
|
||||
},
|
||||
"idna": {
|
||||
@@ -474,9 +489,7 @@
|
||||
"version": "==2022.1.18"
|
||||
},
|
||||
"requests": {
|
||||
"extras": [
|
||||
"socks"
|
||||
],
|
||||
"extras": [],
|
||||
"hashes": [
|
||||
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
|
||||
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
|
||||
@@ -486,11 +499,11 @@
|
||||
},
|
||||
"s3transfer": {
|
||||
"hashes": [
|
||||
"sha256:25c140f5c66aa79e1ac60be50dcd45ddc59e83895f062a3aab263b870102911f",
|
||||
"sha256:69d264d3e760e569b78aaa0f22c97e955891cd22e32b10c51f784eeda4d9d10a"
|
||||
"sha256:7a6f4c4d1fdb9a2b640244008e142cbc2cd3ae34b386584ef044dd0f27101971",
|
||||
"sha256:95c58c194ce657a5f4fb0b9e60a84968c808888aed628cd98ab8771fe1db98ed"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==0.5.1"
|
||||
"version": "==0.5.2"
|
||||
},
|
||||
"six": {
|
||||
"hashes": [
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from typing import Generator
|
||||
from typing import Generator, Tuple
|
||||
import cisticola.base
|
||||
import requests
|
||||
import os
|
||||
@@ -24,7 +24,8 @@ class Scraper:
|
||||
def __str__(self):
|
||||
return self.__version__
|
||||
|
||||
def archive_media(self, url: str, key: str = None) -> str:
|
||||
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
|
||||
|
||||
n_retries = 0
|
||||
r = requests.get(url)
|
||||
|
||||
@@ -38,13 +39,16 @@ class Scraper:
|
||||
return url
|
||||
|
||||
blob = r.content
|
||||
|
||||
content_type = r.headers.get('Content-Type')
|
||||
|
||||
if key is None:
|
||||
key = url.split('/')[-1]
|
||||
key = key.split('?')[0]
|
||||
|
||||
return blob, content_type, key
|
||||
|
||||
def archive_media(self, blob: bytes, content_type: str, key: str) -> str:
|
||||
|
||||
filename = self.__version__.replace(' ', '_') + '/' + key
|
||||
|
||||
self.s3_client.upload_fileobj(BytesIO(blob), Bucket=os.getenv(
|
||||
|
||||
@@ -11,7 +11,7 @@ from bs4 import BeautifulSoup
|
||||
|
||||
import cisticola.base
|
||||
|
||||
class BitchuteScraper(cisticola.scraper.Scraper):
|
||||
class BitchuteScraper(cisticola.scraper.base.Scraper):
|
||||
"""An implementation of a Scraper for Bitchute, using classes from the 4cat
|
||||
library"""
|
||||
__version__ = "BitchuteScraper 0.0.1"
|
||||
@@ -34,28 +34,33 @@ class BitchuteScraper(cisticola.scraper.Scraper):
|
||||
|
||||
# Don't scrape comment information
|
||||
#TODO implement framework for processing and storing comments
|
||||
detail = 'basic'
|
||||
detail = 'comments'
|
||||
|
||||
posts = []
|
||||
username = BitchuteScraper.get_username_from_url(channel.url)
|
||||
scraper = get_videos_user(session, username, csrftoken, detail)
|
||||
|
||||
for i, post in enumerate(scraper):
|
||||
for post in scraper:
|
||||
|
||||
if since is not None and post['timestamp'] <= since.date_archived.timestamp():
|
||||
print( f'\n\nBREAK ON VIDEO: {i}\n\n')
|
||||
if since is not None and datetime.fromtimestamp(post['timestamp']) <= since.date:
|
||||
break
|
||||
|
||||
posts.append(cisticola.base.ScraperResult(
|
||||
archived_urls = {}
|
||||
|
||||
if 'video_url' in post:
|
||||
url = post['video_url']
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
yield cisticola.base.ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="Bitchute",
|
||||
channel=channel.id,
|
||||
platform_id=post['id'],
|
||||
date=datetime.fromtimestamp(post['timestamp']),
|
||||
date_archived=datetime.now(),
|
||||
raw_data=json.dumps(post)))
|
||||
|
||||
return posts
|
||||
raw_data=json.dumps(post),
|
||||
archived_urls=archived_urls)
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Bitchute" and BitchuteScraper.get_username_from_url(channel.url) is not None:
|
||||
|
||||
@@ -2,9 +2,10 @@ import cisticola.base
|
||||
import cisticola.scraper.base
|
||||
from datetime import datetime
|
||||
import json
|
||||
from typing import Generator
|
||||
from typing import Generator, Tuple
|
||||
from gogettr import PublicClient
|
||||
|
||||
import ffmpeg
|
||||
import tempfile
|
||||
class GettrScraper(cisticola.scraper.base.Scraper):
|
||||
"""An implementation of a Scraper for Gettr, using gogettr library"""
|
||||
__version__ = "GettrScraper 0.0.1"
|
||||
@@ -30,16 +31,20 @@ class GettrScraper(cisticola.scraper.base.Scraper):
|
||||
if 'imgs' in post:
|
||||
for img in post['imgs']:
|
||||
url = "https://media.gettr.com/" + img
|
||||
archived_url = self.archive_media(url)
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[img] = archived_url
|
||||
|
||||
if 'main' in post:
|
||||
archived_url = self.archive_media("https://media.gettr.com/" + post['main'])
|
||||
url = "https://media.gettr.com/" + post['main']
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[post['main']] = archived_url
|
||||
|
||||
# TODO this is just archiving the playlist file, not the actual video
|
||||
if 'vid' in post:
|
||||
archived_url = self.archive_media("https://media.gettr.com/" + post['vid'])
|
||||
url = "https://media.gettr.com/" + post['vid']
|
||||
media_blob, content_type, key = self.m3u8_url_to_blob(url)
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[post['vid']] = archived_url
|
||||
|
||||
yield cisticola.base.ScraperResult(
|
||||
@@ -55,3 +60,26 @@ class GettrScraper(cisticola.scraper.base.Scraper):
|
||||
def can_handle(self, channel):
    """Report whether this scraper can process `channel`.

    A channel is handled when its `platform` is exactly "Gettr" and
    `GettrScraper.get_username_from_url` returns something other than None
    for its `url`.

    Always returns a real bool (never an implicit None), so callers may
    compare or store the result directly.
    """
    return (
        channel.platform == "Gettr"
        and GettrScraper.get_username_from_url(channel.url) is not None)
|
||||
|
||||
def m3u8_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
    """Fetch the stream at `url` with ffmpeg and return it as a single MP4 blob.

    Parameters
    ----------
    url:
        Input handed to ffmpeg; callers in this file pass a Gettr video
        playlist URL.
    key:
        Name to store the media under. When None, it is derived from the
        second-to-last `/`-separated segment of `url` plus the `.mp4`
        extension.

    Returns
    -------
    A `(blob, content_type, key)` tuple: the bytes of the file ffmpeg wrote,
    the fixed content type `'video/mp4'`, and the resolved key.

    Any error raised by ffmpeg's `run()` propagates to the caller.
    """
    # Function-scope import: this module's visible import block has no `os`.
    import os

    # Using mkv might be more robust: https://stackoverflow.com/a/42871067
    content_type = 'video/mp4'
    ext = '.' + content_type.split('/')[-1]

    # ffmpeg opens the output path itself. Re-opening a still-open
    # NamedTemporaryFile by name is not portable (it fails on Windows), so
    # the output goes into a private temporary directory instead, and the
    # finished file is read back before the directory is cleaned up.
    with tempfile.TemporaryDirectory() as temp_dir:
        output_path = os.path.join(temp_dir, 'video' + ext)

        (
            ffmpeg
            .input(url)
            .output(output_path, vcodec='copy')
            .global_args('-loglevel', 'error')
            .run(overwrite_output=True))

        with open(output_path, 'rb') as video_file:
            blob = video_file.read()

    if key is None:
        key = url.split('/')[-2] + ext

    return blob, content_type, key
|
||||
@@ -25,12 +25,14 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper):
|
||||
archived_urls = {}
|
||||
|
||||
for image_url in post.images:
|
||||
archive_url = self.archive_media(image_url)
|
||||
archived_urls[image_url] = archive_url
|
||||
media_blob, content_type, key = self.url_to_blob(image_url)
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[image_url] = archived_url
|
||||
|
||||
if post.video:
|
||||
video_archive_url = self.archive_media(post.video)
|
||||
archived_urls[post.video] = video_archive_url
|
||||
media_blob, content_type, key = self.url_to_blob(post.video)
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[post.video] = archived_url
|
||||
|
||||
yield cisticola.base.ScraperResult(
|
||||
scraper=self.__version__,
|
||||
|
||||
@@ -41,7 +41,8 @@ class TwitterScraper(cisticola.scraper.base.Scraper):
|
||||
url = None
|
||||
|
||||
if url is not None:
|
||||
archived_url = self.archive_media(url)
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
archived_url = self.archive_media(media_blob, content_type, key)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
yield cisticola.base.ScraperResult(
|
||||
|
||||
13
test.py
13
test.py
@@ -2,6 +2,7 @@ import cisticola
|
||||
import cisticola.scraper.telegram_snscrape
|
||||
import cisticola.scraper.twitter
|
||||
import cisticola.scraper.gettr
|
||||
import cisticola.scraper.bitchute
|
||||
|
||||
from sqlalchemy import create_engine
|
||||
|
||||
@@ -20,10 +21,11 @@ test_channels = [
|
||||
category="qanon", followers=None, platform="Gettr",
|
||||
url="https://www.gettr.com/user/lizardrepublic", screenname="lizardrepublic", country="US",
|
||||
influencer=None, public=True, chat=False, notes=""),
|
||||
cisticola.base.Channel(id=3, name="Patriot Front", platform_id='OVv9QZL4sEsC',
|
||||
category="nazi", followers=None, platform="Bitchute",
|
||||
url="https://www.bitchute.com/channel/OVv9QZL4sEsC/", screenname=None, country="US",
|
||||
influencer=None, public=True, chat=False, notes=""),]
|
||||
cisticola.base.Channel(
|
||||
id=4, name="bestonlinejewelrystoresusa@gmail.com", platform_id='bestonlinejewelrystoresusagmailcom',
|
||||
category="spam", followers=None, platform="Bitchute",
|
||||
url="https://www.bitchute.com/channel/bestonlinejewelrystoresusagmailcom/", screenname=None, country="US",
|
||||
influencer=None, public=True, chat=False, notes=""),]
|
||||
|
||||
|
||||
controller = cisticola.ScraperController()
|
||||
@@ -37,6 +39,9 @@ controller.register_scraper(telegram)
|
||||
gettr = cisticola.scraper.gettr.GettrScraper()
|
||||
controller.register_scraper(gettr)
|
||||
|
||||
bitchute = cisticola.scraper.bitchute.BitchuteScraper()
# Register the Bitchute scraper instance itself so the controller can
# dispatch Bitchute channels to it.
controller.register_scraper(bitchute)
|
||||
|
||||
engine = create_engine('sqlite:///test3.db')
|
||||
controller.connect_to_db(engine)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user