mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-12 21:38:33 +03:00
added Bitchute scraper, minor change to Bitchute scraper to correctly extract author name and id
This commit is contained in:
@@ -236,8 +236,8 @@ def append_details(video, detail):
|
||||
video["video_url"] = soup.select_one("video#player source").get("src")
|
||||
video["thumbnail_image"] = soup.select_one("video#player").get("poster")
|
||||
video["subject"] = soup.select_one("h1#video-title").text
|
||||
video["author"] = soup.select_one("div.channel-banner p.name a").text
|
||||
video["author_id"] = soup.select_one("div.channel-banner p.name a").get("href").split("/")[2]
|
||||
video["author_id"] = soup.select_one("p.owner a").get("href").split("/")[2]
|
||||
video["author"] = soup.select_one("div.channel-banner p.name a").get("href").split("/")[2]
|
||||
video["body"] = soup.select_one("div#video-description").encode_contents().decode("utf-8").strip()
|
||||
|
||||
# we need *two more requests* to get the comment count and like/dislike counts
|
||||
|
||||
@@ -1,2 +1,3 @@
|
||||
from . import base
|
||||
from .twitter import TwitterTransformer
|
||||
from .twitter import TwitterTransformer
|
||||
from .bitchute import BitchuteTransformer
|
||||
51
cisticola/transformer/bitchute.py
Normal file
51
cisticola/transformer/bitchute.py
Normal file
@@ -0,0 +1,51 @@
|
||||
import json
|
||||
from loguru import logger
|
||||
from typing import Generator
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from cisticola.transformer.base import Transformer
|
||||
from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media
|
||||
|
||||
class BitchuteTransformer(Transformer):
    """Transformer for results produced by the Bitchute scraper.

    Converts a raw ``ScraperResult`` (whose ``raw_data`` is a JSON document)
    into a ``TransformedResult`` and yields the associated ``Video`` media
    record.
    """

    __version__ = "BitchuteTransformer 0.0.1"

    def can_handle(self, data: ScraperResult) -> bool:
        """Return True if ``data`` was produced by a ``BitchuteScraper``.

        Only the first space-separated token of ``data.scraper`` is compared,
        so anything following the scraper name is ignored.
        """
        return data.scraper.split(' ')[0] == "BitchuteScraper"

    def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]:
        """Yield the ``Video`` record for the post described by ``data``.

        The original video URL is read from the ``video_url`` key of the raw
        JSON and mapped to its archived location via ``data.archived_urls``.

        Raises:
            KeyError: if ``video_url`` is missing from the raw JSON, or the
                URL has no entry in ``data.archived_urls``.
        """
        raw = json.loads(data.raw_data)

        original_url = raw['video_url']
        archived_url = data.archived_urls[original_url]

        yield Video(
            url=archived_url,
            post=transformed.id,
            raw_id=data.id,
            original_url=original_url,
        )

    def transform(self, data: ScraperResult) -> TransformedResult:
        """Build a ``TransformedResult`` from a raw Bitchute ``ScraperResult``.

        The post content is the text of the last ``<p>`` element found in the
        HTML stored under the ``body`` key of the raw JSON. If the HTML
        contains no ``<p>`` element, the stripped plain text of the whole
        body is used instead.

        Raises:
            KeyError: if ``body``, ``url``, ``author_id`` or ``author`` is
                missing from the raw JSON.
        """
        raw = json.loads(data.raw_data)

        soup = BeautifulSoup(raw['body'], features='html.parser')
        paragraphs = soup.find_all('p')
        if paragraphs:
            content = paragraphs[-1].text
        else:
            # Indexing [-1] on an empty result list would raise IndexError
            # for descriptions without any <p> tag; fall back to plain text.
            content = soup.get_text().strip()

        return TransformedResult(
            raw_id=data.id,
            scraper=data.scraper,
            transformer=self.__version__,
            platform=data.platform,
            channel=data.channel,
            date=data.date,
            date_archived=data.date_archived,
            url=raw['url'],
            content=content,
            author_id=raw['author_id'],
            author_username=raw['author'],
        )
|
||||
Reference in New Issue
Block a user