Added Bitchute transformer; minor change to Bitchute scraper to correctly extract author name and ID

This commit is contained in:
Tristan Lee
2022-03-10 13:03:01 -06:00
parent 5783206ad8
commit 3d919316a9
3 changed files with 55 additions and 3 deletions

View File

@@ -236,8 +236,8 @@ def append_details(video, detail):
video["video_url"] = soup.select_one("video#player source").get("src")
video["thumbnail_image"] = soup.select_one("video#player").get("poster")
video["subject"] = soup.select_one("h1#video-title").text
video["author"] = soup.select_one("div.channel-banner p.name a").text
video["author_id"] = soup.select_one("div.channel-banner p.name a").get("href").split("/")[2]
video["author_id"] = soup.select_one("p.owner a").get("href").split("/")[2]
video["author"] = soup.select_one("div.channel-banner p.name a").get("href").split("/")[2]
video["body"] = soup.select_one("div#video-description").encode_contents().decode("utf-8").strip()
# we need *two more requests* to get the comment count and like/dislike counts

View File

@@ -1,2 +1,3 @@
from . import base
from .twitter import TwitterTransformer
from .twitter import TwitterTransformer
from .bitchute import BitchuteTransformer

View File

@@ -0,0 +1,51 @@
import json
from loguru import logger
from typing import Generator
from bs4 import BeautifulSoup
from cisticola.transformer.base import Transformer
from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media
class BitchuteTransformer(Transformer):
    """Transformer that turns raw Bitchute scraper output into `TransformedResult` objects.

    The raw payload (`ScraperResult.raw_data`) is parsed as JSON. This class
    reads the keys ``video_url``, ``body``, ``url``, ``author_id`` and
    ``author`` from it.
    """

    __version__ = "BitchuteTransformer 0.0.1"

    def can_handle(self, data: ScraperResult) -> bool:
        """Return True if `data` was produced by the Bitchute scraper.

        Only the first space-separated token of ``data.scraper`` is compared,
        so anything following the scraper name (presumably a version number,
        as in ``__version__`` above) is ignored.
        """
        return data.scraper.split(' ')[0] == "BitchuteScraper"

    def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]:
        """Yield the single archived `Video` belonging to a Bitchute post.

        Args:
            data: Raw scraper result; its JSON payload must contain
                ``video_url`` and ``data.archived_urls`` must have an entry
                for that URL.
            transformed: The already transformed post; its ``id`` is stored
                on the yielded media object as ``post``.

        Yields:
            One `Video` whose ``url`` is the archived location and whose
            ``original_url`` is the URL found in the raw payload.

        Raises:
            KeyError: If the payload has no ``video_url`` key (and,
                assuming ``archived_urls`` is a mapping, if the video URL
                has no archive entry).
        """
        raw = json.loads(data.raw_data)

        original_url = raw['video_url']
        archived_url = data.archived_urls[original_url]

        yield Video(
            url=archived_url,
            post=transformed.id,
            raw_id=data.id,
            original_url=original_url)

    def transform(self, data: ScraperResult) -> TransformedResult:
        """Build a `TransformedResult` from a raw Bitchute scraper result.

        The post content is the text of the last ``<p>`` element of the HTML
        stored under ``body``. If the body has no ``<p>`` element at all, the
        full text of the body is used instead.

        Args:
            data: Raw scraper result whose ``raw_data`` is a JSON document
                with the keys ``body``, ``url``, ``author_id`` and ``author``.

        Returns:
            The transformed post, tagged with this transformer's
            ``__version__``.

        Raises:
            KeyError: If one of the required keys is missing from the payload.
        """
        raw = json.loads(data.raw_data)
        soup = BeautifulSoup(raw['body'], features='html.parser')

        # Indexing [-1] on an empty result list used to raise IndexError for
        # descriptions without any <p> tag; fall back to the whole text.
        paragraphs = soup.find_all('p')
        content = paragraphs[-1].text if paragraphs else soup.get_text()

        return TransformedResult(
            raw_id=data.id,
            scraper=data.scraper,
            transformer=self.__version__,
            platform=data.platform,
            channel=data.channel,
            date=data.date,
            date_archived=data.date_archived,
            url=raw['url'],
            content=content,
            author_id=raw['author_id'],
            author_username=raw['author'])