diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index a5292aa..8a365f4 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -236,8 +236,8 @@ def append_details(video, detail): video["video_url"] = soup.select_one("video#player source").get("src") video["thumbnail_image"] = soup.select_one("video#player").get("poster") video["subject"] = soup.select_one("h1#video-title").text - video["author"] = soup.select_one("div.channel-banner p.name a").text - video["author_id"] = soup.select_one("div.channel-banner p.name a").get("href").split("/")[2] + video["author_id"] = soup.select_one("p.owner a").get("href").split("/")[2] + video["author"] = soup.select_one("div.channel-banner p.name a").get("href").split("/")[2] video["body"] = soup.select_one("div#video-description").encode_contents().decode("utf-8").strip() # we need *two more requests* to get the comment count and like/dislike counts diff --git a/cisticola/transformer/__init__.py b/cisticola/transformer/__init__.py index e3a4b49..78cca55 100644 --- a/cisticola/transformer/__init__.py +++ b/cisticola/transformer/__init__.py @@ -1,2 +1,3 @@ from . 
import base
-from .twitter import TwitterTransformer
\ No newline at end of file
+from .twitter import TwitterTransformer
+from .bitchute import BitchuteTransformer
\ No newline at end of file
diff --git a/cisticola/transformer/bitchute.py b/cisticola/transformer/bitchute.py
new file mode 100644
index 0000000..de6f0a7
--- /dev/null
+++ b/cisticola/transformer/bitchute.py
@@ -0,0 +1,62 @@
+import json
+from loguru import logger
+from typing import Generator
+
+from bs4 import BeautifulSoup
+
+from cisticola.transformer.base import Transformer
+from cisticola.base import ScraperResult, TransformedResult, Image, Video, Media
+
+class BitchuteTransformer(Transformer):
+    """Transform raw Bitchute scraper results into normalized posts and media."""
+
+    __version__ = "BitchuteTransformer 0.0.1"
+
+    def can_handle(self, data: ScraperResult) -> bool:
+        """Return True when `data` was produced by a `BitchuteScraper`.
+
+        Only the first space-separated token of `data.scraper` is compared;
+        whatever follows it (presumably a version -- confirm) is ignored.
+        """
+        return data.scraper.split(' ')[0] == "BitchuteScraper"
+
+    def transform_media(self, data: ScraperResult, transformed: TransformedResult) -> Generator[Media, None, None]:
+        """Yield a `Video` for the archived copy of the post's video.
+
+        The archived URL is read with `data.archived_urls[video_url]`, so a
+        video URL with no entry there raises rather than being skipped.
+        """
+        raw = json.loads(data.raw_data)
+
+        original_url = raw['video_url']
+        archived_url = data.archived_urls[original_url]
+
+        yield Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=original_url)
+
+    def transform(self, data: ScraperResult) -> TransformedResult:
+        """Build a `TransformedResult` from a raw Bitchute scrape.
+
+        The content is the text of the last `<p>` in the HTML description
+        stored under `body`; when the description has no `<p>` at all, the
+        description's full text is used instead.
+        """
+        raw = json.loads(data.raw_data)
+
+        soup = BeautifulSoup(raw['body'], features='html.parser')
+        paragraphs = soup.find_all('p')
+        # Indexing [-1] on an empty result would raise IndexError, so fall
+        # back to the whole description text when there is no <p> element.
+        content = paragraphs[-1].text if paragraphs else soup.get_text()
+
+        return TransformedResult(
+            raw_id=data.id,
+            scraper=data.scraper,
+            transformer=self.__version__,
+            platform=data.platform,
+            channel=data.channel,
+            date=data.date,
+            date_archived=data.date_archived,
+            url=raw['url'],
+            content=content,
+            author_id=raw['author_id'],
+            author_username=raw['author'])