From e2094522c9f814b68b376c2ff981ab2575086876 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Thu, 19 May 2022 18:13:50 -0500 Subject: [PATCH] updated Bitchute transformer and added test --- cisticola/scraper/bitchute.py | 19 +++++++++++--- cisticola/transformer/bitchute.py | 41 ++++++++++++++++++++++++++----- tests/transformer/bitchute.py | 34 +++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 10 deletions(-) create mode 100644 tests/transformer/bitchute.py diff --git a/cisticola/scraper/bitchute.py b/cisticola/scraper/bitchute.py index c0cedc9..9942750 100644 --- a/cisticola/scraper/bitchute.py +++ b/cisticola/scraper/bitchute.py @@ -5,6 +5,7 @@ from html.parser import HTMLParser import dateparser import json from typing import Generator +from dateutil.relativedelta import relativedelta import requests from bs4 import BeautifulSoup @@ -70,7 +71,7 @@ class BitchuteScraper(Scraper): if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None: return True - @logger.catch + @logger.catch(reraise = True) def get_profile(self, channel: Channel) -> RawChannelInfo: base_url = channel.url @@ -104,7 +105,7 @@ class BitchuteScraper(Scraper): profile = { 'description' : description_soup.text.strip(), 'description_links' : [a['href'] for a in description_soup.find_all('a', href = True)], - 'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')), + 'created': parse_created(re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. 
'))), 'videos' : int(info_list[1].text.split('videos')[0].strip()), 'owner_url' : soup.find('p', {'class' : 'owner'}).find('a', href = True)['href'], 'owner_name' : owner_name, @@ -116,7 +117,7 @@ class BitchuteScraper(Scraper): return RawChannelInfo(scraper=self.__version__, platform=channel.platform, channel=channel.id, - raw_data=json.dumps(profile), + raw_data=json.dumps(profile, default = str), date_archived=datetime.now(timezone.utc)) #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# @@ -484,4 +485,14 @@ def decode_cfemail(cfemail): return email -#---------------------------------------------------------------------------# \ No newline at end of file +#---------------------------------------------------------------------------# + +def parse_created(created): + + period_list = ['year', 'month', 'week', 'day'] + + periods = [period.strip() for period in created.split('ago')[0].strip().split(',')] + _kwargs = {period : int(number) for period, number in dict(reversed(p.split(' ')) for p in periods).items()} + kwargs = {(k + 's' if k in period_list else k) : v for k, v in _kwargs.items()} + + return datetime.now() - relativedelta(**kwargs) \ No newline at end of file diff --git a/cisticola/transformer/bitchute.py b/cisticola/transformer/bitchute.py index d0c5fe0..19fac56 100644 --- a/cisticola/transformer/bitchute.py +++ b/cisticola/transformer/bitchute.py @@ -1,11 +1,13 @@ import json from loguru import logger -from typing import Generator +from typing import Generator, Union, Callable +from datetime import datetime, timezone +import dateutil.parser from bs4 import BeautifulSoup from cisticola.transformer.base import Transformer -from cisticola.base import ScraperResult, Post, Image, Video, Media +from cisticola.base import RawChannelInfo, ScraperResult, Post, Image, Video, Media, Channel, ChannelInfo class BitchuteTransformer(Transformer): """A Bitchute specific ScraperResult, with a method ETL/transforming""" @@ -19,7 +21,7 @@ 
class BitchuteTransformer(Transformer): return False - def transform_media(self, data: ScraperResult, transformed: Post) -> Generator[Media, None, None]: + def transform_media(self, data: ScraperResult, insert: Callable, transformed: Post) -> Generator[Media, None, None]: raw = json.loads(data.raw_data) orig = raw['video_url'] @@ -27,9 +29,34 @@ class BitchuteTransformer(Transformer): m = Video(url=new, post=transformed.id, raw_id=data.id, original_url=orig) - yield m + insert(m) - def transform(self, data: ScraperResult) -> Post: + def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]: + raw = json.loads(data.raw_data) + + transformed = ChannelInfo( + raw_channel_info_id=data.id, + channel=data.channel, + platform_id=raw['owner_url'].strip('/').split('/')[-1], + platform=data.platform, + scraper=data.scraper, + transformer=self.__version__, + screenname=raw['owner_name'], + name=raw['owner_name'], + description=raw['description'], + description_url='', # does not exist for Bitchute + description_location='', # does not exist for Bitchute + followers=raw['subscribers'], + following=-1, # does not exist for Bitchute + verified=False, # does not exist for Bitchute + date_created=dateutil.parser.parse(raw['created']), + date_archived=data.date_archived, + date_transformed=datetime.now(timezone.utc) + ) + + transformed = insert(transformed) + + def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]: raw = json.loads(data.raw_data) soup = BeautifulSoup(raw['body'], features = 'html.parser') @@ -37,15 +64,17 @@ class BitchuteTransformer(Transformer): transformed = Post( raw_id=data.id, + platform_id=raw['id'], scraper=data.scraper, transformer=self.__version__, platform=data.platform, channel=data.channel, date=data.date, date_archived=data.date_archived, + date_transformed=datetime.now(timezone.utc), url=raw['url'], 
content=content, author_id=raw['author_id'], author_username=raw['author']) - return transformed + transformed = insert(transformed) \ No newline at end of file diff --git a/tests/transformer/bitchute.py b/tests/transformer/bitchute.py new file mode 100644 index 0000000..161d3e5 --- /dev/null +++ b/tests/transformer/bitchute.py @@ -0,0 +1,34 @@ +from sqlalchemy.orm import sessionmaker +import json + +import pytest + +from cisticola.base import Channel +from cisticola.scraper import BitchuteScraper +from cisticola.transformer import BitchuteTransformer +from cisticola.base import Post, Media + +@pytest.mark.media +def test_scrape_etl_bitchute(engine, controller, etl_controller, channel_kwargs): + controller.reset_db() + + channels = [Channel(**channel_kwargs['bitchute'])] + controller.register_scraper(scraper = BitchuteScraper()) + controller.scrape_channels(channels = channels, archive_media = True) + + etl_controller.register_transformer(BitchuteTransformer()) + etl_controller.transform_all_untransformed() + etl_controller.transform_all_untransformed_info() + + sessionfactory = sessionmaker() + sessionfactory.configure(bind=engine) + session = sessionfactory() + + posts = session.query(Post).all() + media = session.query(Media).all() + + assert len(posts) == 5 + # assert len(media) == 0 + + assert 'Pendant are something that the advanced ladies can fuse in her every day look' in posts[0].content + # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728" \ No newline at end of file