updated Bitchute transformer and added test

This commit is contained in:
Tristan Lee
2022-05-19 18:13:50 -05:00
parent f0414a4f4d
commit e2094522c9
3 changed files with 84 additions and 10 deletions

View File

@@ -5,6 +5,7 @@ from html.parser import HTMLParser
import dateparser
import json
from typing import Generator
from dateutil.relativedelta import relativedelta
import requests
from bs4 import BeautifulSoup
@@ -70,7 +71,7 @@ class BitchuteScraper(Scraper):
if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
return True
@logger.catch
@logger.catch(reraise = True)
def get_profile(self, channel: Channel) -> RawChannelInfo:
base_url = channel.url
@@ -104,7 +105,7 @@ class BitchuteScraper(Scraper):
profile = {
'description' : description_soup.text.strip(),
'description_links' : [a['href'] for a in description_soup.find_all('a', href = True)],
'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')),
'created': parse_created(re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. '))),
'videos' : int(info_list[1].text.split('videos')[0].strip()),
'owner_url' : soup.find('p', {'class' : 'owner'}).find('a', href = True)['href'],
'owner_name' : owner_name,
@@ -116,7 +117,7 @@ class BitchuteScraper(Scraper):
return RawChannelInfo(scraper=self.__version__,
platform=channel.platform,
channel=channel.id,
raw_data=json.dumps(profile),
raw_data=json.dumps(profile, default = str),
date_archived=datetime.now(timezone.utc))
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
@@ -484,4 +485,14 @@ def decode_cfemail(cfemail):
return email
#---------------------------------------------------------------------------#
#---------------------------------------------------------------------------#
def parse_created(created):
    """Convert a relative "created" string into an approximate datetime.

    ``created`` is expected to be a comma-separated list of
    ``<number> <unit>`` pairs, optionally followed by the word "ago",
    e.g. ``"3 years, 5 months ago"``. Every pair becomes a relative offset
    that is subtracted from the current time.

    Returns:
        A naive ``datetime`` (``datetime.now()`` minus the parsed offsets).

    Raises:
        ValueError: if a pair is not exactly ``<number> <unit>`` or the
            number is not an integer.
        TypeError: if ``relativedelta`` does not accept the unit.
    """
    # relativedelta treats singular keywords (hour=1) as absolute field
    # replacements and plural keywords (hours=1) as relative offsets, so every
    # singular unit has to be pluralised, not only the calendar ones.
    singular_units = ('year', 'month', 'week', 'day', 'hour', 'minute', 'second')

    offsets = {}
    for chunk in created.split('ago')[0].split(','):
        # split() without a separator tolerates repeated and non-breaking
        # whitespace between the number and the unit.
        fields = chunk.split()
        if len(fields) != 2:
            raise ValueError(
                'cannot parse period {!r} in {!r}'.format(chunk, created))
        number, unit = fields
        unit = unit.lower()
        if unit in singular_units:
            unit += 's'
        offsets[unit] = int(number)

    return datetime.now() - relativedelta(**offsets)

View File

@@ -1,11 +1,13 @@
import json
from loguru import logger
from typing import Generator
from typing import Generator, Union, Callable
from datetime import datetime, timezone
import dateutil.parser
from bs4 import BeautifulSoup
from cisticola.transformer.base import Transformer
from cisticola.base import ScraperResult, Post, Image, Video, Media
from cisticola.base import RawChannelInfo, ScraperResult, Post, Image, Video, Media, Channel, ChannelInfo
class BitchuteTransformer(Transformer):
"""A Bitchute specific ScraperResult, with a method ETL/transforming"""
@@ -19,7 +21,7 @@ class BitchuteTransformer(Transformer):
return False
def transform_media(self, data: ScraperResult, transformed: Post) -> Generator[Media, None, None]:
def transform_media(self, data: ScraperResult, insert: Callable, transformed: Post) -> Generator[Media, None, None]:
raw = json.loads(data.raw_data)
orig = raw['video_url']
@@ -27,9 +29,34 @@ class BitchuteTransformer(Transformer):
m = Video(url=new, post=transformed.id, raw_id=data.id, original_url=orig)
yield m
insert(m)
def transform(self, data: ScraperResult) -> Post:
def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
    """Build a ChannelInfo record from raw Bitchute profile data and hand it to ``insert``.

    ``data.raw_data`` is decoded as JSON; the keys read from it are
    ``owner_url``, ``owner_name``, ``description``, ``subscribers`` and
    ``created``. ``session`` is accepted but not used here.
    """
    profile = json.loads(data.raw_data)

    # Collect the constructor arguments first; the entries are evaluated in
    # the same order as the keyword arguments they replace.
    fields = {
        'raw_channel_info_id': data.id,
        'channel': data.channel,
        # last path segment of the owner URL
        'platform_id': profile['owner_url'].strip('/').split('/')[-1],
        'platform': data.platform,
        'scraper': data.scraper,
        'transformer': self.__version__,
        'screenname': profile['owner_name'],
        'name': profile['owner_name'],
        'description': profile['description'],
        'description_url': '',  # does not exist for Bitchute
        'description_location': '',  # does not exist for Bitchute
        'followers': profile['subscribers'],
        'following': -1,  # does not exist for Bitchute
        'verified': False,  # does not exist for Bitchute
        'date_created': dateutil.parser.parse(profile['created']),
        'date_archived': data.date_archived,
        'date_transformed': datetime.now(timezone.utc),
    }

    insert(ChannelInfo(**fields))
def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
soup = BeautifulSoup(raw['body'], features = 'html.parser')
@@ -37,15 +64,17 @@ class BitchuteTransformer(Transformer):
transformed = Post(
raw_id=data.id,
platform_id=raw['id'],
scraper=data.scraper,
transformer=self.__version__,
platform=data.platform,
channel=data.channel,
date=data.date,
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc),
url=raw['url'],
content=content,
author_id=raw['author_id'],
author_username=raw['author'])
return transformed
transformed = insert(transformed)

View File

@@ -0,0 +1,34 @@
from sqlalchemy.orm import sessionmaker
import json
import pytest
from cisticola.base import Channel
from cisticola.scraper import BitchuteScraper
from cisticola.transformer import BitchuteTransformer
from cisticola.base import Post, Media
@pytest.mark.media
def test_scrape_etl_bitchute(engine, controller, etl_controller, channel_kwargs):
    """Scrape a Bitchute channel, run the transformers, and check the stored posts."""
    controller.reset_db()

    # Scrape stage
    bitchute_channel = Channel(**channel_kwargs['bitchute'])
    controller.register_scraper(scraper=BitchuteScraper())
    controller.scrape_channels(channels=[bitchute_channel], archive_media=True)

    # Transform stage: posts first, then channel info
    etl_controller.register_transformer(BitchuteTransformer())
    etl_controller.transform_all_untransformed()
    etl_controller.transform_all_untransformed_info()

    # Read the results back through a fresh session bound to the test engine
    session = sessionmaker(bind=engine)()
    posts = session.query(Post).all()
    media = session.query(Media).all()  # queried, but its assertions are disabled

    assert len(posts) == 5
    # assert len(media) == 0
    assert 'Pendant are something that the advanced ladies can fuse in her every day look' in posts[0].content
    # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"