mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-12 05:18:33 +03:00
updated Bitchute transformer and added test
This commit is contained in:
@@ -5,6 +5,7 @@ from html.parser import HTMLParser
|
||||
import dateparser
|
||||
import json
|
||||
from typing import Generator
|
||||
from dateutil.relativedelta import relativedelta
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
@@ -70,7 +71,7 @@ class BitchuteScraper(Scraper):
|
||||
if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
|
||||
return True
|
||||
|
||||
@logger.catch
|
||||
@logger.catch(reraise = True)
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||
|
||||
base_url = channel.url
|
||||
@@ -104,7 +105,7 @@ class BitchuteScraper(Scraper):
|
||||
profile = {
|
||||
'description' : description_soup.text.strip(),
|
||||
'description_links' : [a['href'] for a in description_soup.find_all('a', href = True)],
|
||||
'created': re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. ')),
|
||||
'created': parse_created(re.sub(r'\s', ' ', info_list[0].text.split('Created')[1].strip('. '))),
|
||||
'videos' : int(info_list[1].text.split('videos')[0].strip()),
|
||||
'owner_url' : soup.find('p', {'class' : 'owner'}).find('a', href = True)['href'],
|
||||
'owner_name' : owner_name,
|
||||
@@ -116,7 +117,7 @@ class BitchuteScraper(Scraper):
|
||||
return RawChannelInfo(scraper=self.__version__,
|
||||
platform=channel.platform,
|
||||
channel=channel.id,
|
||||
raw_data=json.dumps(profile),
|
||||
raw_data=json.dumps(profile, default = str),
|
||||
date_archived=datetime.now(timezone.utc))
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
@@ -484,4 +485,14 @@ def decode_cfemail(cfemail):
|
||||
|
||||
return email
|
||||
|
||||
#---------------------------------------------------------------------------#
|
||||
#---------------------------------------------------------------------------#
|
||||
|
||||
def parse_created(created):
    """Convert a relative "created" string into an approximate datetime.

    The input is expected to look like ``"1 year, 2 months ago"``: a
    comma-separated list of ``<number> <unit>`` pairs, optionally followed by
    the word ``ago``. Everything from the first ``ago`` onwards is ignored.

    Parameters
    ----------
    created: str
        Relative age text, e.g. ``"3 weeks, 2 days ago"``.

    Returns
    -------
    datetime
        The current UTC time minus the described period (timezone-aware,
        matching the ``datetime.now(timezone.utc)`` usage elsewhere in this
        module).

    Raises
    ------
    ValueError
        If a non-empty segment is not exactly ``<integer> <unit>``.
    TypeError
        If a unit is not a keyword accepted by ``relativedelta``.
    """

    # relativedelta interprets singular keywords (``year=2``) as ABSOLUTE
    # values and plural ones (``years=2``) as relative offsets, so every unit
    # has to be passed in its plural form.
    plural_units = {
        'year': 'years',
        'month': 'months',
        'week': 'weeks',
        'day': 'days',
        'hour': 'hours',
        'minute': 'minutes',
        'second': 'seconds',
    }

    elapsed = created.split('ago')[0]

    kwargs = {}
    for segment in elapsed.split(','):
        # split() without arguments collapses runs of whitespace, so doubled
        # spaces left behind by upstream whitespace normalisation are harmless.
        tokens = segment.split()
        if not tokens:
            # Tolerate empty segments (e.g. a trailing comma).
            continue
        number, unit = tokens
        kwargs[plural_units.get(unit, unit)] = int(number)

    return datetime.now(timezone.utc) - relativedelta(**kwargs)
|
||||
@@ -1,11 +1,13 @@
|
||||
import json
|
||||
from loguru import logger
|
||||
from typing import Generator
|
||||
from typing import Generator, Union, Callable
|
||||
from datetime import datetime, timezone
|
||||
import dateutil.parser
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from cisticola.transformer.base import Transformer
|
||||
from cisticola.base import ScraperResult, Post, Image, Video, Media
|
||||
from cisticola.base import RawChannelInfo, ScraperResult, Post, Image, Video, Media, Channel, ChannelInfo
|
||||
|
||||
class BitchuteTransformer(Transformer):
|
||||
"""A Bitchute specific ScraperResult, with a method ETL/transforming"""
|
||||
@@ -19,7 +21,7 @@ class BitchuteTransformer(Transformer):
|
||||
|
||||
return False
|
||||
|
||||
def transform_media(self, data: ScraperResult, transformed: Post) -> Generator[Media, None, None]:
|
||||
def transform_media(self, data: ScraperResult, insert: Callable, transformed: Post) -> Generator[Media, None, None]:
|
||||
raw = json.loads(data.raw_data)
|
||||
|
||||
orig = raw['video_url']
|
||||
@@ -27,9 +29,34 @@ class BitchuteTransformer(Transformer):
|
||||
|
||||
m = Video(url=new, post=transformed.id, raw_id=data.id, original_url=orig)
|
||||
|
||||
yield m
|
||||
insert(m)
|
||||
|
||||
def transform(self, data: ScraperResult) -> Post:
|
||||
def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
    """Turn a raw Bitchute channel profile into a ``ChannelInfo`` and insert it.

    Parameters
    ----------
    data: RawChannelInfo
        Raw profile record; ``data.raw_data`` is a JSON string holding the
        ``owner_url``, ``owner_name``, ``description``, ``subscribers`` and
        ``created`` keys read below.
    insert: Callable
        Called once with the constructed ``ChannelInfo``.
    session
        Not used by this method.

    Nothing is yielded or returned; the result is only handed to ``insert``.
    """
    profile = json.loads(data.raw_data)

    # The last path component of the owner URL serves as the platform ID.
    owner_id = profile['owner_url'].strip('/').split('/')[-1]
    owner_name = profile['owner_name']

    channel_info = ChannelInfo(
        raw_channel_info_id=data.id,
        channel=data.channel,
        platform_id=owner_id,
        platform=data.platform,
        scraper=data.scraper,
        transformer=self.__version__,
        screenname=owner_name,
        name=owner_name,
        description=profile['description'],
        description_url='',  # does not exist for Bitchute
        description_location='',  # does not exist for Bitchute
        followers=profile['subscribers'],
        following=-1,  # does not exist for Bitchute
        verified=False,  # does not exist for Bitchute
        date_created=dateutil.parser.parse(profile['created']),
        date_archived=data.date_archived,
        date_transformed=datetime.now(timezone.utc),
    )

    insert(channel_info)
|
||||
|
||||
def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||
raw = json.loads(data.raw_data)
|
||||
|
||||
soup = BeautifulSoup(raw['body'], features = 'html.parser')
|
||||
@@ -37,15 +64,17 @@ class BitchuteTransformer(Transformer):
|
||||
|
||||
transformed = Post(
|
||||
raw_id=data.id,
|
||||
platform_id=raw['id'],
|
||||
scraper=data.scraper,
|
||||
transformer=self.__version__,
|
||||
platform=data.platform,
|
||||
channel=data.channel,
|
||||
date=data.date,
|
||||
date_archived=data.date_archived,
|
||||
date_transformed=datetime.now(timezone.utc),
|
||||
url=raw['url'],
|
||||
content=content,
|
||||
author_id=raw['author_id'],
|
||||
author_username=raw['author'])
|
||||
|
||||
return transformed
|
||||
transformed = insert(transformed)
|
||||
34
tests/transformer/bitchute.py
Normal file
34
tests/transformer/bitchute.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import BitchuteScraper
|
||||
from cisticola.transformer import BitchuteTransformer
|
||||
from cisticola.base import Post, Media
|
||||
|
||||
@pytest.mark.media
def test_scrape_etl_bitchute(engine, controller, etl_controller, channel_kwargs):
    """Scrape the Bitchute test channel, run the ETL step and check stored posts."""
    controller.reset_db()

    # Scrape: a single Bitchute channel, with media archiving enabled.
    bitchute_channel = Channel(**channel_kwargs['bitchute'])
    controller.register_scraper(scraper=BitchuteScraper())
    controller.scrape_channels(channels=[bitchute_channel], archive_media=True)

    # Transform: both the scraped posts and the scraped channel info.
    etl_controller.register_transformer(BitchuteTransformer())
    etl_controller.transform_all_untransformed()
    etl_controller.transform_all_untransformed_info()

    # Read the transformed rows back through a fresh session on the same engine.
    session = sessionmaker(bind=engine)()
    posts = session.query(Post).all()
    media = session.query(Media).all()

    assert len(posts) == 5
    # assert len(media) == 0

    assert 'Pendant are something that the advanced ladies can fuse in her every day look' in posts[0].content
    # assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"
|
||||
Reference in New Issue
Block a user