updated transformer tests

This commit is contained in:
Tristan Lee
2022-05-19 16:34:19 -05:00
parent 424c063ef2
commit f0414a4f4d
7 changed files with 187 additions and 1253 deletions

View File

@@ -9,6 +9,7 @@ loguru = "*"
gogettr = "*"
requests = "*"
bs4 = "*"
lxml = "*"
dateparser = "*"
boto3 = "*"
ffmpeg-python = "*"

1336
Pipfile.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -421,6 +421,9 @@ class ScraperController:
session.commit()
added += 1
profile = scraper.get_profile(channel)
session.add(profile)
session.commit()
logger.info(
f"{scraper} found {added} new posts from {channel}")

View File

@@ -61,7 +61,7 @@ class TelegramTelethonTransformer(Transformer):
self.bad_channels[orig_screenname] = True
return ""
soup = BeautifulSoup(r.content)
soup = BeautifulSoup(r.content, features = 'lxml')
post = soup.findAll("div", {"data-post" : orig_screenname + "/" + str(id)})
name = ""
@@ -181,14 +181,14 @@ class TelegramTelethonTransformer(Transformer):
transformed = insert(transformed)
# for k in data.archived_urls:
# if data.archived_urls[k]:
# archived_url = data.archived_urls[k]
# ext = archived_url.split('.')[-1]
for k in data.archived_urls:
if data.archived_urls[k]:
archived_url = data.archived_urls[k]
ext = archived_url.split('.')[-1]
# if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv':
# insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
# else:
# insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv':
insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
else:
insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))

View File

@@ -2,9 +2,10 @@ import json
from loguru import logger
from typing import Generator, Union, Callable
import dateutil.parser
from datetime import datetime, timezone
from cisticola.transformer.base import Transformer
from cisticola.base import ScraperResult, Post, Image, Video, Media, Channel
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
class TwitterTransformer(Transformer):
"""A Twitter specific ScraperResult, with a method ETL/transforming"""
@@ -45,8 +46,33 @@ class TwitterTransformer(Transformer):
yield m
def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
def transform(self, data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]:
transformed = ChannelInfo(
raw_channel_info_id=data.id,
channel=data.channel,
platform_id=raw['id'],
platform=data.platform,
scraper=data.scraper,
transformer=self.__version__,
screenname=raw['username'],
name=raw['displayname'],
description=raw['rawDescription'],
description_url=raw['linkUrl'],
description_location=raw['location'],
followers=raw['followersCount'],
following=raw['friendsCount'],
verified=raw['verified'],
date_created=dateutil.parser.parse(raw['created']),
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc)
)
transformed = insert(transformed)
def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)
transformed = Post(
@@ -58,6 +84,7 @@ class TwitterTransformer(Transformer):
channel=data.channel,
date=dateutil.parser.parse(raw['date']),
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc),
url=raw['url'],
content=raw['content'],
author_id=raw['user']['id'],
@@ -85,6 +112,7 @@ class TwitterTransformer(Transformer):
channel=channel.id,
date=dateutil.parser.parse(tweet['date']),
date_archived=data.date_archived,
date_transformed=datetime.now(timezone.utc),
url=tweet['url'],
content=tweet['content'],
author_id=tweet['user']['id'],
@@ -109,7 +137,4 @@ class TwitterTransformer(Transformer):
media = self.process_media(raw, transformed.id, data)
for m in media:
insert(m)
insert(m)

View File

@@ -0,0 +1,34 @@
from sqlalchemy.orm import sessionmaker, with_polymorphic
import json
import pytest
from cisticola.base import Channel
from cisticola.scraper import TelegramTelethonScraper
from cisticola.transformer import TelegramTelethonTransformer
from cisticola.base import Post, Media
@pytest.mark.media
def test_scrape_etl_telegram_telethon(engine, controller, etl_controller, channel_kwargs):
    """Scrape a Telegram channel, run the ETL step, and check the stored rows.

    The test resets the database, scrapes the channel described by
    ``channel_kwargs['telegram']`` with media archiving switched on, runs the
    Telethon transformer over both the untransformed results and the
    untransformed channel info, and finally asserts on what ended up in the
    ``Post`` and ``Media`` tables.

    All four parameters are pytest fixtures supplied from outside this file.
    NOTE(review): the expected counts and contents presumably reflect the
    state of the scraped channel when the test was written -- confirm they
    are still valid if this starts failing.
    """
    controller.reset_db()

    # Scrape stage: a single Telegram channel, with media archived.
    telegram_channel = Channel(**channel_kwargs['telegram'])
    controller.register_scraper(scraper=TelegramTelethonScraper())
    controller.scrape_channels(channels=[telegram_channel], archive_media=True)

    # Transform stage: posts/media first, then channel info.
    etl_controller.register_transformer(TelegramTelethonTransformer())
    etl_controller.transform_all_untransformed()
    etl_controller.transform_all_untransformed_info()

    # Open a session bound to the test engine to inspect the results.
    session = sessionmaker(bind=engine)()

    stored_posts = session.query(Post).all()
    stored_media = session.query(Media).all()

    assert len(stored_posts) == 19
    assert len(stored_media) == 13

    # Spot-check one post body and the EXIF metadata of the first media row.
    assert stored_posts[16].content == "Taking pre-orders now"
    first_exif = json.loads(stored_media[0].exif)
    assert first_exif['Composite:ImageSize'] == "1028 1280"

View File

@@ -1,4 +1,4 @@
from sqlalchemy.orm import sessionmaker, with_polymorphic
from sqlalchemy.orm import sessionmaker
import json
import pytest
@@ -18,6 +18,7 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
etl_controller.register_transformer(TwitterTransformer())
etl_controller.transform_all_untransformed()
etl_controller.transform_all_untransformed_info()
sessionfactory = sessionmaker()
sessionfactory.configure(bind=engine)
@@ -26,8 +27,8 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
posts = session.query(Post).all()
media = session.query(Media).all()
assert len(posts) == 10
assert len(media) == 7
assert len(posts) == 12
assert len(media) == 4
assert posts[-1].content == "BARN"
assert json.loads(media[-1].exif)['Composite:ImageSize'] == "826 728"
assert posts[2].content == "BARN"
assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"