mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
updated transformer tests
This commit is contained in:
1
Pipfile
1
Pipfile
@@ -9,6 +9,7 @@ loguru = "*"
|
||||
gogettr = "*"
|
||||
requests = "*"
|
||||
bs4 = "*"
|
||||
lxml = "*"
|
||||
dateparser = "*"
|
||||
boto3 = "*"
|
||||
ffmpeg-python = "*"
|
||||
|
||||
1336
Pipfile.lock
generated
1336
Pipfile.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -421,6 +421,9 @@ class ScraperController:
|
||||
session.commit()
|
||||
added += 1
|
||||
|
||||
profile = scraper.get_profile(channel)
|
||||
session.add(profile)
|
||||
|
||||
session.commit()
|
||||
logger.info(
|
||||
f"{scraper} found {added} new posts from {channel}")
|
||||
|
||||
@@ -61,7 +61,7 @@ class TelegramTelethonTransformer(Transformer):
|
||||
self.bad_channels[orig_screenname] = True
|
||||
return ""
|
||||
|
||||
soup = BeautifulSoup(r.content)
|
||||
soup = BeautifulSoup(r.content, features = 'lxml')
|
||||
post = soup.findAll("div", {"data-post" : orig_screenname + "/" + str(id)})
|
||||
name = ""
|
||||
|
||||
@@ -181,14 +181,14 @@ class TelegramTelethonTransformer(Transformer):
|
||||
|
||||
transformed = insert(transformed)
|
||||
|
||||
# for k in data.archived_urls:
|
||||
# if data.archived_urls[k]:
|
||||
# archived_url = data.archived_urls[k]
|
||||
# ext = archived_url.split('.')[-1]
|
||||
for k in data.archived_urls:
|
||||
if data.archived_urls[k]:
|
||||
archived_url = data.archived_urls[k]
|
||||
ext = archived_url.split('.')[-1]
|
||||
|
||||
# if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv':
|
||||
# insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
|
||||
# else:
|
||||
# insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
|
||||
if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv':
|
||||
insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
|
||||
else:
|
||||
insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
|
||||
|
||||
|
||||
@@ -2,9 +2,10 @@ import json
|
||||
from loguru import logger
|
||||
from typing import Generator, Union, Callable
|
||||
import dateutil.parser
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from cisticola.transformer.base import Transformer
|
||||
from cisticola.base import ScraperResult, Post, Image, Video, Media, Channel
|
||||
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Image, Video, Media, Channel
|
||||
|
||||
class TwitterTransformer(Transformer):
|
||||
"""A Twitter specific ScraperResult, with a method ETL/transforming"""
|
||||
@@ -45,8 +46,33 @@ class TwitterTransformer(Transformer):
|
||||
|
||||
yield m
|
||||
|
||||
def transform_info(self, data: RawChannelInfo, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||
raw = json.loads(data.raw_data)
|
||||
|
||||
def transform(self, data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||
transformed = ChannelInfo(
|
||||
raw_channel_info_id=data.id,
|
||||
channel=data.channel,
|
||||
platform_id=raw['id'],
|
||||
platform=data.platform,
|
||||
scraper=data.scraper,
|
||||
transformer=self.__version__,
|
||||
screenname=raw['username'],
|
||||
name=raw['displayname'],
|
||||
description=raw['rawDescription'],
|
||||
description_url=raw['linkUrl'],
|
||||
description_location=raw['location'],
|
||||
followers=raw['followersCount'],
|
||||
following=raw['friendsCount'],
|
||||
verified=raw['verified'],
|
||||
date_created=dateutil.parser.parse(raw['created']),
|
||||
date_archived=data.date_archived,
|
||||
date_transformed=datetime.now(timezone.utc)
|
||||
)
|
||||
|
||||
transformed = insert(transformed)
|
||||
|
||||
|
||||
def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||
raw = json.loads(data.raw_data)
|
||||
|
||||
transformed = Post(
|
||||
@@ -58,6 +84,7 @@ class TwitterTransformer(Transformer):
|
||||
channel=data.channel,
|
||||
date=dateutil.parser.parse(raw['date']),
|
||||
date_archived=data.date_archived,
|
||||
date_transformed=datetime.now(timezone.utc),
|
||||
url=raw['url'],
|
||||
content=raw['content'],
|
||||
author_id=raw['user']['id'],
|
||||
@@ -85,6 +112,7 @@ class TwitterTransformer(Transformer):
|
||||
channel=channel.id,
|
||||
date=dateutil.parser.parse(tweet['date']),
|
||||
date_archived=data.date_archived,
|
||||
date_transformed=datetime.now(timezone.utc),
|
||||
url=tweet['url'],
|
||||
content=tweet['content'],
|
||||
author_id=tweet['user']['id'],
|
||||
@@ -109,7 +137,4 @@ class TwitterTransformer(Transformer):
|
||||
|
||||
media = self.process_media(raw, transformed.id, data)
|
||||
for m in media:
|
||||
insert(m)
|
||||
|
||||
|
||||
|
||||
insert(m)
|
||||
34
tests/transformer/telegram_telethon.py
Normal file
34
tests/transformer/telegram_telethon.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from sqlalchemy.orm import sessionmaker, with_polymorphic
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import TelegramTelethonScraper
|
||||
from cisticola.transformer import TelegramTelethonTransformer
|
||||
from cisticola.base import Post, Media
|
||||
|
||||
@pytest.mark.media
|
||||
def test_scrape_etl_telegram_telethon(engine, controller, etl_controller, channel_kwargs):
|
||||
controller.reset_db()
|
||||
|
||||
channels = [Channel(**channel_kwargs['telegram'])]
|
||||
controller.register_scraper(scraper = TelegramTelethonScraper())
|
||||
controller.scrape_channels(channels = channels, archive_media = True)
|
||||
|
||||
etl_controller.register_transformer(TelegramTelethonTransformer())
|
||||
etl_controller.transform_all_untransformed()
|
||||
etl_controller.transform_all_untransformed_info()
|
||||
|
||||
sessionfactory = sessionmaker()
|
||||
sessionfactory.configure(bind=engine)
|
||||
session = sessionfactory()
|
||||
|
||||
posts = session.query(Post).all()
|
||||
media = session.query(Media).all()
|
||||
|
||||
assert len(posts) == 19
|
||||
assert len(media) == 13
|
||||
|
||||
assert posts[16].content == "Taking pre-orders now"
|
||||
assert json.loads(media[0].exif)['Composite:ImageSize'] == "1028 1280"
|
||||
@@ -1,4 +1,4 @@
|
||||
from sqlalchemy.orm import sessionmaker, with_polymorphic
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
import json
|
||||
|
||||
import pytest
|
||||
@@ -18,6 +18,7 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
|
||||
|
||||
etl_controller.register_transformer(TwitterTransformer())
|
||||
etl_controller.transform_all_untransformed()
|
||||
etl_controller.transform_all_untransformed_info()
|
||||
|
||||
sessionfactory = sessionmaker()
|
||||
sessionfactory.configure(bind=engine)
|
||||
@@ -26,8 +27,8 @@ def test_scrape_etl_twitter(engine, controller, etl_controller, channel_kwargs):
|
||||
posts = session.query(Post).all()
|
||||
media = session.query(Media).all()
|
||||
|
||||
assert len(posts) == 10
|
||||
assert len(media) == 7
|
||||
assert len(posts) == 12
|
||||
assert len(media) == 4
|
||||
|
||||
assert posts[-1].content == "BARN"
|
||||
assert json.loads(media[-1].exif)['Composite:ImageSize'] == "826 728"
|
||||
assert posts[2].content == "BARN"
|
||||
assert json.loads(media[0].exif)['Composite:ImageSize'] == "826 728"
|
||||
Reference in New Issue
Block a user