Add some more fields to media DB, fix bugs in testing

This commit is contained in:
Logan Williams
2022-07-05 11:11:43 +02:00
parent 4ddd8d6b63
commit 6149c4279d
10 changed files with 558 additions and 390 deletions

View File

@@ -29,6 +29,7 @@ ocrd-pyexiftool = "*"
gabber = {git = "https://github.com/stanfordio/gabber.git"}
snscrape = {git = "https://github.com/bellingcat/snscrape"}
polyphemus = {git = "https://github.com/bellingcat/polyphemus"}
filelock = "*"
[dev-packages]
pytest = "*"

862
Pipfile.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -326,6 +326,24 @@ class Media:
#: Original URL of the media from the original post.
original_url: str
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
scraper: str
#: String specifying name and version of transformer used to transform result, e.g. ``"TwitterTransformer 0.0.1"``.
transformer: str
#: Name of platform from which result was scraped, e.g. ``"Twitter"``.
platform: str
#: Datetime (relative to UTC) that the scraped post was created at.
date: datetime
#: Datetime (relative to UTC) that the scraped post was archived at.
date_archived: datetime
#: Datetime (UTC) that the scraped post was transformed at.
date_transformed: datetime
#: JSON dump of the dict containing metadata information for the media file.
exif: str = None
@@ -497,7 +515,13 @@ media_table = Table('media', mapper_registry.metadata,
Column('url', String),
Column('original_url', String),
Column('exif', String),
Column('ocr', String))
Column('ocr', String),
Column('date', DateTime, index=True),
Column('date_archived', DateTime, index=True),
Column('date_transformed', DateTime, index=True),
Column('scraper', String),
Column('transformer', String)
)
mapper_registry.map_imperatively(Post, post_table)
mapper_registry.map_imperatively(Channel, channel_table)

View File

@@ -421,6 +421,9 @@ class ScraperController:
session.commit()
added += 1
if added > 100:
break
session.commit()
logger.info(
f"{scraper} found {added} new posts from {channel}")

View File

@@ -4,9 +4,9 @@ from sqlalchemy.orm import sessionmaker, make_transient
from sqlalchemy.engine.base import Engine
from sqlalchemy.sql.expression import func
from collections import defaultdict
from datetime import datetime
from datetime import datetime, timezone
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Media, Channel, mapper_registry
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Media, Channel, mapper_registry, Image, Video, Audio
class Transformer:
@@ -50,6 +50,24 @@ class Transformer:
pass
def transform_media(self, data: ScraperResult, transformed: Post, insert: Callable):
    """Create a media object for each archived URL of ``data`` and pass it to ``insert``.

    The media class is picked from the file extension of the archived URL
    (the text after the last ``.`` in the last path segment, lower-cased):
    ``Video``, ``Audio`` or ``Image`` for the extensions listed below, and the
    generic ``Media`` for anything else, including a missing extension. The
    fallback case logs a warning first.

    Parameters
    ----------
    data: ScraperResult
        Scraped result whose ``archived_urls`` maps each original URL to its
        archived URL. Entries with a falsy archived URL are skipped.
    transformed: Post
        Post the media belongs to; its ``id`` is stored on each media object.
    insert: Callable
        Called once with every media object that is created.
    """
    video_extensions = {'mp4', 'mov', 'avi', 'mkv'}
    audio_extensions = {'oga', 'mp3', 'wav', 'aif', 'aiff', 'aac'}
    image_extensions = {'jpg', 'jpeg', 'png', 'gif', 'bmp', 'heic', 'tiff'}

    for original_url, archived_url in data.archived_urls.items():
        # Nothing was archived for this original URL, so there is no media to record.
        if not archived_url:
            continue

        filename = archived_url.split('/')[-1]
        ext = filename.split('.')[-1].lower() if '.' in filename else None

        if ext in video_extensions:
            media_class = Video
        elif ext in audio_extensions:
            media_class = Audio
        elif ext in image_extensions:
            media_class = Image
        else:
            # Unrecognised (or absent) extension: keep the file, but as generic Media.
            logger.warning(f"Unknown file extension {ext}")
            media_class = Media

        # Every media type takes the same fields; only the class differs.
        insert(media_class(
            url=archived_url,
            post=transformed.id,
            raw_id=data.id,
            original_url=original_url,
            date=data.date,
            date_archived=data.date_archived,
            date_transformed=datetime.now(timezone.utc),
            transformer=self.__version__,
            scraper=data.scraper,
            platform=data.platform,
        ))
class ETLController:
"""An ETLController will transform raw scraped data (ScrapedResult objects) into a more detailed format
@@ -130,7 +148,8 @@ class ETLController:
logger.info(f"Found matching DB entry for {obj}: {instance}")
return instance
if hydrate:
# Don't hydrate videos, because they can be quite large and this is time consuming
if hydrate and type(obj) != Video:
obj.hydrate()
session.add(obj)
@@ -294,13 +313,13 @@ class ETLController:
logger.trace(f"{transformer} is handling result {result.id} ({result.date})")
handled = True
transformer.transform_media(result, total_result.Post, lambda obj: self.insert_or_select(obj, session, hydrate), session)
transformer.transform_media(result, total_result.Post, lambda obj: self.insert_or_select(obj, session, hydrate))
session.commit()
break
if handled == False:
logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})")
if handled == False:
logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})")
@logger.catch(reraise=True)
def transform_all_untransformed_media(self, hydrate=True):

View File

@@ -21,13 +21,13 @@ class BitchuteTransformer(Transformer):
return False
def transform_media(self, data: ScraperResult, insert: Callable, transformed: Post) -> Generator[Media, None, None]:
def transform_media(self, data: ScraperResult, transformed: Post, insert: Callable) -> Generator[Media, None, None]:
raw = json.loads(data.raw_data)
orig = raw['video_url']
new = data.archived_urls[orig]
m = Video(url=new, post=transformed.id, raw_id=data.id, original_url=orig)
m = Video(url=new, post=transformed.id, raw_id=data.id, original_url=orig, date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), transformer=self.__version__, scraper=data.scraper, platform=data.platform)
insert(m)

View File

@@ -49,7 +49,7 @@ class GettrTransformer(Transformer):
def _get_channel_id(self, username: str, category: str, insert: Callable, session):
channel = session.query(Channel).filter(func.lower(Channel.screenname)==func.lower(username), platform = 'Gettr').first()
channel = session.query(Channel).where((func.lower(Channel.screenname)==func.lower(username)) & (Channel.platform == 'Gettr')).first()
if channel is None:
try:

View File

@@ -90,7 +90,7 @@ def _process_number(s):
if s is None:
return None
else:
s = s.replace(' ', '')
s = s.replace(' ', '').replace(',','')
if s.endswith('M'):
return int(float(s[:-1]) * 1e6)
elif s.endswith('K'):

View File

@@ -121,23 +121,6 @@ class TelegramTelethonTransformer(Transformer):
transformed = insert(transformed)
def transform_media(self, data: ScraperResult, transformed: Post, insert: Callable, session):
for k in data.archived_urls:
if data.archived_urls[k]:
archived_url = data.archived_urls[k]
filename = archived_url.split('/')[-1]
ext = None if '.' not in filename else filename.split('.')[-1].lower()
if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv':
insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
elif ext == 'oga' or ext == 'mp3' or ext == "wav" or ext == 'aif' or ext == 'aiff' or ext == 'aac':
insert(Audio(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
elif ext == 'jpg' or ext == 'jpeg' or ext == 'png' or ext == 'gif' or ext == 'bmp' or ext == 'heic' or ext == 'tiff':
insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
else:
logger.warning(f"Unknown file extension {ext}")
insert(Media(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
raw = json.loads(data.raw_data)

0
spacy_setup.sh Normal file → Executable file
View File