mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
Add some more fields to media DB, fix bugs in testing
This commit is contained in:
1
Pipfile
1
Pipfile
@@ -29,6 +29,7 @@ ocrd-pyexiftool = "*"
|
||||
gabber = {git = "https://github.com/stanfordio/gabber.git"}
|
||||
snscrape = {git = "https://github.com/bellingcat/snscrape"}
|
||||
polyphemus = {git = "https://github.com/bellingcat/polyphemus"}
|
||||
filelock = "*"
|
||||
|
||||
[dev-packages]
|
||||
pytest = "*"
|
||||
|
||||
862
Pipfile.lock
generated
862
Pipfile.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -326,6 +326,24 @@ class Media:
|
||||
#: Original URL of the media from the original post.
|
||||
original_url: str
|
||||
|
||||
#: String specifying name and version of scraper used to generate result, e.g. ``"TwitterScraper 0.0.1"``.
|
||||
scraper: str
|
||||
|
||||
#: String specifying name and version of transformer used to transform result, e.g. ``"TwitterTransformer 0.0.1"``.
|
||||
transformer: str
|
||||
|
||||
#: Name of platform from which result was scraped, e.g. ``"Twitter"``.
|
||||
platform: str
|
||||
|
||||
#: Datetime (relative to UTC) that the scraped post was created at.
|
||||
date: datetime
|
||||
|
||||
#: Datetime (relative to UTC) that the scraped post was archived at.
|
||||
date_archived: datetime
|
||||
|
||||
#: Datetime (UTC) that the scraped post was transformed at.
|
||||
date_transformed: datetime
|
||||
|
||||
#: JSON dump of the dict containing metadata information for the media file.
|
||||
exif: str = None
|
||||
|
||||
@@ -497,7 +515,13 @@ media_table = Table('media', mapper_registry.metadata,
|
||||
Column('url', String),
|
||||
Column('original_url', String),
|
||||
Column('exif', String),
|
||||
Column('ocr', String))
|
||||
Column('ocr', String),
|
||||
Column('date', DateTime, index=True),
|
||||
Column('date_archived', DateTime, index=True),
|
||||
Column('date_transformed', DateTime, index=True),
|
||||
Column('scraper', String),
|
||||
Column('transformer', String)
|
||||
)
|
||||
|
||||
mapper_registry.map_imperatively(Post, post_table)
|
||||
mapper_registry.map_imperatively(Channel, channel_table)
|
||||
|
||||
@@ -421,6 +421,9 @@ class ScraperController:
|
||||
session.commit()
|
||||
added += 1
|
||||
|
||||
if added > 100:
|
||||
break
|
||||
|
||||
session.commit()
|
||||
logger.info(
|
||||
f"{scraper} found {added} new posts from {channel}")
|
||||
|
||||
@@ -4,9 +4,9 @@ from sqlalchemy.orm import sessionmaker, make_transient
|
||||
from sqlalchemy.engine.base import Engine
|
||||
from sqlalchemy.sql.expression import func
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Media, Channel, mapper_registry
|
||||
from cisticola.base import RawChannelInfo, ChannelInfo, ScraperResult, Post, Media, Channel, mapper_registry, Image, Video, Audio
|
||||
|
||||
|
||||
class Transformer:
|
||||
@@ -50,6 +50,24 @@ class Transformer:
|
||||
|
||||
pass
|
||||
|
||||
def transform_media(self, data: ScraperResult, transformed: Post, insert: Callable):
|
||||
'''Transform media'''
|
||||
for k in data.archived_urls:
|
||||
if data.archived_urls[k]:
|
||||
archived_url = data.archived_urls[k]
|
||||
filename = archived_url.split('/')[-1]
|
||||
ext = None if '.' not in filename else filename.split('.')[-1].lower()
|
||||
|
||||
if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv':
|
||||
insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k, date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), transformer=self.__version__, scraper=data.scraper, platform=data.platform))
|
||||
elif ext == 'oga' or ext == 'mp3' or ext == "wav" or ext == 'aif' or ext == 'aiff' or ext == 'aac':
|
||||
insert(Audio(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k, date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), transformer=self.__version__, scraper=data.scraper, platform=data.platform))
|
||||
elif ext == 'jpg' or ext == 'jpeg' or ext == 'png' or ext == 'gif' or ext == 'bmp' or ext == 'heic' or ext == 'tiff':
|
||||
insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k, date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), transformer=self.__version__, scraper=data.scraper, platform=data.platform))
|
||||
else:
|
||||
logger.warning(f"Unknown file extension {ext}")
|
||||
insert(Media(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k, date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), transformer=self.__version__, scraper=data.scraper, platform=data.platform))
|
||||
|
||||
|
||||
class ETLController:
|
||||
"""An ETLController will transform raw scraped data (ScrapedResult objects) into a more detailed format
|
||||
@@ -130,7 +148,8 @@ class ETLController:
|
||||
logger.info(f"Found matching DB entry for {obj}: {instance}")
|
||||
return instance
|
||||
|
||||
if hydrate:
|
||||
# Don't hydrate videos, because they can be quite large and this is time consuming
|
||||
if hydrate and type(obj) != Video:
|
||||
obj.hydrate()
|
||||
|
||||
session.add(obj)
|
||||
@@ -294,13 +313,13 @@ class ETLController:
|
||||
logger.trace(f"{transformer} is handling result {result.id} ({result.date})")
|
||||
handled = True
|
||||
|
||||
transformer.transform_media(result, total_result.Post, lambda obj: self.insert_or_select(obj, session, hydrate), session)
|
||||
transformer.transform_media(result, total_result.Post, lambda obj: self.insert_or_select(obj, session, hydrate))
|
||||
|
||||
session.commit()
|
||||
break
|
||||
|
||||
if handled == False:
|
||||
logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})")
|
||||
if handled == False:
|
||||
logger.warning(f"No Transformer could handle ID {result.id} with platform {result.platform} ({result.date})")
|
||||
|
||||
@logger.catch(reraise=True)
|
||||
def transform_all_untransformed_media(self, hydrate=True):
|
||||
|
||||
@@ -21,13 +21,13 @@ class BitchuteTransformer(Transformer):
|
||||
|
||||
return False
|
||||
|
||||
def transform_media(self, data: ScraperResult, insert: Callable, transformed: Post) -> Generator[Media, None, None]:
|
||||
def transform_media(self, data: ScraperResult, transformed: Post, insert: Callable) -> Generator[Media, None, None]:
|
||||
raw = json.loads(data.raw_data)
|
||||
|
||||
orig = raw['video_url']
|
||||
new = data.archived_urls[orig]
|
||||
|
||||
m = Video(url=new, post=transformed.id, raw_id=data.id, original_url=orig)
|
||||
m = Video(url=new, post=transformed.id, raw_id=data.id, original_url=orig, date=data.date, date_archived=data.date_archived, date_transformed=datetime.now(timezone.utc), transformer=self.__version__, scraper=data.scraper, platform=data.platform)
|
||||
|
||||
insert(m)
|
||||
|
||||
|
||||
@@ -49,7 +49,7 @@ class GettrTransformer(Transformer):
|
||||
|
||||
def _get_channel_id(self, username: str, category: str, insert: Callable, session):
|
||||
|
||||
channel = session.query(Channel).filter(func.lower(Channel.screenname)==func.lower(username), platform = 'Gettr').first()
|
||||
channel = session.query(Channel).where((func.lower(Channel.screenname)==func.lower(username)) & (Channel.platform == 'Gettr')).first()
|
||||
|
||||
if channel is None:
|
||||
try:
|
||||
|
||||
@@ -90,7 +90,7 @@ def _process_number(s):
|
||||
if s is None:
|
||||
return None
|
||||
else:
|
||||
s = s.replace(' ', '')
|
||||
s = s.replace(' ', '').replace(',','')
|
||||
if s.endswith('M'):
|
||||
return int(float(s[:-1]) * 1e6)
|
||||
elif s.endswith('K'):
|
||||
|
||||
@@ -121,23 +121,6 @@ class TelegramTelethonTransformer(Transformer):
|
||||
|
||||
transformed = insert(transformed)
|
||||
|
||||
def transform_media(self, data: ScraperResult, transformed: Post, insert: Callable, session):
|
||||
for k in data.archived_urls:
|
||||
if data.archived_urls[k]:
|
||||
archived_url = data.archived_urls[k]
|
||||
filename = archived_url.split('/')[-1]
|
||||
ext = None if '.' not in filename else filename.split('.')[-1].lower()
|
||||
|
||||
if ext == 'mp4' or ext == 'mov' or ext == 'avi' or ext =='mkv':
|
||||
insert(Video(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
|
||||
elif ext == 'oga' or ext == 'mp3' or ext == "wav" or ext == 'aif' or ext == 'aiff' or ext == 'aac':
|
||||
insert(Audio(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
|
||||
elif ext == 'jpg' or ext == 'jpeg' or ext == 'png' or ext == 'gif' or ext == 'bmp' or ext == 'heic' or ext == 'tiff':
|
||||
insert(Image(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
|
||||
else:
|
||||
logger.warning(f"Unknown file extension {ext}")
|
||||
insert(Media(url=archived_url, post=transformed.id, raw_id=data.id, original_url=k))
|
||||
|
||||
def transform(self, data: ScraperResult, insert: Callable, session) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||
raw = json.loads(data.raw_data)
|
||||
|
||||
|
||||
0
spacy_setup.sh
Normal file → Executable file
0
spacy_setup.sh
Normal file → Executable file
Reference in New Issue
Block a user