Merge pull request #19 from bellingcat/separate-media-archiving

Separate media archiving
This commit is contained in:
Tristan Lee
2022-03-28 20:20:57 -05:00
committed by GitHub
7 changed files with 204 additions and 120 deletions

View File

@@ -21,6 +21,7 @@ pytesseract = "*"
pyexiftool = {git = "https://github.com/smarnach/pyexiftool.git"}
instaloader = "*"
gspread = "*"
cryptg = "*"
[dev-packages]
pytest = "*"

View File

@@ -41,19 +41,10 @@ class ScraperResult:
#: Dict in which the keys are the original media URLs from the post, and the corresponding values are the URLs of the archived media files.
archived_urls: dict
raw_data_table = Table('raw_data', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('scraper', String),
Column('platform', String),
Column('channel', Integer, ForeignKey('channels.id')),
Column('platform_id', String),
Column('date', DateTime),
Column('raw_data', String),
Column('date_archived', DateTime),
Column('archived_urls', JSON))
#: Has the media in this post been archived?
media_archived: bool
@dataclass
class Channel:
"""Information about a specific channel to be scraped.
@@ -98,24 +89,6 @@ class Channel:
def hydrate(self):
pass
channel_table = Table('channels', mapper_registry.metadata,
Column('id', Integer, primary_key=True, autoincrement=True),
Column('name', String),
Column('platform_id', Integer),
Column('category', String),
Column('platform', String),
Column('url', String),
Column('screenname', String),
Column('country', String),
Column('influencer', String),
Column('public', Boolean),
Column('chat', Boolean),
Column('notes', String),
Column('source', String)
)
mapper_registry.map_imperatively(Channel, channel_table)
@dataclass
class Post:
"""An object with fields for columns in the analysis table"""
@@ -165,26 +138,6 @@ class Post:
def hydrate(self):
pass
post_table = Table('posts', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('raw_id', Integer, ForeignKey('raw_data.id')),
Column('platform_id', Integer),
Column('scraper', String),
Column('transformer', String),
Column('platform', String),
Column('channel', Integer, ForeignKey('channels.id')),
Column('date', DateTime),
Column('date_archived', DateTime),
Column('url', String),
Column('author_id', String),
Column('author_username', String),
Column('content', String),
Column('forwarded_from', Integer, ForeignKey('channels.id')),
Column('reply_to', Integer, ForeignKey('posts.id'))
)
mapper_registry.map_imperatively(Post, post_table)
@dataclass
class Media:
@@ -273,28 +226,48 @@ raw_data_table = Table('raw_data', mapper_registry.metadata,
autoincrement=True),
Column('scraper', String),
Column('platform', String),
Column('channel', Integer),
Column('channel', Integer, ForeignKey('channels.id')),
Column('platform_id', String),
Column('date', DateTime),
Column('raw_data', String),
Column('date_archived', DateTime),
Column('archived_urls', JSON))
Column('archived_urls', JSON),
Column('media_archived', Boolean))
channel_table = Table('channels', mapper_registry.metadata,
Column('id', Integer, primary_key=True, autoincrement=True),
Column('name', String),
Column('platform_id', Integer),
Column('category', String),
Column('platform', String),
Column('url', String),
Column('screenname', String),
Column('country', String),
Column('influencer', String),
Column('public', Boolean),
Column('chat', Boolean),
Column('notes', String),
Column('source', String)
)
analysis_table = Table('analysis', mapper_registry.metadata,
post_table = Table('posts', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
autoincrement=True),
Column('raw_id', Integer, ForeignKey('raw_data.id')),
Column('platform_id', Integer),
Column('scraper', String),
Column('transformer', String),
Column('platform', String),
Column('channel', Integer),
Column('channel', Integer, ForeignKey('channels.id')),
Column('date', DateTime),
Column('date_archived', DateTime),
Column('url', String),
Column('author_id', String),
Column('author_username', String),
Column('content', String))
Column('content', String),
Column('forwarded_from', Integer, ForeignKey('channels.id')),
Column('reply_to', Integer, ForeignKey('posts.id'))
)
media_table = Table('media', mapper_registry.metadata,
Column('id', Integer, primary_key=True,
@@ -307,7 +280,8 @@ media_table = Table('media', mapper_registry.metadata,
Column('exif', String),
Column('ocr', String))
mapper_registry.map_imperatively(TransformedResult, analysis_table)
mapper_registry.map_imperatively(Post, post_table)
mapper_registry.map_imperatively(Channel, channel_table)
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
mapper_registry.map_imperatively(Media, media_table, polymorphic_on='type', polymorphic_identity='media')
mapper_registry.map_imperatively(Image, media_table, inherits=Media, polymorphic_on='type', polymorphic_identity='image')

View File

@@ -234,6 +234,17 @@ class Scraper:
return archived_url
def archive_files(self, result: ScraperResult) -> ScraperResult:
for url in result.archived_urls:
if result.archived_urls[url] is None:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
result.archived_urls[url] = archived_url
result.media_archived = True
return result
def can_handle(self, channel: Channel) -> bool:
"""Whether or not the scraper can scrape the specified channel.
@@ -353,6 +364,38 @@ class ScraperController:
if not handled:
logger.warning(f"No handler found for Channel {channel}")
@logger.catch(reraise = True)
def archive_unarchived_media(self):
if self.session is None:
logger.error("No DB session")
return
session = self.session()
posts = session.query(ScraperResult).where(ScraperResult.media_archived == False).all()
logger.info(f"Found {len(posts)} posts without media. Archiving now")
for post in posts:
handled = False
for scraper in self.scrapers:
if scraper.__version__ == post.scraper:
handled = True
logger.info(f"{scraper} is archiving media for {post}")
post = scraper.archive_files(post)
if post:
session.query(ScraperResult).where(ScraperResult.id == post.id).update({'archived_urls': post.archived_urls, 'media_archived': True})
session.commit()
break
if not handled:
logger.warning(f"No handler found for post scraped with {post.scraper}")
session.commit()
def connect_to_db(self, engine):
"""Connect the specified SQLAlchemy engine to the controller.
"""

View File

@@ -30,19 +30,17 @@ class TelegramSnscrapeScraper(Scraper):
archived_urls = {}
for image_url in post.images:
archived_urls[image_url] = None
if post.video:
archived_urls[post.video] = None
if archive_media:
for image_url in post.images:
logger.debug(f'Archiving image: {image_url}')
media_blob, content_type, key = self.url_to_blob(image_url)
for url in archived_urls:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[image_url] = archived_url
if post.video:
logger.debug(f'Archiving video: {post.video}')
media_blob, content_type, key = self.url_to_blob(post.video)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[post.video] = archived_url
archived_urls[url] = archived_url
yield ScraperResult(
scraper=self.__version__,
@@ -52,5 +50,6 @@ class TelegramSnscrapeScraper(Scraper):
date=post.date,
date_archived=datetime.now(timezone.utc),
raw_data=post.json(),
archived_urls=archived_urls
archived_urls=archived_urls,
media_archived=archive_media
)

View File

@@ -4,9 +4,11 @@ import os
import json
import tempfile
from pathlib import Path
import time
from loguru import logger
from telethon.sync import TelegramClient
from telethon.tl import types
from cisticola.base import Channel, ScraperResult
from cisticola.scraper.base import Scraper
@@ -23,12 +25,80 @@ class TelegramTelethonScraper(Scraper):
username = username.split('s/')[1]
return username
def archive_files(self, result: ScraperResult, client : TelegramClient = None) -> ScraperResult:
if len(result.archived_urls.keys()) == 0:
return result
if client is None:
api_id = os.environ['TELEGRAM_API_ID']
api_hash = os.environ['TELEGRAM_API_HASH']
phone = os.environ['TELEGRAM_PHONE']
with TelegramClient(phone, api_id, api_hash) as client:
return self.archive_files(result, client)
if len(list(result.archived_urls.keys())) != 1:
logger.warning(f"Expected 1 key in archived_urls, found {result.archived_keys}")
else:
key = list(result.archived_urls.keys())[0]
if result.archived_urls[key] is None:
raw = json.loads(result.raw_data)
message = client.get_messages(raw['peer_id']['channel_id'], ids=[raw['id']])
blob = None
if len(message) > 0:
blob, output_file_with_ext = self.archive_post_media(message[0], client)
else:
logger.warning("No message retrieved")
if blob is not None:
# TODO specify Content-Type
archived_url = self.archive_blob(blob = blob, content_type = '', key = output_file_with_ext)
result.archived_urls[key] = archived_url
return result
else:
logger.warning("Downloaded blob was None")
result.media_archived = True
return result
def archive_post_media(self, post : types.Message, client : TelegramClient = None):
logger.debug(f"Archiving post {post}")
if post.media is None:
return None, None
logger.debug(f"Archiving media {post.media}")
if client is None:
api_id = os.environ['TELEGRAM_API_ID']
api_hash = os.environ['TELEGRAM_API_HASH']
phone = os.environ['TELEGRAM_PHONE']
with TelegramClient(phone, api_id, api_hash) as client:
return self.archive_post_media(post, client=client)
key = f'{post.peer_id.channel_id}_{post.id}'
with tempfile.TemporaryDirectory() as temp_dir:
output_file = Path(temp_dir, key)
client.download_media(post.media, output_file)
output_file_with_ext = os.listdir(temp_dir)[0]
filename = Path(temp_dir, output_file_with_ext)
with open(filename, 'rb') as f:
blob = f.read()
return (blob, output_file_with_ext)
def can_handle(self, channel):
if channel.platform == "Telegram" and channel.public and not channel.chat:
return True
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
username = self.get_username_from_url(channel.url)
api_id = os.environ['TELEGRAM_API_ID']
@@ -36,34 +106,27 @@ class TelegramTelethonScraper(Scraper):
phone = os.environ['TELEGRAM_PHONE']
with TelegramClient(phone, api_id, api_hash) as client:
for post in client.iter_messages(username):
post_url = f'{channel.url}/{post.id}'
logger.info(f"Archiving post {post_url} from {post.date}")
if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
logger.info(f'Timestamp of post {post} is earlier than the previous archived timestamp {post.date.replace(tzinfo=timezone.utc)}')
break
post_url = f'{channel.url}/{post.id}'
key = f'{username}_{post.id}'
archived_urls = {}
logger.info(f"Archiving post {post_url}")
if archive_media:
if post.media is not None:
archived_urls[post_url] = None
if post.media is not None:
with tempfile.TemporaryDirectory() as temp_dir:
output_file = Path(temp_dir, key)
client.download_media(post.media, output_file)
output_file_with_ext = os.listdir(temp_dir)[0]
filename = Path(temp_dir, output_file_with_ext)
with open(filename, 'rb') as f:
blob = f.read()
# TODO specify Content-Type
archived_url = self.archive_blob(blob = blob, content_type = '', key = output_file_with_ext)
archived_urls[post_url] = archived_url
if archive_media:
blob, output_file_with_ext = self.archive_post_media(post, client)
if blob is not None:
# TODO specify Content-Type
archived_url = self.archive_blob(blob = blob, content_type = '', key = output_file_with_ext)
archived_urls[post_url] = archived_url
yield ScraperResult(
scraper=self.__version__,
@@ -73,4 +136,5 @@ class TelegramTelethonScraper(Scraper):
date=post.date.replace(tzinfo=timezone.utc),
date_archived=datetime.now(timezone.utc),
raw_data=json.dumps(post.to_dict(), default=str),
archived_urls=archived_urls)
archived_urls=archived_urls,
media_archived=archive_media)

View File

@@ -28,31 +28,33 @@ class TwitterScraper(Scraper):
archived_urls = {}
if archive_media:
media_list = []
if tweet.media:
media_list += tweet.media
media_list = []
if tweet.media:
media_list += tweet.media
if tweet.retweetedTweet and tweet.retweetedTweet.media:
media_list += tweet.retweetedTweet.media
if tweet.retweetedTweet and tweet.retweetedTweet.media:
media_list += tweet.retweetedTweet.media
if tweet.quotedTweet and tweet.quotedTweet.media:
media_list += tweet.quotedTweet.media
if tweet.quotedTweet and tweet.quotedTweet.media:
media_list += tweet.quotedTweet.media
for media in media_list:
if type(media) == Video:
variant = max(
[v for v in media.variants if v.bitrate], key=lambda v: v.bitrate)
url = variant.url
elif type(media) == Gif:
url = media.variants[0].url
elif type(media) == Photo:
url = media.fullUrl
else:
logger.warning(f"Could not get media URL of {media}")
url = None
for media in media_list:
if type(media) == Video:
variant = max(
[v for v in media.variants if v.bitrate], key=lambda v: v.bitrate)
url = variant.url
elif type(media) == Gif:
url = media.variants[0].url
elif type(media) == Photo:
url = media.fullUrl
else:
logger.warning(f"Could not get media URL of {media}")
url = None
if url is not None and url not in archived_urls:
if url is not None and url not in archived_urls:
archived_urls[url] = None
if archive_media:
media_blob, content_type, key = self.url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[url] = archived_url

21
test.py
View File

@@ -28,7 +28,6 @@ scrapers = [
GettrScraper(),
OdyseeScraper(),
RumbleScraper(),
TelegramSnscrapeScraper(),
TelegramTelethonScraper(),
TwitterScraper()]
@@ -43,15 +42,15 @@ session = session_generator()
gc = gspread.service_account(filename='service_account.json')
# Open a sheet from a spreadsheet in one go
wks = gc.open_by_url("https://docs.google.com/spreadsheets/d/1yxd6-2Mp0jZ8r9XJklb39WE-iIMrKRyA2kymJcIfGis/edit#gid=0")
wks = gc.open_by_url("https://docs.google.com/spreadsheets/d/1k5VgqREoA3v1r7bkVq7TOTRDtdYqTMWkQnsZpRbntpw/edit#gid=0")
channels = wks.worksheet("channels").get_all_records()
for c in channels:
del c['followers']
for k in c.keys():
if c[k] == 'TRUE': c[k] = True
if c[k] == 'FALSE': c[k] = False
if c[k] == 'TRUE' or c[k] == 'yes': c[k] = True
if c[k] == 'FALSE' or c[k] == 'no': c[k] = False
# check to see if this already exists,
channel = session.query(Channel).filter_by(platform_id=c['platform_id'], platform=c['platform']).first()
@@ -63,11 +62,13 @@ for c in channels:
session.commit()
controller.connect_to_db(engine)
controller.scrape_all_channels(archive_media = True)
controller.scrape_all_channels(archive_media = False)
transformer = TwitterTransformer()
controller.archive_unarchived_media()
etl_controller = ETLController()
etl_controller.register_transformer(transformer)
etl_controller.connect_to_db(engine)
etl_controller.transform_all_untransformed()
# transformer = TwitterTransformer()
# etl_controller = ETLController()
# etl_controller.register_transformer(transformer)
# etl_controller.connect_to_db(engine)
# etl_controller.transform_all_untransformed()