mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-13 05:48:33 +03:00
Fix bug with Telethon scraper and certain media; add media_archived flag to TwitterScraper
This commit is contained in:
@@ -66,13 +66,10 @@ class TelegramTelethonScraper(Scraper):
|
||||
return result
|
||||
|
||||
def archive_post_media(self, post : types.Message, client : TelegramClient = None):
|
||||
logger.debug(f"Archiving post {post}")
|
||||
|
||||
if post.media is None:
|
||||
logger.debug("No media for post")
|
||||
return None, None
|
||||
|
||||
logger.debug(f"Archiving media {post.media}")
|
||||
|
||||
if client is None:
|
||||
api_id = os.environ['TELEGRAM_API_ID']
|
||||
api_hash = os.environ['TELEGRAM_API_HASH']
|
||||
@@ -81,6 +78,11 @@ class TelegramTelethonScraper(Scraper):
|
||||
with TelegramClient(phone, api_id, api_hash) as client:
|
||||
return self.archive_post_media(post, client=client)
|
||||
|
||||
if type(post.media) == types.MessageMediaDocument:
|
||||
logger.debug(f"Archiving {type(post.media)} with size {post.media.document.size/(1024*1024)} MB")
|
||||
else:
|
||||
logger.debug(f"Archiving {type(post.media)}")
|
||||
|
||||
key = f'{post.peer_id.channel_id}_{post.id}'
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
@@ -88,6 +90,10 @@ class TelegramTelethonScraper(Scraper):
|
||||
|
||||
client.download_media(post.media, output_file)
|
||||
|
||||
if len(os.listdir(temp_dir)) == 0:
|
||||
logger.warning(f"No file present. Could not archive {post.media}")
|
||||
return None, None
|
||||
|
||||
output_file_with_ext = os.listdir(temp_dir)[0]
|
||||
filename = Path(temp_dir, output_file_with_ext)
|
||||
|
||||
@@ -96,7 +102,7 @@ class TelegramTelethonScraper(Scraper):
|
||||
return (blob, output_file_with_ext)
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Telegram" and channel.public and not channel.chat:
|
||||
if channel.platform == "Telegram" and channel.public:
|
||||
return True
|
||||
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
@@ -110,14 +116,13 @@ class TelegramTelethonScraper(Scraper):
|
||||
for post in client.iter_messages(username):
|
||||
post_url = f'{channel.url}/{post.id}'
|
||||
|
||||
logger.info(f"Archiving post {post_url} from {post.date}")
|
||||
logger.trace(f"Archiving post {post_url} from {post.date}")
|
||||
|
||||
if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
|
||||
logger.info(f'Timestamp of post {post} is earlier than the previous archived timestamp {post.date.replace(tzinfo=timezone.utc)}')
|
||||
break
|
||||
|
||||
archived_urls = {}
|
||||
logger.info(f"Archiving post {post_url}")
|
||||
|
||||
if post.media is not None:
|
||||
archived_urls[post_url] = None
|
||||
|
||||
Reference in New Issue
Block a user