Merge pull request #21 from bellingcat/resolve-telethon-issues

Resolve telethon issues
This commit is contained in:
Logan Williams
2022-04-26 10:13:14 +02:00
committed by GitHub
4 changed files with 16 additions and 7 deletions

View File

@@ -8,6 +8,8 @@ from urllib.parse import urlparse
import hashlib
import time
import requests
from loguru import logger
from selenium.common.exceptions import TimeoutException
from storages import Storage
from utils import mkdir_if_not_exists
@@ -46,7 +48,7 @@ class Archiver(ABC):
return self.get_key(urlparse(url).path.replace("/", "_") + ".html")
def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
page = f'''<html><head><title>{url}</title></head>
page = f'''<html><head><title>{url}</title><meta charset="UTF-8"></head>
<body>
<h2>Archived media from {self.name}</h2>
<h3><a href="{url}">{url}</a></h3><ul>'''
@@ -98,6 +100,7 @@ class Archiver(ABC):
uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail)
def get_key(self, filename):
"""
returns a key in the format "[archiverName]_[filename]" includes extension
@@ -125,8 +128,11 @@ class Archiver(ABC):
"/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png")
filename = 'tmp/' + key
self.driver.get(url)
time.sleep(6)
try:
self.driver.get(url)
time.sleep(6)
except TimeoutException:
logger.info("TimeoutException loading page for screenshot")
self.driver.save_screenshot(filename)
self.storage.upload(filename, key, extra_args={
@@ -172,7 +178,7 @@ class Archiver(ABC):
key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)]
index_page = f'''<html><head><title>{filename}</title></head>
index_page = f'''<html><head><title>{filename}</title><meta charset="UTF-8"></head>
<body>'''
for t in cdn_urls:

View File

@@ -2,7 +2,6 @@ import os
import re
import html
from dataclasses import dataclass
from urllib.parse import urlparse
from loguru import logger
from storages import Storage
@@ -38,7 +37,7 @@ class TelethonArchiver(Archiver):
posts = self.client.get_messages(chat, ids=search_ids)
media = []
for post in posts:
if post.grouped_id == original_post.grouped_id and post.media is not None:
if post is not None and post.grouped_id == original_post.grouped_id and post.media is not None:
media.append(post)
return media
@@ -76,7 +75,7 @@ class TelethonArchiver(Archiver):
uploaded_media = []
message = post.message
for mp in media_posts:
if len(mp.message) > message: message = mp.message
if len(mp.message) > len(message): message = mp.message
filename = self.client.download_media(mp.media, f'tmp/{chat}_{group_id}/{mp.id}')
key = filename.split('tmp/')[1]
self.storage.upload(filename, key)

View File

@@ -78,8 +78,11 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
options = webdriver.FirefoxOptions()
options.headless = True
options.set_preference('network.protocol-handler.external.tg', False)
driver = webdriver.Firefox(options=options)
driver.set_window_size(1400, 2000)
driver.set_page_load_timeout(10)
# loop through worksheets to check
for ii, wks in enumerate(sh.worksheets()):

View File

@@ -20,6 +20,7 @@ class S3Storage(Storage):
self.bucket = config.bucket
self.region = config.region
self.folder = config.folder
self.private = config.private
if len(self.folder) and self.folder[-1] != '/':
self.folder += '/'