Mirror of https://github.com/bellingcat/auto-archiver.git (synced 2026-06-11 20:58:29 +03:00)
Merge pull request #21 from bellingcat/resolve-telethon-issues
Resolve telethon issues
This commit is contained in:
@@ -8,6 +8,8 @@ from urllib.parse import urlparse
|
||||
import hashlib
|
||||
import time
|
||||
import requests
|
||||
from loguru import logger
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
|
||||
from storages import Storage
|
||||
from utils import mkdir_if_not_exists
|
||||
@@ -46,7 +48,7 @@ class Archiver(ABC):
|
||||
return self.get_key(urlparse(url).path.replace("/", "_") + ".html")
|
||||
|
||||
def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):
|
||||
- page = f'''<html><head><title>{url}</title></head>
|
||||
+ page = f'''<html><head><title>{url}</title><meta charset="UTF-8"></head>
|
||||
<body>
|
||||
<h2>Archived media from {self.name}</h2>
|
||||
<h3><a href="{url}">{url}</a></h3><ul>'''
|
||||
@@ -98,6 +100,7 @@ class Archiver(ABC):
|
||||
uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
|
||||
|
||||
return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail)
|
||||
|
||||
def get_key(self, filename):
|
||||
"""
|
||||
returns a key in the format "[archiverName]_[filename]" includes extension
|
||||
@@ -125,8 +128,11 @@ class Archiver(ABC):
|
||||
"/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png")
|
||||
filename = 'tmp/' + key
|
||||
|
||||
- self.driver.get(url)
|
||||
- time.sleep(6)
|
||||
+ try:
|
||||
+     self.driver.get(url)
|
||||
+     time.sleep(6)
|
||||
+ except TimeoutException:
|
||||
+     logger.info("TimeoutException loading page for screenshot")
|
||||
|
||||
self.driver.save_screenshot(filename)
|
||||
self.storage.upload(filename, key, extra_args={
|
||||
@@ -172,7 +178,7 @@ class Archiver(ABC):
|
||||
|
||||
key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)]
|
||||
|
||||
- index_page = f'''<html><head><title>{filename}</title></head>
|
||||
+ index_page = f'''<html><head><title>{filename}</title><meta charset="UTF-8"></head>
|
||||
<body>'''
|
||||
|
||||
for t in cdn_urls:
|
||||
|
||||
@@ -2,7 +2,6 @@ import os
|
||||
import re
|
||||
import html
|
||||
from dataclasses import dataclass
|
||||
from urllib.parse import urlparse
|
||||
from loguru import logger
|
||||
|
||||
from storages import Storage
|
||||
@@ -38,7 +37,7 @@ class TelethonArchiver(Archiver):
|
||||
posts = self.client.get_messages(chat, ids=search_ids)
|
||||
media = []
|
||||
for post in posts:
|
||||
- if post.grouped_id == original_post.grouped_id and post.media is not None:
|
||||
+ if post is not None and post.grouped_id == original_post.grouped_id and post.media is not None:
|
||||
media.append(post)
|
||||
return media
|
||||
|
||||
@@ -76,7 +75,7 @@ class TelethonArchiver(Archiver):
|
||||
uploaded_media = []
|
||||
message = post.message
|
||||
for mp in media_posts:
|
||||
- if len(mp.message) > message: message = mp.message
|
||||
+ if len(mp.message) > len(message): message = mp.message
|
||||
filename = self.client.download_media(mp.media, f'tmp/{chat}_{group_id}/{mp.id}')
|
||||
key = filename.split('tmp/')[1]
|
||||
self.storage.upload(filename, key)
|
||||
|
||||
@@ -78,8 +78,11 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES):
|
||||
|
||||
options = webdriver.FirefoxOptions()
|
||||
options.headless = True
|
||||
options.set_preference('network.protocol-handler.external.tg', False)
|
||||
|
||||
driver = webdriver.Firefox(options=options)
|
||||
driver.set_window_size(1400, 2000)
|
||||
driver.set_page_load_timeout(10)
|
||||
|
||||
# loop through worksheets to check
|
||||
for ii, wks in enumerate(sh.worksheets()):
|
||||
|
||||
@@ -20,6 +20,7 @@ class S3Storage(Storage):
|
||||
self.bucket = config.bucket
|
||||
self.region = config.region
|
||||
self.folder = config.folder
|
||||
self.private = config.private
|
||||
|
||||
if len(self.folder) and self.folder[-1] != '/':
|
||||
self.folder += '/'
|
||||
|
||||
Reference in New Issue
Block a user