From 398f296789ad4c38c39ab9473e925c72a40fe718 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Fri, 18 Mar 2022 11:10:27 +0100 Subject: [PATCH 1/5] Fix Selenium driver issues with telegram links --- archivers/base_archiver.py | 10 ++++++++-- archivers/telethon_archiver.py | 2 +- auto_archive.py | 6 +++++- storages/s3_storage.py | 1 + 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 687e762..b32bee0 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -8,6 +8,8 @@ from urllib.parse import urlparse import hashlib import time import requests +from loguru import logger +from selenium.common.exceptions import TimeoutException from storages import Storage from utils import mkdir_if_not_exists @@ -54,6 +56,7 @@ class Archiver(ABC): for url_info in urls_info: page += f'''
  • {url_info['key']}: {url_info['hash']}
  • ''' + # TODO/ISSUE: character encoding is incorrect for Cyrillic, produces garbled text page += f"

    {self.name} object data:

    {object}" page += f"" @@ -125,8 +128,11 @@ class Archiver(ABC): "/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png") filename = 'tmp/' + key - self.driver.get(url) - time.sleep(6) + try: + self.driver.get(url) + time.sleep(6) + except TimeoutException: + logger.info("TimeoutException loading page for screenshot") self.driver.save_screenshot(filename) self.storage.upload(filename, key, extra_args={ diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index c332854..f7ee53e 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -76,7 +76,7 @@ class TelethonArchiver(Archiver): uploaded_media = [] message = post.message for mp in media_posts: - if len(mp.message) > message: message = mp.message + if len(mp.message) > len(message): message = mp.message filename = self.client.download_media(mp.media, f'tmp/{chat}_{group_id}/{mp.id}') key = filename.split('tmp/')[1] self.storage.upload(filename, key) diff --git a/auto_archive.py b/auto_archive.py index 1f00b2f..fea9bfb 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -78,8 +78,12 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES): options = webdriver.FirefoxOptions() options.headless = True - driver = webdriver.Firefox(options=options) + profile = webdriver.FirefoxProfile() + profile.set_preference('network.protocol-handler.external.tg', False) + + driver = webdriver.Firefox(profile, options=options) driver.set_window_size(1400, 2000) + driver.set_page_load_timeout(10) # loop through worksheets to check for ii, wks in enumerate(sh.worksheets()): diff --git a/storages/s3_storage.py b/storages/s3_storage.py index 53bb151..d7c9644 100644 --- a/storages/s3_storage.py +++ b/storages/s3_storage.py @@ -20,6 +20,7 @@ class S3Storage(Storage): self.bucket = config.bucket self.region = config.region self.folder = config.folder + self.private = config.private if len(self.folder) and self.folder[-1] != '/': self.folder += '/' 
From 576f1a8f687199cf38864f7271b9a63e65de8692 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 30 Mar 2022 10:55:33 +0200 Subject: [PATCH 2/5] fix the UTF-8 issue for Cyrillic --- archivers/base_archiver.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index b32bee0..e1e9194 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -48,7 +48,7 @@ class Archiver(ABC): return self.get_key(urlparse(url).path.replace("/", "_") + ".html") def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None): - page = f'''{url} + page = f'''{url}

    Archived media from {self.name}

    {url}

    {self.name} object data:

    {object}" page += f"" From 3bdeec1d2f3f17b8cd36ecf28a92a81a65a00e6f Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 30 Mar 2022 11:05:31 +0200 Subject: [PATCH 4/5] fix deprecation warning for selenium --- archivers/telethon_archiver.py | 1 - auto_archive.py | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index f7ee53e..13b2a87 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -2,7 +2,6 @@ import os import re import html from dataclasses import dataclass -from urllib.parse import urlparse from loguru import logger from storages import Storage diff --git a/auto_archive.py b/auto_archive.py index fea9bfb..d3db9a2 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -78,10 +78,9 @@ def process_sheet(sheet, header=1, columns=GWorksheet.COLUMN_NAMES): options = webdriver.FirefoxOptions() options.headless = True - profile = webdriver.FirefoxProfile() - profile.set_preference('network.protocol-handler.external.tg', False) + options.set_preference('network.protocol-handler.external.tg', False) - driver = webdriver.Firefox(profile, options=options) + driver = webdriver.Firefox(options=options) driver.set_window_size(1400, 2000) driver.set_page_load_timeout(10) From 8358ab0bfc4db0e318caf421b1d232b925e64708 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 30 Mar 2022 11:12:06 +0200 Subject: [PATCH 5/5] assert post is not None --- archivers/telethon_archiver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index 13b2a87..88bec58 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -37,7 +37,7 @@ class TelethonArchiver(Archiver): posts = self.client.get_messages(chat, ids=search_ids) media = [] for post in posts: - if post.grouped_id == 
original_post.grouped_id and post.media is not None: + if post is not None and post.grouped_id == original_post.grouped_id and post.media is not None: media.append(post) return media