diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 8951115..902f626 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -26,15 +26,14 @@ class ArchiveResult: screenshot: str = None hash: str = None - class Archiver(ABC): + HASH_ALGORITHM="SHA-256" # can be overwritten by user configs name = "default" retry_regex = r"retrying at (\d+)$" - def __init__(self, storage: Storage, driver, hash_algorithm): + def __init__(self, storage: Storage, driver): self.storage = storage self.driver = driver - self.hash_algorithm = hash_algorithm def __str__(self): return self.__class__.__name__ @@ -48,7 +47,6 @@ class Archiver(ABC): def get_netloc(self, url): return urlparse(url).netloc - # generate the html page eg SM3013/twitter__minmyatnaing13_status_1499415562937503751.html def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None): """ Generates an index.html page where each @urls_info is displayed @@ -164,12 +162,11 @@ class Archiver(ABC): def get_hash(self, filename): with open(filename, "rb") as f: bytes = f.read() # read entire file as bytes - ha = self.hash_algorithm - logger.debug(f'Hash algorithm is {ha}') + logger.debug(f'Hash algorithm is {self.HASH_ALGORITHM}') - if ha == "SHA3_512": hash = hashlib.sha3_512(bytes) - elif ha == "SHA256": hash = hashlib.sha256(bytes) - else: raise Exception("Unknown Hash Algorithm of {ha}") + if self.HASH_ALGORITHM == "SHA-256": hash = hashlib.sha256(bytes) + elif self.HASH_ALGORITHM == "SHA3-512": hash = hashlib.sha3_512(bytes) + else: raise Exception(f"Unknown Hash Algorithm of {self.HASH_ALGORITHM}") return hash.hexdigest() diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py index c38dd30..0b6e777 100644 --- a/archivers/telegram_archiver.py +++ b/archivers/telegram_archiver.py @@ -11,9 +11,6 @@ from storages import Storage class TelegramArchiver(Archiver): name = "telegram" - def __init__(self, storage: Storage, driver, hash_algorithm): - super().__init__(storage, driver, hash_algorithm) - def download(self, url, check_if_exists=False): # detect URLs that we definitely cannot handle if 't.me' != self.get_netloc(url): diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index bce34d2..f35e323 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -15,8 +15,8 @@ class TelethonArchiver(Archiver): name = "telethon" link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)") - def __init__(self, storage: Storage, driver, config: TelethonConfig, hash_algorithm): - super().__init__(storage, driver, hash_algorithm) + def __init__(self, storage: Storage, driver, config: TelethonConfig): + super().__init__(storage, driver) if config: self.client = TelegramClient("./anon", config.api_id, config.api_hash) self.bot_token = config.bot_token diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py index 30b8c7c..8100bb1 100644 --- a/archivers/tiktok_archiver.py +++ b/archivers/tiktok_archiver.py @@ -15,9 +15,6 @@ class TiktokArchiver(Archiver): status = 'success' - def __init__(self, storage: Storage, driver, hash_algorithm): - super().__init__(storage, driver, hash_algorithm) - try: info = tiktok_downloader.info_post(url) key = self.get_key(f'{info.id}.mp4') diff --git a/archivers/twitter_api_archiver.py b/archivers/twitter_api_archiver.py index 99fb8f1..6aa1742 100644 --- a/archivers/twitter_api_archiver.py +++ b/archivers/twitter_api_archiver.py @@ -13,8 +13,8 @@ from .twitter_archiver import TwitterArchiver class TwitterApiArchiver(TwitterArchiver): name = "twitter_api" - def __init__(self, storage: Storage, driver, config: TwitterApiConfig, hash_algorithm): - super().__init__(storage, driver, hash_algorithm) + def __init__(self, storage: Storage, driver, config: TwitterApiConfig): + super().__init__(storage, driver) if config.bearer_token: self.api = Api(bearer_token=config.bearer_token) diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py index 750d2c4..8f646fd 100644 --- a/archivers/twitter_archiver.py +++ b/archivers/twitter_archiver.py @@ -5,15 +5,11 @@ from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo from .base_archiver import Archiver, ArchiveResult -from storages import Storage - class TwitterArchiver(Archiver): """ This Twitter Archiver uses unofficial scraping methods, and it works as an alternative to TwitterApiArchiver when no API credentials are provided. """ - def __init__(self, storage: Storage, driver, hash_algorithm): - super().__init__(storage, driver, hash_algorithm) name = "twitter" link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py index 6ddba10..c448367 100644 --- a/archivers/vk_archiver.py +++ b/archivers/vk_archiver.py @@ -17,8 +17,8 @@ class VkArchiver(Archiver): wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)") photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)") - def __init__(self, storage: Storage, driver, config: VkConfig, hash_algorithm): - super().__init__(storage, driver, hash_algorithm) + def __init__(self, storage: Storage, driver, config: VkConfig): + super().__init__(storage, driver) if config != None: self.vks = VkScraper(config.username, config.password) diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index c19ca4f..f46d1cb 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -15,8 +15,8 @@ class WaybackArchiver(Archiver): """ name = "wayback" - def __init__(self, storage: Storage, driver, config: WaybackConfig, hash_algorithm): - super(WaybackArchiver, self).__init__(storage, driver, hash_algorithm) + def __init__(self, storage: Storage, driver, config: WaybackConfig): + super(WaybackArchiver, self).__init__(storage, driver) self.config = config self.seen_urls = {} diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py index a41b6c6..7990131 100644 --- a/archivers/youtubedl_archiver.py +++ b/archivers/youtubedl_archiver.py @@ -12,8 +12,8 @@ class YoutubeDLArchiver(Archiver): name = "youtube_dl" ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False} - def __init__(self, storage: Storage, driver, fb_cookie, hash_algorithm): - super().__init__(storage, driver, hash_algorithm) + def __init__(self, storage: Storage, driver, fb_cookie): + super().__init__(storage, driver) self.fb_cookie = fb_cookie def download(self, url, check_if_exists=False): diff --git a/auto_archive.py b/auto_archive.py index 72cb748..f12b9c4 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -104,14 +104,14 @@ def process_sheet(c: Config): # order matters, first to succeed excludes remaining active_archivers = [ - TelethonArchiver(storage, c.webdriver, c.telegram_config, c.hash_algorithm), - TiktokArchiver(storage, c.webdriver, c.hash_algorithm), - TwitterApiArchiver(storage, c.webdriver, c.twitter_config, c.hash_algorithm), - YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie,c.hash_algorithm), - TelegramArchiver(storage, c.webdriver, c.hash_algorithm), - TwitterArchiver(storage, c.webdriver, c.hash_algorithm), - VkArchiver(storage, c.webdriver, c.vk_config, c.hash_algorithm), - WaybackArchiver(storage, c.webdriver, c.wayback_config, c.hash_algorithm) + TelethonArchiver(storage, c.webdriver, c.telegram_config), + TiktokArchiver(storage, c.webdriver), + TwitterApiArchiver(storage, c.webdriver, c.twitter_config), + YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie), + TelegramArchiver(storage, c.webdriver), + TwitterArchiver(storage, c.webdriver), + VkArchiver(storage, c.webdriver, c.vk_config), + WaybackArchiver(storage, c.webdriver, c.wayback_config) ] for archiver in active_archivers: diff --git a/configs/config.py b/configs/config.py index 1169048..0d11467 100644 --- a/configs/config.py +++ b/configs/config.py @@ -1,5 +1,6 @@ import argparse, yaml, json +from archivers.base_archiver import Archiver import gspread from loguru import logger from selenium import webdriver @@ -81,7 +82,7 @@ class Config: ) self.webdriver = "not initialized" - self.hash_algorithm = execution.get("hash_algorithm") + Archiver.HASH_ALGORITHM = execution.get("hash_algorithm", Archiver.HASH_ALGORITHM) # ---------------------- SECRETS - APIs and service configurations secrets = self.config.get("secrets", {}) @@ -262,6 +263,7 @@ class Config: "storage": self.storage, "header": self.header, "check_if_exists": self.check_if_exists, + "hash_algorithm": Archiver.HASH_ALGORITHM, "save_logs": self.save_logs, "selenium_config": asdict(self.selenium_config), "selenium_webdriver": self.webdriver != None, diff --git a/example.config.yaml b/example.config.yaml index dc78803..acbe52c 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -91,6 +91,10 @@ execution: storage: s3 # defaults to false, when true will try to avoid duplicate URL archives check_if_exists: true + + # choose a hash algorithm (either SHA-256 or SHA3-512, defaults to SHA-256) + # hash_algorithm: SHA-256 + # optional configurations for the selenium browser that takes screenshots, these are the defaults selenium: # values under 10s might mean screenshots fail to grab screenshot @@ -116,7 +120,3 @@ execution: screenshot: screenshot hash: hash - # Must be either SHA256 or SHA3_512 - hash_algorithm: SHA3_512 - # hash_algorithm: SHA256 -