From 16bd54b8d350126c81eb47e2c0d7f1ebc100fa1c Mon Sep 17 00:00:00 2001 From: Dave Mateer Date: Tue, 12 Jul 2022 12:44:29 +0100 Subject: [PATCH 01/17] Put in fix for leading / in Google Drive --- storages/gd_storage.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/storages/gd_storage.py b/storages/gd_storage.py index e772a90..2a92f51 100644 --- a/storages/gd_storage.py +++ b/storages/gd_storage.py @@ -28,6 +28,12 @@ class GDStorage(Storage): only support files saved in a folder for GD S3 supports folder and all stored in the root """ + # doesn't work if key starts with / which can happen from telethon + if key.startswith('/'): + # remove first character ie / + logger.debug(f'CDN: Found and fixing leading / on uploading a file with {key=}') + key = key[1:] + full_name = os.path.join(self.folder, key) parent_id, folder_id = self.root_folder_id, None path_parts = full_name.split(os.path.sep) @@ -52,6 +58,13 @@ class GDStorage(Storage): 1. for each sub-folder in the path check if exists or create 2. upload file to root_id/other_paths.../filename """ + # doesn't work if key starts with / which can happen from telethon + if key.startswith('/'): + # remove first character ie / + logger.debug(f'UPLOADF: Found and fixing a leading / on uploading a file with {key=}') + key = key[1:] + + full_name = os.path.join(self.folder, key) parent_id, upload_to = self.root_folder_id, None path_parts = full_name.split(os.path.sep) From 42172566f20cd3ba96a23a2ad2a6343565071a38 Mon Sep 17 00:00:00 2001 From: Dave Mateer Date: Tue, 12 Jul 2022 12:53:59 +0100 Subject: [PATCH 02/17] Added whitelist and blacklist for workwheets (not spreadsheet) --- auto_archive.py | 13 +++++++++++++ configs/config.py | 4 ++++ example.config.yaml | 8 ++++++++ 3 files changed, 25 insertions(+) diff --git a/auto_archive.py b/auto_archive.py index 375c5be..840ccdc 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -58,6 +58,19 @@ def process_sheet(c: Config): # loop through worksheets to check for ii, wks in enumerate(sh.worksheets()): + + whitelist = c.worksheet_whitelist + if whitelist is not None: + if wks.title != whitelist: + logger.debug(f'Ignoring worksheet {wks.title} as not in whitelist which is specified as {whitelist}') + continue + + blacklist = c.worksheet_blacklist + if blacklist is not None: + if wks.title == blacklist: + logger.debug(f'Ignoring worksheet {wks.title} as in blacklist') + continue + logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}') gw = GWorksheet(wks, header_row=c.header, columns=c.column_names) diff --git a/configs/config.py b/configs/config.py index 4232651..98fabe9 100644 --- a/configs/config.py +++ b/configs/config.py @@ -50,6 +50,10 @@ class Config: self.sheet = getattr_or(self.args, "sheet", execution.get("sheet")) assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file" + + self.worksheet_whitelist = execution.get("worksheet_whitelist") + self.worksheet_blacklist = execution.get("worksheet_blacklist") + self.header = int(getattr_or(self.args, "header", execution.get("header", 1))) self.storage = getattr_or(self.args, "storage", execution.get("storage", "s3")) self.save_logs = getattr(self.args, "save_logs") or execution.get("save_logs", False) diff --git a/example.config.yaml b/example.config.yaml index c5b6a76..8778bba 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -65,6 +65,14 @@ secrets: execution: # can be overwritten with CMD --sheet= sheet: your-sheet-name + + # only check this worksheet rather than iterating through all worksheets in the spreadsheet. If whitelist is used then blacklist is ignored as whitelist is more restrictive. + # worksheet_whitelist: Sheet1 + + # worksheet to blacklist. Leave blank which is default for none. Useful if users want a MASTERSHEET exact copy of the working worksheet + # worksheet_blacklist: MASTERSHEET + + # which row of your tabs contains the header, can be overwritten with CMD --header= header: 1 # which storage to use, can be overwritten with CMD --storage= From 03e542a0fcc269c312c065aeabd4176641a4fc95 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 14 Jul 2022 17:45:28 +0200 Subject: [PATCH 03/17] isolate into function --- storages/gd_storage.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/storages/gd_storage.py b/storages/gd_storage.py index 2a92f51..d9a11de 100644 --- a/storages/gd_storage.py +++ b/storages/gd_storage.py @@ -28,11 +28,7 @@ class GDStorage(Storage): only support files saved in a folder for GD S3 supports folder and all stored in the root """ - # doesn't work if key starts with / which can happen from telethon - if key.startswith('/'): - # remove first character ie / - logger.debug(f'CDN: Found and fixing leading / on uploading a file with {key=}') - key = key[1:] + key = self.clean_key(key) full_name = os.path.join(self.folder, key) parent_id, folder_id = self.root_folder_id, None @@ -58,13 +54,8 @@ class GDStorage(Storage): 1. for each sub-folder in the path check if exists or create 2. upload file to root_id/other_paths.../filename """ - # doesn't work if key starts with / which can happen from telethon - if key.startswith('/'): - # remove first character ie / - logger.debug(f'UPLOADF: Found and fixing a leading / on uploading a file with {key=}') - key = key[1:] + key = self.clean_key(key) - full_name = os.path.join(self.folder, key) parent_id, upload_to = self.root_folder_id, None path_parts = full_name.split(os.path.sep) @@ -90,6 +81,13 @@ class GDStorage(Storage): # GD only requires the filename not a file reader self.uploadf(filename, key, **kwargs) + def clean_key(self, key): + # GDrive does not work well with trailing forward slashes and some keys come with that + if key.startswith('/'): + logger.debug(f'Found and fixed a leading "/" for {key=}') + return key[1:] + return key + def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=True): """ Retrieves the id of a folder or file from its @name and the @parent_id folder From 90cb080c811e2575274a0edf1b9f46abfd4aa1ae Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 14 Jul 2022 18:10:02 +0200 Subject: [PATCH 04/17] refactoring and renaming --- auto_archive.py | 31 ++++++++++++++++--------------- configs/config.py | 9 +++++++-- example.config.yaml | 13 +++++++------ 3 files changed, 30 insertions(+), 23 deletions(-) diff --git a/auto_archive.py b/auto_archive.py index 840ccdc..f12b9c4 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -53,23 +53,24 @@ def missing_required_columns(gw: GWorksheet): return missing +def should_process_sheet(c, sheet_name): + if len(c.worksheet_allow) and sheet_name not in c.worksheet_allow: + # ALLOW rules exist AND sheet name not explicitly allowed + return False + if len(c.worksheet_block) and sheet_name in c.worksheet_block: + # BLOCK rules exist AND sheet name is blocked + return False + return True + + def process_sheet(c: Config): sh = c.gsheets_client.open(c.sheet) # loop through worksheets to check for ii, wks in enumerate(sh.worksheets()): - - whitelist = c.worksheet_whitelist - if whitelist is not None: - if wks.title != whitelist: - logger.debug(f'Ignoring worksheet {wks.title} as not in whitelist which is specified as {whitelist}') - continue - - blacklist = c.worksheet_blacklist - if blacklist is not None: - if wks.title == blacklist: - logger.debug(f'Ignoring worksheet {wks.title} as in blacklist') - continue + if not should_process_sheet(c, wks.title): + logger.info(f'Ignoring worksheet "{wks.title}" due to allow/block configurations') + continue logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}') gw = GWorksheet(wks, header_row=c.header, columns=c.column_names) @@ -93,7 +94,7 @@ def process_sheet(c: Config): if not is_retry: continue # All checks done - archival process starts here - try: + try: gw.set_cell(row, 'status', 'Archive in progress') url = expand_url(url) c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True)) @@ -109,7 +110,7 @@ def process_sheet(c: Config): YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie), TelegramArchiver(storage, c.webdriver), TwitterArchiver(storage, c.webdriver), - VkArchiver(storage, c.webdriver, c.vk_config), + VkArchiver(storage, c.webdriver, c.vk_config), WaybackArchiver(storage, c.webdriver, c.wayback_config) ] @@ -118,7 +119,7 @@ def process_sheet(c: Config): try: result = archiver.download(url, check_if_exists=c.check_if_exists) - except KeyboardInterrupt as e: raise e # so the higher level catch can catch it + except KeyboardInterrupt as e: raise e # so the higher level catch can catch it except Exception as e: result = False logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}') diff --git a/configs/config.py b/configs/config.py index 98fabe9..e9bd084 100644 --- a/configs/config.py +++ b/configs/config.py @@ -51,8 +51,11 @@ class Config: self.sheet = getattr_or(self.args, "sheet", execution.get("sheet")) assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file" - self.worksheet_whitelist = execution.get("worksheet_whitelist") - self.worksheet_blacklist = execution.get("worksheet_blacklist") + def ensure_set(l): + l = l if isinstance(l, list) else [l] + return set([x for x in l if isinstance(x, str) and len(x) > 0]) + self.worksheet_allow = ensure_set(execution.get("worksheet_allow", [])) + self.worksheet_block = ensure_set(execution.get("worksheet_block", [])) self.header = int(getattr_or(self.args, "header", execution.get("header", 1))) self.storage = getattr_or(self.args, "storage", execution.get("storage", "s3")) @@ -250,6 +253,8 @@ class Config: return json.dumps({ "config_file": self.config_file, "sheet": self.sheet, + "worksheet_allow": list(self.worksheet_allow), + "worksheet_block": list(self.worksheet_block), "storage": self.storage, "header": self.header, "check_if_exists": self.check_if_exists, diff --git a/example.config.yaml b/example.config.yaml index 8778bba..3092efc 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -66,12 +66,13 @@ execution: # can be overwritten with CMD --sheet= sheet: your-sheet-name - # only check this worksheet rather than iterating through all worksheets in the spreadsheet. If whitelist is used then blacklist is ignored as whitelist is more restrictive. - # worksheet_whitelist: Sheet1 - - # worksheet to blacklist. Leave blank which is default for none. Useful if users want a MASTERSHEET exact copy of the working worksheet - # worksheet_blacklist: MASTERSHEET - + # block or allow worksheets by name, instead of defaulting to checking all worksheets in a Spreadsheet + # worksheet_allow and worksheet_block can be single values or lists + # if worksheet_allow is specified, worksheet_block is ignored + # worksheet_allow: + # - Sheet1 + # - "Sheet 2" + # worksheet_block: BlockedSheet # which row of your tabs contains the header, can be overwritten with CMD --header= header: 1 From 37e1fcd540e2549ea13a3d22d34a4244d4a640dd Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 14 Jul 2022 18:10:53 +0200 Subject: [PATCH 05/17] comment --- configs/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/configs/config.py b/configs/config.py index e9bd084..41b531a 100644 --- a/configs/config.py +++ b/configs/config.py @@ -52,6 +52,7 @@ class Config: assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file" def ensure_set(l): + # always returns a set of strings, can receive a set or a string l = l if isinstance(l, list) else [l] return set([x for x in l if isinstance(x, str) and len(x) > 0]) self.worksheet_allow = ensure_set(execution.get("worksheet_allow", [])) From 6d8be4c07f2d3ae6e6cdfda4d8bff4ad22420820 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 14 Jul 2022 18:16:06 +0200 Subject: [PATCH 06/17] s3 allow online preview instead of forced download --- storages/s3_storage.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/storages/s3_storage.py b/storages/s3_storage.py index ceb75c1..b124aae 100644 --- a/storages/s3_storage.py +++ b/storages/s3_storage.py @@ -1,4 +1,4 @@ -import uuid, os +import uuid, os, mimetypes from dataclasses import dataclass import boto3 @@ -21,6 +21,7 @@ class S3Config: private: bool = False key_path: str = "default" # 'default' uses full naming, 'random' uses generated uuid + class S3Storage(Storage): def __init__(self, config: S3Config): @@ -70,4 +71,5 @@ class S3Storage(Storage): extra_args = kwargs.get("extra_args", {}) else: extra_args = kwargs.get("extra_args", {'ACL': 'public-read'}) + extra_args['ContentType'] = mimetypes.guess_type(key)[0] self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args) From 363a8ef67a39a55d0386c53a097c6d3a8f1d15a1 Mon Sep 17 00:00:00 2001 From: Dave Mateer Date: Mon, 18 Jul 2022 13:15:48 +0100 Subject: [PATCH 07/17] Added hash_algorithm to config to choose between SHA256 and SHA3_512 --- .gitignore | 4 +++- archivers/base_archiver.py | 14 +++++++++----- archivers/telegram_archiver.py | 3 +++ archivers/telethon_archiver.py | 4 ++-- archivers/tiktok_archiver.py | 3 +++ archivers/twitter_api_archiver.py | 6 +++--- archivers/twitter_archiver.py | 4 ++++ archivers/vk_archiver.py | 4 ++-- archivers/wayback_archiver.py | 4 ++-- archivers/youtubedl_archiver.py | 4 ++-- auto_archive.py | 16 ++++++++-------- configs/config.py | 2 ++ example.config.yaml | 5 +++++ 13 files changed, 48 insertions(+), 25 deletions(-) diff --git a/.gitignore b/.gitignore index 2885782..62a5815 100644 --- a/.gitignore +++ b/.gitignore @@ -16,4 +16,6 @@ config.yaml config-*.yaml logs/* local_archive/ -vk_config*.json \ No newline at end of file +vk_config*.json + +secrets/* \ No newline at end of file diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 815d31e..8951115 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -31,9 +31,10 @@ class Archiver(ABC): name = "default" retry_regex = r"retrying at (\d+)$" - def __init__(self, storage: Storage, driver): + def __init__(self, storage: Storage, driver, hash_algorithm): self.storage = storage self.driver = driver + self.hash_algorithm = hash_algorithm def __str__(self): return self.__class__.__name__ @@ -163,10 +164,13 @@ class Archiver(ABC): def get_hash(self, filename): with open(filename, "rb") as f: bytes = f.read() # read entire file as bytes - # TODO: customizable hash - hash = hashlib.sha256(bytes) - # option to use SHA3_512 instead - # hash = hashlib.sha3_512(bytes) + ha = self.hash_algorithm + logger.debug(f'Hash algorithm is {ha}') + + if ha == "SHA3_512": hash = hashlib.sha3_512(bytes) + elif ha == "SHA256": hash = hashlib.sha256(bytes) + else: raise Exception("Unknown Hash Algorithm of {ha}") + return hash.hexdigest() def get_screenshot(self, url): diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py index 0b6e777..c38dd30 100644 --- a/archivers/telegram_archiver.py +++ b/archivers/telegram_archiver.py @@ -11,6 +11,9 @@ from storages import Storage class TelegramArchiver(Archiver): name = "telegram" + def __init__(self, storage: Storage, driver, hash_algorithm): + super().__init__(storage, driver, hash_algorithm) + def download(self, url, check_if_exists=False): # detect URLs that we definitely cannot handle if 't.me' != self.get_netloc(url): diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index f35e323..bce34d2 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -15,8 +15,8 @@ class TelethonArchiver(Archiver): name = "telethon" link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)") - def __init__(self, storage: Storage, driver, config: TelethonConfig): - super().__init__(storage, driver) + def __init__(self, storage: Storage, driver, config: TelethonConfig, hash_algorithm): + super().__init__(storage, driver, hash_algorithm) if config: self.client = TelegramClient("./anon", config.api_id, config.api_hash) self.bot_token = config.bot_token diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py index 8100bb1..30b8c7c 100644 --- a/archivers/tiktok_archiver.py +++ b/archivers/tiktok_archiver.py @@ -15,6 +15,9 @@ class TiktokArchiver(Archiver): status = 'success' + def __init__(self, storage: Storage, driver, hash_algorithm): + super().__init__(storage, driver, hash_algorithm) + try: info = tiktok_downloader.info_post(url) key = self.get_key(f'{info.id}.mp4') diff --git a/archivers/twitter_api_archiver.py b/archivers/twitter_api_archiver.py index ef2bf40..99fb8f1 100644 --- a/archivers/twitter_api_archiver.py +++ b/archivers/twitter_api_archiver.py @@ -13,8 +13,8 @@ from .twitter_archiver import TwitterArchiver class TwitterApiArchiver(TwitterArchiver): name = "twitter_api" - def __init__(self, storage: Storage, driver, config: TwitterApiConfig): - super().__init__(storage, driver) + def __init__(self, storage: Storage, driver, config: TwitterApiConfig, hash_algorithm): + super().__init__(storage, driver, hash_algorithm) if config.bearer_token: self.api = Api(bearer_token=config.bearer_token) @@ -54,7 +54,7 @@ class TwitterApiArchiver(TwitterArchiver): for u in urls: if u is None: - logger.error(f"Should not have gotten None url for {tweet.includes.media=}") + logger.debug(f"Should not have gotten None url for {tweet.includes.media=} so going to download_alternative in twitter_archiver") return self.download_alternative(url, tweet_id) logger.debug(f"found {urls=}") diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py index 1c1b173..750d2c4 100644 --- a/archivers/twitter_archiver.py +++ b/archivers/twitter_archiver.py @@ -5,12 +5,16 @@ from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo from .base_archiver import Archiver, ArchiveResult +from storages import Storage class TwitterArchiver(Archiver): """ This Twitter Archiver uses unofficial scraping methods, and it works as an alternative to TwitterApiArchiver when no API credentials are provided. """ + def __init__(self, storage: Storage, driver, hash_algorithm): + super().__init__(storage, driver, hash_algorithm) + name = "twitter" link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py index c448367..6ddba10 100644 --- a/archivers/vk_archiver.py +++ b/archivers/vk_archiver.py @@ -17,8 +17,8 @@ class VkArchiver(Archiver): wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)") photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)") - def __init__(self, storage: Storage, driver, config: VkConfig): - super().__init__(storage, driver) + def __init__(self, storage: Storage, driver, config: VkConfig, hash_algorithm): + super().__init__(storage, driver, hash_algorithm) if config != None: self.vks = VkScraper(config.username, config.password) diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index f46d1cb..c19ca4f 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -15,8 +15,8 @@ class WaybackArchiver(Archiver): """ name = "wayback" - def __init__(self, storage: Storage, driver, config: WaybackConfig): - super(WaybackArchiver, self).__init__(storage, driver) + def __init__(self, storage: Storage, driver, config: WaybackConfig, hash_algorithm): + super(WaybackArchiver, self).__init__(storage, driver, hash_algorithm) self.config = config self.seen_urls = {} diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py index 7990131..a41b6c6 100644 --- a/archivers/youtubedl_archiver.py +++ b/archivers/youtubedl_archiver.py @@ -12,8 +12,8 @@ class YoutubeDLArchiver(Archiver): name = "youtube_dl" ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False} - def __init__(self, storage: Storage, driver, fb_cookie): - super().__init__(storage, driver) + def __init__(self, storage: Storage, driver, fb_cookie, hash_algorithm): + super().__init__(storage, driver, hash_algorithm) self.fb_cookie = fb_cookie def download(self, url, check_if_exists=False): diff --git a/auto_archive.py b/auto_archive.py index f12b9c4..72cb748 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -104,14 +104,14 @@ def process_sheet(c: Config): # order matters, first to succeed excludes remaining active_archivers = [ - TelethonArchiver(storage, c.webdriver, c.telegram_config), - TiktokArchiver(storage, c.webdriver), - TwitterApiArchiver(storage, c.webdriver, c.twitter_config), - YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie), - TelegramArchiver(storage, c.webdriver), - TwitterArchiver(storage, c.webdriver), - VkArchiver(storage, c.webdriver, c.vk_config), - WaybackArchiver(storage, c.webdriver, c.wayback_config) + TelethonArchiver(storage, c.webdriver, c.telegram_config, c.hash_algorithm), + TiktokArchiver(storage, c.webdriver, c.hash_algorithm), + TwitterApiArchiver(storage, c.webdriver, c.twitter_config, c.hash_algorithm), + YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie,c.hash_algorithm), + TelegramArchiver(storage, c.webdriver, c.hash_algorithm), + TwitterArchiver(storage, c.webdriver, c.hash_algorithm), + VkArchiver(storage, c.webdriver, c.vk_config, c.hash_algorithm), + WaybackArchiver(storage, c.webdriver, c.wayback_config, c.hash_algorithm) ] for archiver in active_archivers: diff --git a/configs/config.py b/configs/config.py index 41b531a..2d134da 100644 --- a/configs/config.py +++ b/configs/config.py @@ -81,6 +81,8 @@ class Config: ) self.webdriver = "not initialized" + self.hash_algorithm = execution.get("hash_algorithm") + # ---------------------- SECRETS - APIs and service configurations secrets = self.config.get("secrets", {}) diff --git a/example.config.yaml b/example.config.yaml index 3092efc..f823c47 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -104,3 +104,8 @@ execution: duration: duration screenshot: screenshot hash: hash + + # Must be either SHA256 or SHA3_512 + hash_algorithm: SHA3_512 + # hash_algorithm: SHA256 + From 9f9b9d8f634193bc7c202146da09cb8c2e6ac865 Mon Sep 17 00:00:00 2001 From: Dave Mateer Date: Mon, 18 Jul 2022 13:25:05 +0100 Subject: [PATCH 08/17] adding in GD token --- storages/gd_storage.py | 52 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/storages/gd_storage.py b/storages/gd_storage.py index d9a11de..e60e37f 100644 --- a/storages/gd_storage.py +++ b/storages/gd_storage.py @@ -8,19 +8,54 @@ from googleapiclient.http import MediaFileUpload from google.oauth2 import service_account +from google.oauth2.credentials import Credentials +from google.auth.transport.requests import Request + @dataclass class GDConfig: root_folder_id: str + oauth_token_file_path_and_name: str + service_account: str folder: str = "default" - service_account: str = "service_account.json" - class GDStorage(Storage): def __init__(self, config: GDConfig): self.folder = config.folder self.root_folder_id = config.root_folder_id - creds = service_account.Credentials.from_service_account_file( - config.service_account, scopes=['https://www.googleapis.com/auth/drive']) + + SCOPES=['https://www.googleapis.com/auth/drive'] + + token_file = config.oauth_token_file_path_and_name + if token_file is not None: + """ + Tokens are refreshed after 1 hour + however keep working for 7 days (tbc) + so as long as the job doesn't last for 7 days + then this method of refreshing only once per run will work + see this link for details on the token + https://davemateer.com/2022/04/28/google-drive-with-python#tokens + """ + logger.debug(f'Using GD OAuth token {token_file}') + creds = Credentials.from_authorized_user_file(token_file, SCOPES) + + if not creds or not creds.valid: + if creds and creds.expired and creds.refresh_token: + logger.debug('Requesting new GD OAuth token') + creds.refresh(Request()) + else: + raise Exception("Problem with creds - create the token again") + + # Save the credentials for the next run + with open(token_file, 'w') as token: + logger.debug('Saving new GD OAuth token') + token.write(creds.to_json()) + else: + logger.debug('GD OAuth Token valid') + else: + gd_service_account = config.service_account + logger.debug(f'Using GD Service Account {gd_service_account}') + creds = service_account.Credentials.from_service_account_file(gd_service_account, scopes=SCOPES) + self.service = build('drive', 'v3', credentials=creds) def get_cdn_url(self, key): @@ -88,13 +123,18 @@ class GDStorage(Storage): return key[1:] return key - def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=True): + # gets the Drive folderID if it is there + def _get_id_from_parent_and_name(self, parent_id: str, name: str, retries: int = 1, sleep_seconds: int = 10, use_mime_type: bool = False, raise_on_missing: bool = True, use_cache=False): """ Retrieves the id of a folder or file from its @name and the @parent_id folder Optionally does multiple @retries and sleeps @sleep_seconds between them If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'" If @raise_on_missing will throw error when not found, or returns None Will remember previous calls to avoid duplication if @use_cache + DM - caching giving a perf improvement in order of 41s to 46s + So I prefer not to use yet, purely as caching notoriously hard in terms of edge cases + and pro's don't outweigh cons for me (yet) + to be fair I just need to test this and make sure it always runs well! Returns the id of the file or folder from its name as a string """ # cache logic @@ -107,7 +147,7 @@ class GDStorage(Storage): # API logic debug_header: str = f"[searching {name=} in {parent_id=}]" - query_string = f"'{parent_id}' in parents and name = '{name}' " + query_string = f"'{parent_id}' in parents and name = '{name}' and trashed = false " if use_mime_type: query_string += f" and mimeType='application/vnd.google-apps.folder' " From 524b40b8692c00d26a35ca256fbf91dab6369c40 Mon Sep 17 00:00:00 2001 From: Dave Mateer Date: Mon, 18 Jul 2022 13:39:00 +0100 Subject: [PATCH 09/17] Added Google OAuth flow for Google Drive so can use a real user and not a service account to save files --- configs/config.py | 3 +- create_update_test_oauth_token.py | 77 +++++++++++++++++++++++++++++++ example.config.yaml | 12 ++++- 3 files changed, 89 insertions(+), 3 deletions(-) create mode 100644 create_update_test_oauth_token.py diff --git a/configs/config.py b/configs/config.py index 2d134da..2298c51 100644 --- a/configs/config.py +++ b/configs/config.py @@ -117,7 +117,8 @@ class Config: gd = secrets["google_drive"] self.gd_config = GDConfig( root_folder_id=gd.get("root_folder_id"), - service_account=gd.get("service_account", GDConfig.service_account) + oauth_token_file_path_and_name=gd.get("oauth_token_file_path_and_name"), + service_account=gd.get("service_account") ) if "local" in secrets: diff --git a/create_update_test_oauth_token.py b/create_update_test_oauth_token.py new file mode 100644 index 0000000..cfe2709 --- /dev/null +++ b/create_update_test_oauth_token.py @@ -0,0 +1,77 @@ +from __future__ import print_function + +import os.path + +from google.auth.transport.requests import Request +from google.oauth2.credentials import Credentials +from google_auth_oauthlib.flow import InstalledAppFlow +from googleapiclient.discovery import build +from googleapiclient.errors import HttpError + +from googleapiclient.http import MediaFileUpload + +# If creating for first time download the json `credentials.json` from https://console.cloud.google.com/apis/credentials OAuth 2.0 Client IDs +# https://davemateer.com/2022/04/28/google-drive-with-python for more information + +# Can run this code to get a new token and verify the token is the correct user +# and it will refresh the token accordingly + +# Code below from https://developers.google.com/drive/api/quickstart/python + +SCOPES = ['https://www.googleapis.com/auth/drive'] + +def main(): + # token_file = 'gd-token.json' + + token_file = 'secrets/token-davemateer-gmail.json' + + creds = None + + # The file token.json stores the user's access and refresh tokens, and is + # created automatically when the authorization flow completes for the first + # time. + if os.path.exists(token_file): + creds = Credentials.from_authorized_user_file(token_file, SCOPES) + + # If there are no (valid) credentials available, let the user log in. + if not creds or not creds.valid: + if creds and creds.expired and creds.refresh_token: + print('Requesting new token') + creds.refresh(Request()) + else: + print('First run through so putting up login dialog') + # credentials.json downloaded from https://console.cloud.google.com/apis/credentials + flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES) + creds = flow.run_local_server(port=0) + # Save the credentials for the next run + with open(token_file, 'w') as token: + print('Saving new token') + token.write(creds.to_json()) + else: + print('Token valid') + + try: + service = build('drive', 'v3', credentials=creds) + + # About the user + results = service.about().get(fields="*").execute() + emailAddress = results['user']['emailAddress'] + print(emailAddress) + + # Call the Drive v3 API and return some files + results = service.files().list( + pageSize=10, fields="nextPageToken, files(id, name)").execute() + items = results.get('files', []) + + if not items: + print('No files found.') + return + print('Files:') + for item in items: + print(u'{0} ({1})'.format(item['name'], item['id'])) + + except HttpError as error: + print(f'An error occurred: {error}') + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/example.config.yaml b/example.config.yaml index f823c47..60753fa 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -18,8 +18,16 @@ secrets: # needed if you use storage=gd google_drive: - # local filename can be the same or different file from google_sheets.service_account, defaults to service_account.json - service_account: "service_account.json" + # 1.service account to write to google storage - be aware of 15GB limit. Recommend using OAuth user. + # filename can be the same or different file from google_sheets.service_account + # service_account: "service_account.json" + + # 2.token (only 1. or 2. - if both specified then this 2. token takes precedence) + # will need to have write access on the server so refresh flow works + # run the file `create_update_test_oauth_token.py` to create the token and save in a secrets directory so + # it is not checked into source control + oauth_token_file_path_and_name: "secrets/token-davemateer-gmail.json" + root_folder_id: copy XXXX from https://drive.google.com/drive/folders/XXXX # needed if you use storage=local From 7b8be95e250dc3f0c42ca8840972b08048eacadf Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 25 Jul 2022 12:12:14 +0100 Subject: [PATCH 10/17] removing empty line --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index 62a5815..2059faa 100644 --- a/.gitignore +++ b/.gitignore @@ -17,5 +17,4 @@ config-*.yaml logs/* local_archive/ vk_config*.json - secrets/* \ No newline at end of file From 2d7d8c4e0803095a967ff78aeba42933a1a8f835 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 25 Jul 2022 12:12:43 +0100 Subject: [PATCH 11/17] renaming and making default SHA-256 --- archivers/base_archiver.py | 4 ++-- configs/config.py | 3 ++- example.config.yaml | 8 ++++---- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index 8951115..b377d31 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -167,8 +167,8 @@ class Archiver(ABC): ha = self.hash_algorithm logger.debug(f'Hash algorithm is {ha}') - if ha == "SHA3_512": hash = hashlib.sha3_512(bytes) - elif ha == "SHA256": hash = hashlib.sha256(bytes) + if ha == "SHA3-512": hash = hashlib.sha3_512(bytes) + elif ha == "SHA-256": hash = hashlib.sha256(bytes) else: raise Exception("Unknown Hash Algorithm of {ha}") return hash.hexdigest() diff --git a/configs/config.py b/configs/config.py index 2d134da..4003282 100644 --- a/configs/config.py +++ b/configs/config.py @@ -81,7 +81,7 @@ class Config: ) self.webdriver = "not initialized" - self.hash_algorithm = execution.get("hash_algorithm") + self.hash_algorithm = execution.get("hash_algorithm", "SHA-256") # ---------------------- SECRETS - APIs and service configurations secrets = self.config.get("secrets", {}) @@ -261,6 +261,7 @@ class Config: "storage": self.storage, "header": self.header, "check_if_exists": self.check_if_exists, + "hash_algorithm": self.hash_algorithm, "save_logs": self.save_logs, "selenium_config": asdict(self.selenium_config), "selenium_webdriver": self.webdriver != None, diff --git a/example.config.yaml b/example.config.yaml index f823c47..2cded09 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -80,6 +80,10 @@ execution: storage: s3 # defaults to false, when true will try to avoid duplicate URL archives check_if_exists: true + + # choose a hash algorithm (either SHA-256 or SHA3-512, defaults to SHA-256) + # hash_algorithm: SHA-256 + # optional configurations for the selenium browser that takes screenshots, these are the defaults selenium: # values under 10s might mean screenshots fail to grab screenshot @@ -105,7 +109,3 @@ execution: screenshot: screenshot hash: hash - # Must be either SHA256 or SHA3_512 - hash_algorithm: SHA3_512 - # hash_algorithm: SHA256 - From 9317b5e03582a6a27d7eb3318fcbe8ea870fc091 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 25 Jul 2022 12:27:50 +0100 Subject: [PATCH 12/17] turning HASH_ALGORITHM into global archiver prop --- archivers/base_archiver.py | 15 ++++++--------- archivers/telegram_archiver.py | 4 ++-- archivers/telethon_archiver.py | 4 ++-- archivers/tiktok_archiver.py | 4 ++-- archivers/twitter_api_archiver.py | 4 ++-- archivers/twitter_archiver.py | 4 ++-- archivers/vk_archiver.py | 4 ++-- archivers/wayback_archiver.py | 4 ++-- archivers/youtubedl_archiver.py | 4 ++-- auto_archive.py | 16 ++++++++-------- configs/config.py | 5 +++-- 11 files changed, 33 insertions(+), 35 deletions(-) diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py index b377d31..902f626 100644 --- a/archivers/base_archiver.py +++ b/archivers/base_archiver.py @@ -26,15 +26,14 @@ class ArchiveResult: screenshot: str = None hash: str = None - class Archiver(ABC): + HASH_ALGORITHM="SHA-256" # can be overwritten by user configs name = "default" retry_regex = r"retrying at (\d+)$" - def __init__(self, storage: Storage, driver, hash_algorithm): + def __init__(self, storage: Storage, driver): self.storage = storage self.driver = driver - self.hash_algorithm = hash_algorithm def __str__(self): return self.__class__.__name__ @@ -48,7 +47,6 @@ class Archiver(ABC): def get_netloc(self, url): return urlparse(url).netloc - # generate the html page eg SM3013/twitter__minmyatnaing13_status_1499415562937503751.html def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None): """ Generates an index.html page where each @urls_info is displayed @@ -164,12 +162,11 @@ class Archiver(ABC): def get_hash(self, filename): with open(filename, "rb") as f: bytes = f.read() # read entire file as bytes - ha = self.hash_algorithm - logger.debug(f'Hash algorithm is {ha}') + logger.debug(f'Hash algorithm is {self.HASH_ALGORITHM}') - if ha == "SHA3-512": hash = hashlib.sha3_512(bytes) - elif ha == "SHA-256": hash = hashlib.sha256(bytes) - else: raise Exception("Unknown Hash Algorithm of {ha}") + if self.HASH_ALGORITHM == "SHA-256": hash = hashlib.sha256(bytes) + elif self.HASH_ALGORITHM == "SHA3-512": hash = hashlib.sha3_512(bytes) + else: raise Exception(f"Unknown Hash Algorithm of {self.HASH_ALGORITHM}") return hash.hexdigest() diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py index c38dd30..4b2e59c 100644 --- a/archivers/telegram_archiver.py +++ b/archivers/telegram_archiver.py @@ -11,8 +11,8 @@ from storages import Storage class TelegramArchiver(Archiver): name = "telegram" - def __init__(self, storage: Storage, driver, hash_algorithm): - super().__init__(storage, driver, hash_algorithm) + def __init__(self, storage: Storage, driver): + super().__init__(storage, driver) def download(self, url, check_if_exists=False): # detect URLs that we definitely cannot handle diff --git a/archivers/telethon_archiver.py b/archivers/telethon_archiver.py index bce34d2..f35e323 100644 --- a/archivers/telethon_archiver.py +++ b/archivers/telethon_archiver.py @@ -15,8 +15,8 @@ class TelethonArchiver(Archiver): name = "telethon" link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)") - def __init__(self, storage: Storage, driver, config: TelethonConfig, hash_algorithm): - super().__init__(storage, driver, hash_algorithm) + def __init__(self, storage: Storage, driver, config: TelethonConfig): + super().__init__(storage, driver) if config: self.client = TelegramClient("./anon", config.api_id, config.api_hash) self.bot_token = config.bot_token diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py index 30b8c7c..771a7f4 100644 --- a/archivers/tiktok_archiver.py +++ b/archivers/tiktok_archiver.py @@ -15,8 +15,8 @@ class TiktokArchiver(Archiver): status = 'success' - def __init__(self, storage: Storage, driver, hash_algorithm): - super().__init__(storage, driver, hash_algorithm) + def __init__(self, storage: Storage, driver): + super().__init__(storage, driver) try: info = tiktok_downloader.info_post(url) diff --git a/archivers/twitter_api_archiver.py b/archivers/twitter_api_archiver.py index 99fb8f1..6aa1742 100644 --- a/archivers/twitter_api_archiver.py +++ b/archivers/twitter_api_archiver.py @@ -13,8 +13,8 @@ from .twitter_archiver import TwitterArchiver class TwitterApiArchiver(TwitterArchiver): name = "twitter_api" - def __init__(self, storage: Storage, driver, config: TwitterApiConfig, hash_algorithm): - super().__init__(storage, driver, hash_algorithm) + def __init__(self, storage: Storage, driver, config: TwitterApiConfig): + super().__init__(storage, driver) if config.bearer_token: self.api = Api(bearer_token=config.bearer_token) diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py index 750d2c4..6fe5901 100644 --- a/archivers/twitter_archiver.py +++ b/archivers/twitter_archiver.py @@ -12,8 +12,8 @@ class TwitterArchiver(Archiver): This Twitter Archiver uses unofficial scraping methods, and it works as an alternative to TwitterApiArchiver when no API credentials are provided. """ - def __init__(self, storage: Storage, driver, hash_algorithm): - super().__init__(storage, driver, hash_algorithm) + def __init__(self, storage: Storage, driver): + super().__init__(storage, driver) name = "twitter" link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") diff --git a/archivers/vk_archiver.py b/archivers/vk_archiver.py index 6ddba10..c448367 100644 --- a/archivers/vk_archiver.py +++ b/archivers/vk_archiver.py @@ -17,8 +17,8 @@ class VkArchiver(Archiver): wall_pattern = re.compile(r"(wall.{0,1}\d+_\d+)") photo_pattern = re.compile(r"(photo.{0,1}\d+_\d+)") - def __init__(self, storage: Storage, driver, config: VkConfig, hash_algorithm): - super().__init__(storage, driver, hash_algorithm) + def __init__(self, storage: Storage, driver, config: VkConfig): + super().__init__(storage, driver) if config != None: self.vks = VkScraper(config.username, config.password) diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py index c19ca4f..f46d1cb 100644 --- a/archivers/wayback_archiver.py +++ b/archivers/wayback_archiver.py @@ -15,8 +15,8 @@ class WaybackArchiver(Archiver): """ name = "wayback" - def __init__(self, storage: Storage, driver, config: WaybackConfig, hash_algorithm): - super(WaybackArchiver, self).__init__(storage, driver, hash_algorithm) + def __init__(self, storage: Storage, driver, config: WaybackConfig): + super(WaybackArchiver, self).__init__(storage, driver) self.config = config self.seen_urls = {} diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py index a41b6c6..7990131 100644 --- a/archivers/youtubedl_archiver.py +++ b/archivers/youtubedl_archiver.py @@ -12,8 +12,8 @@ class YoutubeDLArchiver(Archiver): name = "youtube_dl" ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False} - def __init__(self, storage: Storage, driver, fb_cookie, hash_algorithm): - super().__init__(storage, driver, hash_algorithm) + def __init__(self, storage: Storage, driver, fb_cookie): + super().__init__(storage, driver) self.fb_cookie = fb_cookie def download(self, url, check_if_exists=False): diff --git a/auto_archive.py b/auto_archive.py index 72cb748..c9a6b08 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -104,14 +104,14 @@ def process_sheet(c: Config): # order matters, first to succeed excludes remaining active_archivers = [ - TelethonArchiver(storage, c.webdriver, c.telegram_config, c.hash_algorithm), - TiktokArchiver(storage, c.webdriver, c.hash_algorithm), - TwitterApiArchiver(storage, c.webdriver, c.twitter_config, c.hash_algorithm), - YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie,c.hash_algorithm), - TelegramArchiver(storage, c.webdriver, c.hash_algorithm), - TwitterArchiver(storage, c.webdriver, c.hash_algorithm), - VkArchiver(storage, c.webdriver, c.vk_config, c.hash_algorithm), - WaybackArchiver(storage, c.webdriver, c.wayback_config, c.hash_algorithm) + TelethonArchiver(storage, c.webdriver, c.telegram_config), + TiktokArchiver(storage, c.webdriver), + TwitterApiArchiver(storage, c.webdriver, c.twitter_config,), + YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie), + TelegramArchiver(storage, c.webdriver), + TwitterArchiver(storage, c.webdriver), + VkArchiver(storage, c.webdriver, c.vk_config), + WaybackArchiver(storage, c.webdriver, c.wayback_config) ] for archiver in active_archivers: diff --git a/configs/config.py b/configs/config.py index 4003282..063c4d7 100644 --- a/configs/config.py +++ b/configs/config.py @@ -1,5 +1,6 @@ import argparse, yaml, json +from archivers.base_archiver import Archiver import gspread from loguru import logger from selenium import webdriver @@ -81,7 +82,7 @@ class Config: ) self.webdriver = "not initialized" - self.hash_algorithm = execution.get("hash_algorithm", "SHA-256") + Archiver.HASH_ALGORITHM = execution.get("hash_algorithm", Archiver.HASH_ALGORITHM) # ---------------------- SECRETS - APIs and service configurations secrets = self.config.get("secrets", {}) @@ -261,7 +262,7 @@ class Config: "storage": self.storage, "header": self.header, "check_if_exists": self.check_if_exists, - "hash_algorithm": self.hash_algorithm, + "hash_algorithm": Archiver.HASH_ALGORITHM, "save_logs": self.save_logs, "selenium_config": asdict(self.selenium_config), "selenium_webdriver": self.webdriver != None, From e180b82b0d7e7f0168d5771fcbc8f8bf385b4cca Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 25 Jul 2022 12:29:42 +0100 Subject: [PATCH 13/17] removing useless constructors --- archivers/telegram_archiver.py | 3 --- archivers/tiktok_archiver.py | 3 --- archivers/twitter_archiver.py | 4 ---- 3 files changed, 10 deletions(-) diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py index 4b2e59c..0b6e777 100644 --- a/archivers/telegram_archiver.py +++ b/archivers/telegram_archiver.py @@ -11,9 +11,6 @@ from storages import Storage class TelegramArchiver(Archiver): name = "telegram" - def __init__(self, storage: Storage, driver): - super().__init__(storage, driver) - def download(self, url, check_if_exists=False): # detect URLs that we definitely cannot handle if 't.me' != self.get_netloc(url): diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py index 771a7f4..8100bb1 100644 --- a/archivers/tiktok_archiver.py +++ b/archivers/tiktok_archiver.py @@ -15,9 +15,6 @@ class TiktokArchiver(Archiver): status = 'success' - def __init__(self, storage: Storage, driver): - super().__init__(storage, driver) - try: info = tiktok_downloader.info_post(url) key = self.get_key(f'{info.id}.mp4') diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py index 6fe5901..8f646fd 100644 --- a/archivers/twitter_archiver.py +++ b/archivers/twitter_archiver.py @@ -5,15 +5,11 @@ from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo from .base_archiver import Archiver, ArchiveResult -from storages import Storage - class TwitterArchiver(Archiver): """ This Twitter Archiver uses unofficial scraping methods, and it works as an alternative to TwitterApiArchiver when no API credentials are provided. """ - def __init__(self, storage: Storage, driver): - super().__init__(storage, driver) name = "twitter" link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)") From 63140d69c145adb44c22081e7742f1bdc6ca633f Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 25 Jul 2022 12:35:27 +0100 Subject: [PATCH 14/17] format --- auto_archive.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/auto_archive.py b/auto_archive.py index c9a6b08..f12b9c4 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -106,11 +106,11 @@ def process_sheet(c: Config): active_archivers = [ TelethonArchiver(storage, c.webdriver, c.telegram_config), TiktokArchiver(storage, c.webdriver), - TwitterApiArchiver(storage, c.webdriver, c.twitter_config,), + TwitterApiArchiver(storage, c.webdriver, c.twitter_config), YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie), TelegramArchiver(storage, c.webdriver), TwitterArchiver(storage, c.webdriver), - VkArchiver(storage, c.webdriver, c.vk_config), + VkArchiver(storage, c.webdriver, c.vk_config), WaybackArchiver(storage, c.webdriver, c.wayback_config) ] From 6124bc5f72b9d2c6c3af62169bae5140be8f3f15 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 25 Jul 2022 14:52:50 +0100 Subject: [PATCH 15/17] refactored and simplified obtaining credentials --- .gitignore | 3 ++- configs/config.py | 4 ++-- create_update_test_oauth_token.py | 22 +++++++++------------- example.config.yaml | 17 ++++++++++------- storages/gd_storage.py | 6 +++--- 5 files changed, 26 insertions(+), 26 deletions(-) diff --git a/.gitignore b/.gitignore index 62a5815..8da75c3 100644 --- a/.gitignore +++ b/.gitignore @@ -17,5 +17,6 @@ config-*.yaml logs/* local_archive/ vk_config*.json - +gd-token.json +credentials.json secrets/* \ No newline at end of file diff --git a/configs/config.py b/configs/config.py index 2298c51..1169048 100644 --- a/configs/config.py +++ b/configs/config.py @@ -117,8 +117,8 @@ class Config: gd = secrets["google_drive"] self.gd_config = GDConfig( root_folder_id=gd.get("root_folder_id"), - oauth_token_file_path_and_name=gd.get("oauth_token_file_path_and_name"), - service_account=gd.get("service_account") + oauth_token_filename=gd.get("oauth_token_filename"), + service_account=gd.get("service_account", GDConfig.service_account) ) if "local" in secrets: diff --git a/create_update_test_oauth_token.py b/create_update_test_oauth_token.py index cfe2709..65b3086 100644 --- a/create_update_test_oauth_token.py +++ b/create_update_test_oauth_token.py @@ -1,5 +1,3 @@ -from __future__ import print_function - import os.path from google.auth.transport.requests import Request @@ -8,23 +6,20 @@ from google_auth_oauthlib.flow import InstalledAppFlow from googleapiclient.discovery import build from googleapiclient.errors import HttpError -from googleapiclient.http import MediaFileUpload - -# If creating for first time download the json `credentials.json` from https://console.cloud.google.com/apis/credentials OAuth 2.0 Client IDs +# If creating for first time download the OAuth Client Ids json `credentials.json` from https://console.cloud.google.com/apis/credentials OAuth 2.0 Client IDs +# add "http://localhost:55192/" to the list of "Authorised redirect URIs" # https://davemateer.com/2022/04/28/google-drive-with-python for more information -# Can run this code to get a new token and verify the token is the correct user -# and it will refresh the token accordingly +# You can run this code to get a new token and verify it belongs to the correct user +# This token will be refresh automatically by the auto-archiver # Code below from https://developers.google.com/drive/api/quickstart/python SCOPES = ['https://www.googleapis.com/auth/drive'] + def main(): - # token_file = 'gd-token.json' - - token_file = 'secrets/token-davemateer-gmail.json' - + token_file = 'gd-token.json' creds = None # The file token.json stores the user's access and refresh tokens, and is @@ -42,7 +37,7 @@ def main(): print('First run through so putting up login dialog') # credentials.json downloaded from https://console.cloud.google.com/apis/credentials flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES) - creds = flow.run_local_server(port=0) + creds = flow.run_local_server(port=55192) # Save the credentials for the next run with open(token_file, 'w') as token: print('Saving new token') @@ -73,5 +68,6 @@ def main(): except HttpError as error: print(f'An error occurred: {error}') + if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/example.config.yaml b/example.config.yaml index 60753fa..dc78803 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -18,15 +18,18 @@ secrets: # needed if you use storage=gd google_drive: - # 1.service account to write to google storage - be aware of 15GB limit. Recommend using OAuth user. - # filename can be the same or different file from google_sheets.service_account + # To authenticate with google you have two options (1. service account OR 2. OAuth token) + + # 1. service account - storage space will count towards the developer account + # filename can be the same or different file from google_sheets.service_account, defaults to "service_account.json" # service_account: "service_account.json" - # 2.token (only 1. or 2. - if both specified then this 2. token takes precedence) - # will need to have write access on the server so refresh flow works - # run the file `create_update_test_oauth_token.py` to create the token and save in a secrets directory so - # it is not checked into source control - oauth_token_file_path_and_name: "secrets/token-davemateer-gmail.json" + # 2. OAuth token - storage space will count towards the owner of the GDrive folder + # (only 1. or 2. - if both specified then this 2. takes precedence) + # needs write access on the server so refresh flow works + # To get the token, run the file `create_update_test_oauth_token.py` + # you can edit that file if you want a different token filename, default is "gd-token.json" + oauth_token_filename: "gd-token.json" root_folder_id: copy XXXX from https://drive.google.com/drive/folders/XXXX diff --git a/storages/gd_storage.py b/storages/gd_storage.py index e60e37f..be12625 100644 --- a/storages/gd_storage.py +++ b/storages/gd_storage.py @@ -14,8 +14,8 @@ from google.auth.transport.requests import Request @dataclass class GDConfig: root_folder_id: str - oauth_token_file_path_and_name: str - service_account: str + oauth_token_filename: str + service_account: str = "service_account.json" folder: str = "default" class GDStorage(Storage): @@ -25,7 +25,7 @@ class GDStorage(Storage): SCOPES=['https://www.googleapis.com/auth/drive'] - token_file = config.oauth_token_file_path_and_name + token_file = config.oauth_token_filename if token_file is not None: """ Tokens are refreshed after 1 hour From 992dee022a366718ce22e58d9d66783daf12247e Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 25 Jul 2022 14:59:04 +0100 Subject: [PATCH 16/17] format --- storages/gd_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storages/gd_storage.py b/storages/gd_storage.py index be12625..933c168 100644 --- a/storages/gd_storage.py +++ b/storages/gd_storage.py @@ -28,7 +28,7 @@ class GDStorage(Storage): token_file = config.oauth_token_filename if token_file is not None: """ - Tokens are refreshed after 1 hour + Tokens are refreshed after 1 hour however keep working for 7 days (tbc) so as long as the job doesn't last for 7 days then this method of refreshing only once per run will work From c77b4a080a84c701a8a3c000776d269576d02027 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Wed, 21 Sep 2022 18:52:23 +0200 Subject: [PATCH 17/17] update comment --- storages/gd_storage.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/storages/gd_storage.py b/storages/gd_storage.py index 933c168..5f3bbeb 100644 --- a/storages/gd_storage.py +++ b/storages/gd_storage.py @@ -130,11 +130,7 @@ class GDStorage(Storage): Optionally does multiple @retries and sleeps @sleep_seconds between them If @use_mime_type will restrict search to "mimeType='application/vnd.google-apps.folder'" If @raise_on_missing will throw error when not found, or returns None - Will remember previous calls to avoid duplication if @use_cache - DM - caching giving a perf improvement in order of 41s to 46s - So I prefer not to use yet, purely as caching notoriously hard in terms of edge cases - and pro's don't outweigh cons for me (yet) - to be fair I just need to test this and make sure it always runs well! + Will remember previous calls to avoid duplication if @use_cache - might not have all edge cases tested, so use at own risk Returns the id of the file or folder from its name as a string """ # cache logic