From 42172566f20cd3ba96a23a2ad2a6343565071a38 Mon Sep 17 00:00:00 2001 From: Dave Mateer Date: Tue, 12 Jul 2022 12:53:59 +0100 Subject: [PATCH 1/3] Added whitelist and blacklist for worksheets (not spreadsheet) --- auto_archive.py | 13 +++++++++++++ configs/config.py | 4 ++++ example.config.yaml | 8 ++++++++ 3 files changed, 25 insertions(+) diff --git a/auto_archive.py b/auto_archive.py index 375c5be..840ccdc 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -58,6 +58,19 @@ def process_sheet(c: Config): # loop through worksheets to check for ii, wks in enumerate(sh.worksheets()): + + whitelist = c.worksheet_whitelist + if whitelist is not None: + if wks.title != whitelist: + logger.debug(f'Ignoring worksheet {wks.title} as not in whitelist which is specified as {whitelist}') + continue + + blacklist = c.worksheet_blacklist + if blacklist is not None: + if wks.title == blacklist: + logger.debug(f'Ignoring worksheet {wks.title} as in blacklist') + continue + logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}') gw = GWorksheet(wks, header_row=c.header, columns=c.column_names) diff --git a/configs/config.py b/configs/config.py index 4232651..98fabe9 100644 --- a/configs/config.py +++ b/configs/config.py @@ -50,6 +50,10 @@ class Config: self.sheet = getattr_or(self.args, "sheet", execution.get("sheet")) assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file" + + self.worksheet_whitelist = execution.get("worksheet_whitelist") + self.worksheet_blacklist = execution.get("worksheet_blacklist") + self.header = int(getattr_or(self.args, "header", execution.get("header", 1))) self.storage = getattr_or(self.args, "storage", execution.get("storage", "s3")) self.save_logs = getattr(self.args, "save_logs") or execution.get("save_logs", False) diff --git a/example.config.yaml b/example.config.yaml index c5b6a76..8778bba 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -65,6 +65,14 @@ secrets: 
execution: # can be overwritten with CMD --sheet= sheet: your-sheet-name + + # only check this worksheet rather than iterating through all worksheets in the spreadsheet. If whitelist is used then blacklist is ignored as whitelist is more restrictive. + # worksheet_whitelist: Sheet1 + + # worksheet to blacklist. Leave blank which is default for none. Useful if users want a MASTERSHEET exact copy of the working worksheet + # worksheet_blacklist: MASTERSHEET + + # which row of your tabs contains the header, can be overwritten with CMD --header= header: 1 # which storage to use, can be overwritten with CMD --storage= From 90cb080c811e2575274a0edf1b9f46abfd4aa1ae Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 14 Jul 2022 18:10:02 +0200 Subject: [PATCH 2/3] refactoring and renaming --- auto_archive.py | 31 ++++++++++++++++--------------- configs/config.py | 9 +++++++-- example.config.yaml | 13 +++++++------ 3 files changed, 30 insertions(+), 23 deletions(-) diff --git a/auto_archive.py b/auto_archive.py index 840ccdc..f12b9c4 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -53,23 +53,24 @@ def missing_required_columns(gw: GWorksheet): return missing +def should_process_sheet(c, sheet_name): + if len(c.worksheet_allow) and sheet_name not in c.worksheet_allow: + # ALLOW rules exist AND sheet name not explicitly allowed + return False + if len(c.worksheet_block) and sheet_name in c.worksheet_block: + # BLOCK rules exist AND sheet name is blocked + return False + return True + + def process_sheet(c: Config): sh = c.gsheets_client.open(c.sheet) # loop through worksheets to check for ii, wks in enumerate(sh.worksheets()): - - whitelist = c.worksheet_whitelist - if whitelist is not None: - if wks.title != whitelist: - logger.debug(f'Ignoring worksheet {wks.title} as not in whitelist which is specified as {whitelist}') - continue - - blacklist = c.worksheet_blacklist - if blacklist is not None: - if wks.title == blacklist: 
- logger.debug(f'Ignoring worksheet {wks.title} as in blacklist') - continue + if not should_process_sheet(c, wks.title): + logger.info(f'Ignoring worksheet "{wks.title}" due to allow/block configurations') + continue logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}') gw = GWorksheet(wks, header_row=c.header, columns=c.column_names) @@ -93,7 +94,7 @@ def process_sheet(c: Config): if not is_retry: continue # All checks done - archival process starts here - try: + try: gw.set_cell(row, 'status', 'Archive in progress') url = expand_url(url) c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True)) @@ -109,7 +110,7 @@ def process_sheet(c: Config): YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie), TelegramArchiver(storage, c.webdriver), TwitterArchiver(storage, c.webdriver), - VkArchiver(storage, c.webdriver, c.vk_config), + VkArchiver(storage, c.webdriver, c.vk_config), WaybackArchiver(storage, c.webdriver, c.wayback_config) ] @@ -118,7 +119,7 @@ def process_sheet(c: Config): try: result = archiver.download(url, check_if_exists=c.check_if_exists) - except KeyboardInterrupt as e: raise e # so the higher level catch can catch it + except KeyboardInterrupt as e: raise e # so the higher level catch can catch it except Exception as e: result = False logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}') diff --git a/configs/config.py b/configs/config.py index 98fabe9..e9bd084 100644 --- a/configs/config.py +++ b/configs/config.py @@ -51,8 +51,11 @@ class Config: self.sheet = getattr_or(self.args, "sheet", execution.get("sheet")) assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file" - self.worksheet_whitelist = execution.get("worksheet_whitelist") - self.worksheet_blacklist = execution.get("worksheet_blacklist") + def ensure_set(l): + l = l if isinstance(l, list) else [l] + return set([x for x in l if 
isinstance(x, str) and len(x) > 0]) + self.worksheet_allow = ensure_set(execution.get("worksheet_allow", [])) + self.worksheet_block = ensure_set(execution.get("worksheet_block", [])) self.header = int(getattr_or(self.args, "header", execution.get("header", 1))) self.storage = getattr_or(self.args, "storage", execution.get("storage", "s3")) @@ -250,6 +253,8 @@ class Config: return json.dumps({ "config_file": self.config_file, "sheet": self.sheet, + "worksheet_allow": list(self.worksheet_allow), + "worksheet_block": list(self.worksheet_block), "storage": self.storage, "header": self.header, "check_if_exists": self.check_if_exists, diff --git a/example.config.yaml b/example.config.yaml index 8778bba..3092efc 100644 --- a/example.config.yaml +++ b/example.config.yaml @@ -66,12 +66,13 @@ execution: # can be overwritten with CMD --sheet= sheet: your-sheet-name - # only check this worksheet rather than iterating through all worksheets in the spreadsheet. If whitelist is used then blacklist is ignored as whitelist is more restrictive. - # worksheet_whitelist: Sheet1 - - # worksheet to blacklist. Leave blank which is default for none. 
Useful if users want a MASTERSHEET exact copy of the working worksheet - # worksheet_blacklist: MASTERSHEET - + # block or allow worksheets by name, instead of defaulting to checking all worksheets in a Spreadsheet + # worksheet_allow and worksheet_block can be single values or lists + # if both are specified, a worksheet is processed only if it is in worksheet_allow and not in worksheet_block + # worksheet_allow: + # - Sheet1 + # - "Sheet 2" + # worksheet_block: BlockedSheet # which row of your tabs contains the header, can be overwritten with CMD --header= header: 1 From 37e1fcd540e2549ea13a3d22d34a4244d4a640dd Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 14 Jul 2022 18:10:53 +0200 Subject: [PATCH 3/3] comment --- configs/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/configs/config.py b/configs/config.py index e9bd084..41b531a 100644 --- a/configs/config.py +++ b/configs/config.py @@ -52,6 +52,7 @@ class Config: assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file" def ensure_set(l): + # always returns a set of strings, can receive a list or a string l = l if isinstance(l, list) else [l] return set([x for x in l if isinstance(x, str) and len(x) > 0]) self.worksheet_allow = ensure_set(execution.get("worksheet_allow", []))