def should_process_sheet(c, sheet_name: str) -> bool:
    """Decide whether the worksheet called *sheet_name* should be processed.

    Both rule sets are applied, one after the other:

    * if ``c.worksheet_allow`` is non-empty, the name must be in it;
    * if ``c.worksheet_block`` is non-empty, the name must not be in it.

    A name that appears in both collections is therefore rejected. Empty
    (or ``None``) collections impose no restriction.

    Args:
        c: configuration object exposing ``worksheet_allow`` and
            ``worksheet_block`` collections of worksheet names.
        sheet_name: title of the worksheet being considered.

    Returns:
        ``True`` when the worksheet passes both rules, ``False`` otherwise.
    """
    allowed = c.worksheet_allow
    blocked = c.worksheet_block

    # Truthiness instead of len(): an unset (None) rule set means "no rule"
    # rather than raising TypeError.
    if allowed and sheet_name not in allowed:
        # ALLOW rules exist AND sheet name not explicitly allowed
        return False
    if blocked and sheet_name in blocked:
        # BLOCK rules exist AND sheet name is blocked
        return False
    return True
def ensure_set(l):
    """Normalise a config value into a set of non-empty strings.

    Args:
        l: a single value or a collection (list, tuple, set, frozenset) of
            values. Any other object, including ``None`` and a lone string,
            is treated as a single value.

    Returns:
        A ``set`` holding every element that is a non-empty ``str``; all
        other elements (``None``, numbers, empty strings, ...) are dropped.
    """
    # A lone value is wrapped so that both shapes share one code path.
    # Tuples and sets count as collections too; otherwise they would be
    # wrapped whole and then filtered out, silently yielding an empty set.
    candidates = l if isinstance(l, (list, tuple, set, frozenset)) else [l]
    return {item for item in candidates if isinstance(item, str) and item}
If whitelist is used then blacklist is ignored as whitelist is more restrictive. - # worksheet_whitelist: Sheet1 - - # worksheet to blacklist. Leave blank which is default for none. Useful if users want a MASTERSHEET exact copy of the working worksheet - # worksheet_blacklist: MASTERSHEET - + # block or allow worksheets by name, instead of defaulting to checking all worksheets in a Spreadsheet + # worksheet_allow and worksheet_block can be single values or lists + # if both are specified, a worksheet is processed only if it is in worksheet_allow AND not in worksheet_block + # worksheet_allow: + # - Sheet1 + # - "Sheet 2" + # worksheet_block: BlockedSheet # which row of your tabs contains the header, can be overwritten with CMD --header= header: 1