refactoring and renaming

This commit is contained in:
msramalho
2022-07-14 18:10:02 +02:00
parent 42172566f2
commit 90cb080c81
3 changed files with 30 additions and 23 deletions

View File

@@ -53,23 +53,24 @@ def missing_required_columns(gw: GWorksheet):
return missing return missing
def should_process_sheet(c, sheet_name):
    """Tell whether the worksheet called `sheet_name` passes the allow/block rules.

    `c` must expose `worksheet_allow` and `worksheet_block`, two sized
    containers of worksheet names. An empty container means that rule is
    not configured.

    Returns False when an allow list is configured and `sheet_name` is not
    in it, or when a block list is configured and `sheet_name` is in it.
    Returns True otherwise. Both rules are applied, so a name present in
    both containers is rejected.
    """
    # A non-empty allow list acts as a filter the name has to pass.
    rejected_by_allow = len(c.worksheet_allow) > 0 and sheet_name not in c.worksheet_allow
    if rejected_by_allow:
        return False
    # A non-empty block list vetoes every name it contains.
    rejected_by_block = len(c.worksheet_block) > 0 and sheet_name in c.worksheet_block
    return not rejected_by_block
def process_sheet(c: Config): def process_sheet(c: Config):
sh = c.gsheets_client.open(c.sheet) sh = c.gsheets_client.open(c.sheet)
# loop through worksheets to check # loop through worksheets to check
for ii, wks in enumerate(sh.worksheets()): for ii, wks in enumerate(sh.worksheets()):
if not should_process_sheet(c, wks.title):
whitelist = c.worksheet_whitelist logger.info(f'Ignoring worksheet "{wks.title}" due to allow/block configurations')
if whitelist is not None: continue
if wks.title != whitelist:
logger.debug(f'Ignoring worksheet {wks.title} as not in whitelist which is specified as {whitelist}')
continue
blacklist = c.worksheet_blacklist
if blacklist is not None:
if wks.title == blacklist:
logger.debug(f'Ignoring worksheet {wks.title} as in blacklist')
continue
logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}') logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}')
gw = GWorksheet(wks, header_row=c.header, columns=c.column_names) gw = GWorksheet(wks, header_row=c.header, columns=c.column_names)
@@ -93,7 +94,7 @@ def process_sheet(c: Config):
if not is_retry: continue if not is_retry: continue
# All checks done - archival process starts here # All checks done - archival process starts here
try: try:
gw.set_cell(row, 'status', 'Archive in progress') gw.set_cell(row, 'status', 'Archive in progress')
url = expand_url(url) url = expand_url(url)
c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True)) c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True))
@@ -109,7 +110,7 @@ def process_sheet(c: Config):
YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie), YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
TelegramArchiver(storage, c.webdriver), TelegramArchiver(storage, c.webdriver),
TwitterArchiver(storage, c.webdriver), TwitterArchiver(storage, c.webdriver),
VkArchiver(storage, c.webdriver, c.vk_config), VkArchiver(storage, c.webdriver, c.vk_config),
WaybackArchiver(storage, c.webdriver, c.wayback_config) WaybackArchiver(storage, c.webdriver, c.wayback_config)
] ]
@@ -118,7 +119,7 @@ def process_sheet(c: Config):
try: try:
result = archiver.download(url, check_if_exists=c.check_if_exists) result = archiver.download(url, check_if_exists=c.check_if_exists)
except KeyboardInterrupt as e: raise e # so the higher level catch can catch it except KeyboardInterrupt as e: raise e # so the higher level catch can catch it
except Exception as e: except Exception as e:
result = False result = False
logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}') logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')

View File

@@ -51,8 +51,11 @@ class Config:
self.sheet = getattr_or(self.args, "sheet", execution.get("sheet")) self.sheet = getattr_or(self.args, "sheet", execution.get("sheet"))
assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file" assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file"
self.worksheet_whitelist = execution.get("worksheet_whitelist") def ensure_set(l):
self.worksheet_blacklist = execution.get("worksheet_blacklist") l = l if isinstance(l, list) else [l]
return set([x for x in l if isinstance(x, str) and len(x) > 0])
self.worksheet_allow = ensure_set(execution.get("worksheet_allow", []))
self.worksheet_block = ensure_set(execution.get("worksheet_block", []))
self.header = int(getattr_or(self.args, "header", execution.get("header", 1))) self.header = int(getattr_or(self.args, "header", execution.get("header", 1)))
self.storage = getattr_or(self.args, "storage", execution.get("storage", "s3")) self.storage = getattr_or(self.args, "storage", execution.get("storage", "s3"))
@@ -250,6 +253,8 @@ class Config:
return json.dumps({ return json.dumps({
"config_file": self.config_file, "config_file": self.config_file,
"sheet": self.sheet, "sheet": self.sheet,
"worksheet_allow": list(self.worksheet_allow),
"worksheet_block": list(self.worksheet_block),
"storage": self.storage, "storage": self.storage,
"header": self.header, "header": self.header,
"check_if_exists": self.check_if_exists, "check_if_exists": self.check_if_exists,

View File

@@ -66,12 +66,13 @@ execution:
# can be overwritten with CMD --sheet= # can be overwritten with CMD --sheet=
sheet: your-sheet-name sheet: your-sheet-name
# only check this worksheet rather than iterating through all worksheets in the spreadsheet. If whitelist is used then blacklist is ignored as whitelist is more restrictive. # block or allow worksheets by name, instead of defaulting to checking all worksheets in a Spreadsheet
# worksheet_whitelist: Sheet1 # worksheet_allow and worksheet_block can be single values or lists
# if both are specified, a worksheet is processed only when it is in worksheet_allow and not in worksheet_block
# worksheet to blacklist. Leave blank which is default for none. Useful if users want a MASTERSHEET exact copy of the working worksheet # worksheet_allow:
# worksheet_blacklist: MASTERSHEET # - Sheet1
# - "Sheet 2"
# worksheet_block: BlockedSheet
# which row of your tabs contains the header, can be overwritten with CMD --header= # which row of your tabs contains the header, can be overwritten with CMD --header=
header: 1 header: 1