Merge pull request #51 from djhmateer/whitelist

This commit is contained in:
Miguel Sozinho Ramalho
2022-07-14 17:13:29 +01:00
committed by GitHub
3 changed files with 36 additions and 3 deletions

View File

@@ -53,11 +53,25 @@ def missing_required_columns(gw: GWorksheet):
return missing return missing
def should_process_sheet(c, sheet_name):
    """Decide whether a worksheet should be processed under the allow/block rules.

    Args:
        c: configuration object exposing ``worksheet_allow`` and
            ``worksheet_block`` collections of worksheet names.
        sheet_name: title of the worksheet being considered.

    Returns:
        False when allow rules exist and ``sheet_name`` is not among them, or
        when block rules exist and ``sheet_name`` is among them; True otherwise.
        Both rule sets are applied: a sheet that is allowed but also blocked
        is rejected.
    """
    if c.worksheet_allow and sheet_name not in c.worksheet_allow:
        # ALLOW rules exist AND sheet name not explicitly allowed
        return False
    if c.worksheet_block and sheet_name in c.worksheet_block:
        # BLOCK rules exist AND sheet name is blocked
        return False
    return True
def process_sheet(c: Config): def process_sheet(c: Config):
sh = c.gsheets_client.open(c.sheet) sh = c.gsheets_client.open(c.sheet)
# loop through worksheets to check # loop through worksheets to check
for ii, wks in enumerate(sh.worksheets()): for ii, wks in enumerate(sh.worksheets()):
if not should_process_sheet(c, wks.title):
logger.info(f'Ignoring worksheet "{wks.title}" due to allow/block configurations')
continue
logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}') logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}')
gw = GWorksheet(wks, header_row=c.header, columns=c.column_names) gw = GWorksheet(wks, header_row=c.header, columns=c.column_names)
@@ -80,7 +94,7 @@ def process_sheet(c: Config):
if not is_retry: continue if not is_retry: continue
# All checks done - archival process starts here # All checks done - archival process starts here
try: try:
gw.set_cell(row, 'status', 'Archive in progress') gw.set_cell(row, 'status', 'Archive in progress')
url = expand_url(url) url = expand_url(url)
c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True)) c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True))
@@ -96,7 +110,7 @@ def process_sheet(c: Config):
YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie), YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
TelegramArchiver(storage, c.webdriver), TelegramArchiver(storage, c.webdriver),
TwitterArchiver(storage, c.webdriver), TwitterArchiver(storage, c.webdriver),
VkArchiver(storage, c.webdriver, c.vk_config), VkArchiver(storage, c.webdriver, c.vk_config),
WaybackArchiver(storage, c.webdriver, c.wayback_config) WaybackArchiver(storage, c.webdriver, c.wayback_config)
] ]
@@ -105,7 +119,7 @@ def process_sheet(c: Config):
try: try:
result = archiver.download(url, check_if_exists=c.check_if_exists) result = archiver.download(url, check_if_exists=c.check_if_exists)
except KeyboardInterrupt as e: raise e # so the higher level catch can catch it except KeyboardInterrupt as e: raise e # so the higher level catch can catch it
except Exception as e: except Exception as e:
result = False result = False
logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}') logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')

View File

@@ -50,6 +50,14 @@ class Config:
self.sheet = getattr_or(self.args, "sheet", execution.get("sheet")) self.sheet = getattr_or(self.args, "sheet", execution.get("sheet"))
assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file" assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file"
def ensure_set(l):
    """Normalize a configuration value into a set of non-empty strings.

    Accepts either a single string or a list/tuple/set/frozenset of values.
    Any other input (e.g. ``None``) is treated as a single candidate value.
    Entries that are not strings, or that are empty strings, are dropped.

    Returns:
        A ``set`` containing only the non-empty string entries.
    """
    # Only real collections are iterated; a bare string must stay one item,
    # otherwise it would be split into its characters.
    candidates = l if isinstance(l, (list, tuple, set, frozenset)) else [l]
    return {x for x in candidates if isinstance(x, str) and x}
self.worksheet_allow = ensure_set(execution.get("worksheet_allow", []))
self.worksheet_block = ensure_set(execution.get("worksheet_block", []))
self.header = int(getattr_or(self.args, "header", execution.get("header", 1))) self.header = int(getattr_or(self.args, "header", execution.get("header", 1)))
self.storage = getattr_or(self.args, "storage", execution.get("storage", "s3")) self.storage = getattr_or(self.args, "storage", execution.get("storage", "s3"))
self.save_logs = getattr(self.args, "save_logs") or execution.get("save_logs", False) self.save_logs = getattr(self.args, "save_logs") or execution.get("save_logs", False)
@@ -246,6 +254,8 @@ class Config:
return json.dumps({ return json.dumps({
"config_file": self.config_file, "config_file": self.config_file,
"sheet": self.sheet, "sheet": self.sheet,
"worksheet_allow": list(self.worksheet_allow),
"worksheet_block": list(self.worksheet_block),
"storage": self.storage, "storage": self.storage,
"header": self.header, "header": self.header,
"check_if_exists": self.check_if_exists, "check_if_exists": self.check_if_exists,

View File

@@ -65,6 +65,15 @@ secrets:
execution: execution:
# can be overwritten with CMD --sheet= # can be overwritten with CMD --sheet=
sheet: your-sheet-name sheet: your-sheet-name
# block or allow worksheets by name, instead of defaulting to checking all worksheets in a Spreadsheet
# worksheet_allow and worksheet_block can be single values or lists
# if both are specified, a worksheet is processed only when it is listed in worksheet_allow AND not listed in worksheet_block
# worksheet_allow:
# - Sheet1
# - "Sheet 2"
# worksheet_block: BlockedSheet
# which row of your tabs contains the header, can be overwritten with CMD --header= # which row of your tabs contains the header, can be overwritten with CMD --header=
header: 1 header: 1
# which storage to use, can be overwritten with CMD --storage= # which storage to use, can be overwritten with CMD --storage=