Merge pull request #51 from djhmateer/whitelist

This commit is contained in:
Miguel Sozinho Ramalho
2022-07-14 17:13:29 +01:00
committed by GitHub
3 changed files with 36 additions and 3 deletions

View File

@@ -53,11 +53,25 @@ def missing_required_columns(gw: GWorksheet):
return missing
def should_process_sheet(c, sheet_name):
if len(c.worksheet_allow) and sheet_name not in c.worksheet_allow:
# ALLOW rules exist AND sheet name not explicitly allowed
return False
if len(c.worksheet_block) and sheet_name in c.worksheet_block:
# BLOCK rules exist AND sheet name is blocked
return False
return True
def process_sheet(c: Config):
sh = c.gsheets_client.open(c.sheet)
# loop through worksheets to check
for ii, wks in enumerate(sh.worksheets()):
if not should_process_sheet(c, wks.title):
logger.info(f'Ignoring worksheet "{wks.title}" due to allow/block configurations')
continue
logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}')
gw = GWorksheet(wks, header_row=c.header, columns=c.column_names)
@@ -80,7 +94,7 @@ def process_sheet(c: Config):
if not is_retry: continue
# All checks done - archival process starts here
try:
try:
gw.set_cell(row, 'status', 'Archive in progress')
url = expand_url(url)
c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True))
@@ -96,7 +110,7 @@ def process_sheet(c: Config):
YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
TelegramArchiver(storage, c.webdriver),
TwitterArchiver(storage, c.webdriver),
VkArchiver(storage, c.webdriver, c.vk_config),
VkArchiver(storage, c.webdriver, c.vk_config),
WaybackArchiver(storage, c.webdriver, c.wayback_config)
]
@@ -105,7 +119,7 @@ def process_sheet(c: Config):
try:
result = archiver.download(url, check_if_exists=c.check_if_exists)
except KeyboardInterrupt as e: raise e # so the higher level catch can catch it
except KeyboardInterrupt as e: raise e # so the higher level catch can catch it
except Exception as e:
result = False
logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')

View File

@@ -50,6 +50,14 @@ class Config:
self.sheet = getattr_or(self.args, "sheet", execution.get("sheet"))
assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file"
def ensure_set(l):
# always returns a set of strings, can receive a set or a string
l = l if isinstance(l, list) else [l]
return set([x for x in l if isinstance(x, str) and len(x) > 0])
self.worksheet_allow = ensure_set(execution.get("worksheet_allow", []))
self.worksheet_block = ensure_set(execution.get("worksheet_block", []))
self.header = int(getattr_or(self.args, "header", execution.get("header", 1)))
self.storage = getattr_or(self.args, "storage", execution.get("storage", "s3"))
self.save_logs = getattr(self.args, "save_logs") or execution.get("save_logs", False)
@@ -246,6 +254,8 @@ class Config:
return json.dumps({
"config_file": self.config_file,
"sheet": self.sheet,
"worksheet_allow": list(self.worksheet_allow),
"worksheet_block": list(self.worksheet_block),
"storage": self.storage,
"header": self.header,
"check_if_exists": self.check_if_exists,

View File

@@ -65,6 +65,15 @@ secrets:
execution:
# can be overwritten with CMD --sheet=
sheet: your-sheet-name
# block or allow worksheets by name, instead of defaulting to checking all worksheets in a Spreadsheet
# worksheet_allow and worksheet_block can be single values or lists
# if worksheet_allow is specified, worksheet_block is ignored
# worksheet_allow:
# - Sheet1
# - "Sheet 2"
# worksheet_block: BlockedSheet
# which row of your tabs contains the header, can be overwritten with CMD --header=
header: 1
# which storage to use, can be overwritten with CMD --storage=