mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 21:28:29 +03:00
refactoring and renaming
This commit is contained in:
@@ -53,23 +53,24 @@ def missing_required_columns(gw: GWorksheet):
|
|||||||
return missing
|
return missing
|
||||||
|
|
||||||
|
|
||||||
|
def should_process_sheet(c, sheet_name):
    """Decide whether the worksheet named *sheet_name* should be processed.

    Args:
        c: configuration object exposing ``worksheet_allow`` and
            ``worksheet_block`` collections of worksheet names.
        sheet_name: title of the worksheet being considered.

    Returns:
        ``False`` when allow rules exist and the name is not among them, or
        when block rules exist and the name is among them; ``True`` otherwise.
        Both rule sets are checked independently, so a name that is both
        allowed and blocked is rejected.
    """
    # Truthiness rather than len(): an empty or unset (None) rule collection
    # simply means "no rules" instead of raising TypeError.
    if c.worksheet_allow and sheet_name not in c.worksheet_allow:
        # ALLOW rules exist AND sheet name not explicitly allowed
        return False
    if c.worksheet_block and sheet_name in c.worksheet_block:
        # BLOCK rules exist AND sheet name is blocked
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
def process_sheet(c: Config):
|
def process_sheet(c: Config):
|
||||||
sh = c.gsheets_client.open(c.sheet)
|
sh = c.gsheets_client.open(c.sheet)
|
||||||
|
|
||||||
# loop through worksheets to check
|
# loop through worksheets to check
|
||||||
for ii, wks in enumerate(sh.worksheets()):
|
for ii, wks in enumerate(sh.worksheets()):
|
||||||
|
if not should_process_sheet(c, wks.title):
|
||||||
whitelist = c.worksheet_whitelist
|
logger.info(f'Ignoring worksheet "{wks.title}" due to allow/block configurations')
|
||||||
if whitelist is not None:
|
continue
|
||||||
if wks.title != whitelist:
|
|
||||||
logger.debug(f'Ignoring worksheet {wks.title} as not in whitelist which is specified as {whitelist}')
|
|
||||||
continue
|
|
||||||
|
|
||||||
blacklist = c.worksheet_blacklist
|
|
||||||
if blacklist is not None:
|
|
||||||
if wks.title == blacklist:
|
|
||||||
logger.debug(f'Ignoring worksheet {wks.title} as in blacklist')
|
|
||||||
continue
|
|
||||||
|
|
||||||
logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}')
|
logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}')
|
||||||
gw = GWorksheet(wks, header_row=c.header, columns=c.column_names)
|
gw = GWorksheet(wks, header_row=c.header, columns=c.column_names)
|
||||||
@@ -93,7 +94,7 @@ def process_sheet(c: Config):
|
|||||||
if not is_retry: continue
|
if not is_retry: continue
|
||||||
|
|
||||||
# All checks done - archival process starts here
|
# All checks done - archival process starts here
|
||||||
try:
|
try:
|
||||||
gw.set_cell(row, 'status', 'Archive in progress')
|
gw.set_cell(row, 'status', 'Archive in progress')
|
||||||
url = expand_url(url)
|
url = expand_url(url)
|
||||||
c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True))
|
c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True))
|
||||||
@@ -109,7 +110,7 @@ def process_sheet(c: Config):
|
|||||||
YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
|
YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
|
||||||
TelegramArchiver(storage, c.webdriver),
|
TelegramArchiver(storage, c.webdriver),
|
||||||
TwitterArchiver(storage, c.webdriver),
|
TwitterArchiver(storage, c.webdriver),
|
||||||
VkArchiver(storage, c.webdriver, c.vk_config),
|
VkArchiver(storage, c.webdriver, c.vk_config),
|
||||||
WaybackArchiver(storage, c.webdriver, c.wayback_config)
|
WaybackArchiver(storage, c.webdriver, c.wayback_config)
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -118,7 +119,7 @@ def process_sheet(c: Config):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
result = archiver.download(url, check_if_exists=c.check_if_exists)
|
result = archiver.download(url, check_if_exists=c.check_if_exists)
|
||||||
except KeyboardInterrupt as e: raise e # so the higher level catch can catch it
|
except KeyboardInterrupt as e: raise e # so the higher level catch can catch it
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
result = False
|
result = False
|
||||||
logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')
|
logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')
|
||||||
|
|||||||
@@ -51,8 +51,11 @@ class Config:
|
|||||||
self.sheet = getattr_or(self.args, "sheet", execution.get("sheet"))
|
self.sheet = getattr_or(self.args, "sheet", execution.get("sheet"))
|
||||||
assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file"
|
assert self.sheet is not None, "'sheet' must be provided either through command line or configuration file"
|
||||||
|
|
||||||
self.worksheet_whitelist = execution.get("worksheet_whitelist")
|
def ensure_set(l):
|
||||||
self.worksheet_blacklist = execution.get("worksheet_blacklist")
|
l = l if isinstance(l, list) else [l]
|
||||||
|
return set([x for x in l if isinstance(x, str) and len(x) > 0])
|
||||||
|
self.worksheet_allow = ensure_set(execution.get("worksheet_allow", []))
|
||||||
|
self.worksheet_block = ensure_set(execution.get("worksheet_block", []))
|
||||||
|
|
||||||
self.header = int(getattr_or(self.args, "header", execution.get("header", 1)))
|
self.header = int(getattr_or(self.args, "header", execution.get("header", 1)))
|
||||||
self.storage = getattr_or(self.args, "storage", execution.get("storage", "s3"))
|
self.storage = getattr_or(self.args, "storage", execution.get("storage", "s3"))
|
||||||
@@ -250,6 +253,8 @@ class Config:
|
|||||||
return json.dumps({
|
return json.dumps({
|
||||||
"config_file": self.config_file,
|
"config_file": self.config_file,
|
||||||
"sheet": self.sheet,
|
"sheet": self.sheet,
|
||||||
|
"worksheet_allow": list(self.worksheet_allow),
|
||||||
|
"worksheet_block": list(self.worksheet_block),
|
||||||
"storage": self.storage,
|
"storage": self.storage,
|
||||||
"header": self.header,
|
"header": self.header,
|
||||||
"check_if_exists": self.check_if_exists,
|
"check_if_exists": self.check_if_exists,
|
||||||
|
|||||||
@@ -66,12 +66,13 @@ execution:
|
|||||||
# can be overwritten with CMD --sheet=
|
# can be overwritten with CMD --sheet=
|
||||||
sheet: your-sheet-name
|
sheet: your-sheet-name
|
||||||
|
|
||||||
# only check this worksheet rather than iterating through all worksheets in the spreadsheet. If whitelist is used then blacklist is ignored as whitelist is more restrictive.
|
# block or allow worksheets by name, instead of defaulting to checking all worksheets in a Spreadsheet
|
||||||
# worksheet_whitelist: Sheet1
|
# worksheet_allow and worksheet_block can be single values or lists
|
||||||
|
# if both are specified, a worksheet is processed only when it is listed in worksheet_allow and not listed in worksheet_block
|
||||||
# worksheet to blacklist. Leave blank which is default for none. Useful if users want a MASTERSHEET exact copy of the working worksheet
|
# worksheet_allow:
|
||||||
# worksheet_blacklist: MASTERSHEET
|
# - Sheet1
|
||||||
|
# - "Sheet 2"
|
||||||
|
# worksheet_block: BlockedSheet
|
||||||
|
|
||||||
# which row of your tabs contains the header, can be overwritten with CMD --header=
|
# which row of your tabs contains the header, can be overwritten with CMD --header=
|
||||||
header: 1
|
header: 1
|
||||||
|
|||||||
Reference in New Issue
Block a user