From 024fe58377582ff13cc5f5ca6617bf94d97ec378 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Fri, 24 Jan 2025 13:33:12 +0000 Subject: [PATCH] fix config parsing in manifests, remove module level configs --- src/auto_archiver/modules/atlos/atlos.py | 4 --- .../modules/atlos_db/atlos_db.py | 5 --- .../modules/atlos_feeder/atlos_feeder.py | 5 --- .../modules/cli_feeder/cli_feeder.py | 10 ------ .../modules/csv_feeder/csv_feeder.py | 17 ---------- .../modules/gdrive_storage/gdrive_storage.py | 10 ------ .../modules/gsheet_db/__manifest__.py | 2 +- .../modules/gsheet_feeder/gsheet_feeder.py | 21 ------------- .../modules/html_formatter/html_formatter.py | 6 ---- .../modules/local_storage/local.py | 9 ------ src/auto_archiver/modules/s3_storage/s3.py | 21 ------------- .../screenshot_enricher.py | 15 --------- .../timestamping_enricher/__manifest__.py | 27 ++++++++++++---- .../timestamping_enricher.py | 31 ------------------- .../twitter_api_extractor/__manifest__.py | 2 +- src/auto_archiver/utils/gsheet.py | 29 ----------------- 16 files changed, 23 insertions(+), 191 deletions(-) diff --git a/src/auto_archiver/modules/atlos/atlos.py b/src/auto_archiver/modules/atlos/atlos.py index 0b16714..6a175d3 100644 --- a/src/auto_archiver/modules/atlos/atlos.py +++ b/src/auto_archiver/modules/atlos/atlos.py @@ -15,10 +15,6 @@ class AtlosStorage(Storage): def __init__(self, config: dict) -> None: super().__init__(config) - @staticmethod - def configs() -> dict: - return dict(Storage.configs(), **get_atlos_config_options()) - def get_cdn_url(self, _media: Media) -> str: # It's not always possible to provide an exact URL, because it's # possible that the media once uploaded could have been copied to diff --git a/src/auto_archiver/modules/atlos_db/atlos_db.py b/src/auto_archiver/modules/atlos_db/atlos_db.py index c1d20a1..2e24491 100644 --- a/src/auto_archiver/modules/atlos_db/atlos_db.py +++ b/src/auto_archiver/modules/atlos_db/atlos_db.py @@ -22,11 +22,6 @@ class AtlosDb(Database): # without this STEP.__init__ is not called super().__init__(config) - # TODO - @staticmethod - def configs() -> dict: - return get_atlos_config_options() - def failed(self, item: Metadata, reason: str) -> None: """Update DB accordingly for failure""" # If the item has no Atlos ID, there's nothing for us to do diff --git a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py index 8a4a31a..262f21b 100644 --- a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py +++ b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py @@ -15,11 +15,6 @@ class AtlosFeeder(Feeder): if type(self.api_token) != str: raise Exception("Atlos Feeder did not receive an Atlos API token") - # TODO - @staticmethod - def configs() -> dict: - return get_atlos_config_options() - def __iter__(self) -> Metadata: # Get all the urls from the Atlos API count = 0 diff --git a/src/auto_archiver/modules/cli_feeder/cli_feeder.py b/src/auto_archiver/modules/cli_feeder/cli_feeder.py index 3380f90..7d0d01f 100644 --- a/src/auto_archiver/modules/cli_feeder/cli_feeder.py +++ b/src/auto_archiver/modules/cli_feeder/cli_feeder.py @@ -13,16 +13,6 @@ class CLIFeeder(Feeder): if type(self.urls) != list or len(self.urls) == 0: raise Exception("CLI Feeder did not receive any URL to process") - # @staticmethod - # def configs() -> dict: - # return { - # "urls": { - # "default": None, - # "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml", - # "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(","))) - # }, - # } - def __iter__(self) -> Metadata: for url in self.urls: logger.debug(f"Processing {url}") diff --git a/src/auto_archiver/modules/csv_feeder/csv_feeder.py b/src/auto_archiver/modules/csv_feeder/csv_feeder.py index 91a2b97..7bff16e 100644 --- a/src/auto_archiver/modules/csv_feeder/csv_feeder.py +++ b/src/auto_archiver/modules/csv_feeder/csv_feeder.py @@ -9,23 +9,6 @@ class CSVFeeder(Feeder): name = "csv_feeder" - - @staticmethod - def configs() -> dict: - return { - "files": { - "default": None, - "help": "Path to the input file(s) to read the URLs from, comma separated. \ - Input files should be formatted with one URL per line", - "type": "auto_archiver.utils.parse_csv_to_set", - }, - "column": { - "default": None, - "help": "Column number or name to read the URLs from, 0-indexed", - } - } - - def __iter__(self) -> Metadata: url_column = self.column or 0 for file in self.files: diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py index 652ff91..4bcdb90 100644 --- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py +++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py @@ -58,16 +58,6 @@ class GDriveStorage(Storage): self.service = build('drive', 'v3', credentials=creds) - @staticmethod - def configs() -> dict: - return dict( - Storage.configs(), - ** { - "root_folder_id": {"default": None, "help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"}, - "oauth_token": {"default": None, "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."}, - "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account."}, - }) - def get_cdn_url(self, media: Media) -> str: """ only support files saved in a folder for GD diff --git a/src/auto_archiver/modules/gsheet_db/__manifest__.py b/src/auto_archiver/modules/gsheet_db/__manifest__.py index 2f4f9b4..edc8d24 100644 --- a/src/auto_archiver/modules/gsheet_db/__manifest__.py +++ b/src/auto_archiver/modules/gsheet_db/__manifest__.py @@ -14,7 +14,7 @@ "block_worksheets": { "default": set(), "help": "(CSV) explicitly block some worksheets from being processed", - "type": auto_archiver.utils.parse_csv_to_set, + "type": "auto_archiver.utils.parse_csv_to_set", }, "use_sheet_names_in_stored_paths": { "default": True, diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py index a417615..01cd3b3 100644 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py @@ -26,27 +26,6 @@ class GsheetsFeeder(Gsheets, Feeder): super().__init__(config) self.gsheets_client = gspread.service_account(filename=self.service_account) - # @staticmethod - # def configs() -> dict: - # return dict( - # Gsheets.configs(), - # ** { - # "allow_worksheets": { - # "default": set(), - # "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", - # "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) - # }, - # "block_worksheets": { - # "default": set(), - # "help": "(CSV) explicitly block some worksheets from being processed", - # "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) - # }, - # "use_sheet_names_in_stored_paths": { - # "default": True, - # "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", - # } - # }) - def __iter__(self) -> Metadata: sh = self.open_sheet() for ii, wks in enumerate(sh.worksheets()): diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py index a1951f3..15104b2 100644 --- a/src/auto_archiver/modules/html_formatter/html_formatter.py +++ b/src/auto_archiver/modules/html_formatter/html_formatter.py @@ -28,12 +28,6 @@ class HtmlFormatter(Formatter): }) self.template = self.environment.get_template("html_template.html") - # @staticmethod - # def configs() -> dict: - # return { - # "detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"} - # } - def format(self, item: Metadata) -> Media: url = item.get_url() if item.is_empty(): diff --git a/src/auto_archiver/modules/local_storage/local.py b/src/auto_archiver/modules/local_storage/local.py index cac692e..530f111 100644 --- a/src/auto_archiver/modules/local_storage/local.py +++ b/src/auto_archiver/modules/local_storage/local.py @@ -15,15 +15,6 @@ class LocalStorage(Storage): super().__init__(config) os.makedirs(self.save_to, exist_ok=True) - @staticmethod - def configs() -> dict: - return dict( - Storage.configs(), - ** { - "save_to": {"default": "./archived", "help": "folder where to save archived content"}, - "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"}, - }) - def get_cdn_url(self, media: Media) -> str: # TODO: is this viable with Storage.configs on path/filename? dest = os.path.join(self.save_to, media.key) diff --git a/src/auto_archiver/modules/s3_storage/s3.py b/src/auto_archiver/modules/s3_storage/s3.py index fe221d0..a637259 100644 --- a/src/auto_archiver/modules/s3_storage/s3.py +++ b/src/auto_archiver/modules/s3_storage/s3.py @@ -26,27 +26,6 @@ class S3Storage(Storage): if self.random_no_duplicate: logger.warning("random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`.") - @staticmethod - def configs() -> dict: - return dict( - Storage.configs(), - ** { - "bucket": {"default": None, "help": "S3 bucket name"}, - "region": {"default": None, "help": "S3 region name"}, - "key": {"default": None, "help": "S3 API key"}, - "secret": {"default": None, "help": "S3 API secret"}, - "random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"}, - "endpoint_url": { - "default": 'https://{region}.digitaloceanspaces.com', - "help": "S3 bucket endpoint, {region} are inserted at runtime" - }, - "cdn_url": { - "default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}', - "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime" - }, - "private": {"default": False, "help": "if true S3 files will not be readable online"}, - }) - def get_cdn_url(self, media: Media) -> str: return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key) diff --git a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py index 626cd1f..0140875 100644 --- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py +++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py @@ -14,21 +14,6 @@ class ScreenshotEnricher(Enricher): def __init__(self, config: dict) -> None: super().__init__(config) - # TODO? - - - - # @staticmethod - # def configs() -> dict: - # return { - # "width": {"default": 1280, "help": "width of the screenshots"}, - # "height": {"default": 720, "help": "height of the screenshots"}, - # "timeout": {"default": 60, "help": "timeout for taking the screenshot"}, - # "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"}, - # "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"}, - # "save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"}, - # "print_options": {"default": {}, "help": "options to pass to the pdf printer"} - # } def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() diff --git a/src/auto_archiver/modules/timestamping_enricher/__manifest__.py b/src/auto_archiver/modules/timestamping_enricher/__manifest__.py index 904fde6..e4ac925 100644 --- a/src/auto_archiver/modules/timestamping_enricher/__manifest__.py +++ b/src/auto_archiver/modules/timestamping_enricher/__manifest__.py @@ -15,13 +15,28 @@ "configs": { "tsa_urls": { "default": [ - "http://timestamp.digicert.com", - "http://timestamp.identrust.com", - "http://timestamp.globalsign.com/tsa/r6advanced1", - "http://tss.accv.es:8318/tsa" - ], + # [Adobe Approved Trust List] and [Windows Cert Store] + "http://timestamp.digicert.com", + "http://timestamp.identrust.com", + # "https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping + # "https://timestamp.sectigo.com", # wait 15 seconds between each request. + + # [Adobe: European Union Trusted Lists]. + # "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request. + + # [Windows Cert Store] + "http://timestamp.globalsign.com/tsa/r6advanced1", + # [Adobe: European Union Trusted Lists] and [Windows Cert Store] + # "http://ts.quovadisglobal.com/eu", # not valid for timestamping + # "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain + # "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain + # "http://tsa.sep.bg", # self-signed certificate in certificate chain + # "http://tsa.izenpe.com", #unable to get local issuer certificate + # "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate + "http://tss.accv.es:8318/tsa", + ], "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.", - "type": auto_archiver.utils.parse_csv_to_set, + "type": "auto_archiver.utils.parse_csv_to_set", } }, "description": """ diff --git a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py index 0e159fa..c90d42c 100644 --- a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py +++ b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py @@ -26,37 +26,6 @@ class TimestampingEnricher(Enricher): def __init__(self, config: dict) -> None: super().__init__(config) - # @staticmethod - # def configs() -> dict: - # return { - # "tsa_urls": { - # "default": [ - # # [Adobe Approved Trust List] and [Windows Cert Store] - # "http://timestamp.digicert.com", - # "http://timestamp.identrust.com", - # # "https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping - # # "https://timestamp.sectigo.com", # wait 15 seconds between each request. - # - # # [Adobe: European Union Trusted Lists]. - # # "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request. - # - # # [Windows Cert Store] - # "http://timestamp.globalsign.com/tsa/r6advanced1", - # - # # [Adobe: European Union Trusted Lists] and [Windows Cert Store] - # # "http://ts.quovadisglobal.com/eu", # not valid for timestamping - # # "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain - # # "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain - # # "http://tsa.sep.bg", # self-signed certificate in certificate chain - # # "http://tsa.izenpe.com", #unable to get local issuer certificate - # # "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate - # "http://tss.accv.es:8318/tsa", - # ], - # "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.", - # "cli_set": lambda cli_val, cur_val: set(cli_val.split(",")) - # } - # } - def enrich(self, to_enrich: Metadata) -> None: url = to_enrich.get_url() logger.debug(f"RFC3161 timestamping existing files for {url=}") diff --git a/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py b/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py index 239a0bb..6e64269 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py +++ b/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py @@ -12,7 +12,7 @@ "configs": { "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"}, "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line", - "type": auto_archiver.utils.parse_csv_to_set,}, + "type": "auto_archiver.utils.parse_csv_to_set",}, "consumer_key": {"default": None, "help": "twitter API consumer_key"}, "consumer_secret": {"default": None, "help": "twitter API consumer_secret"}, "access_token": {"default": None, "help": "twitter API access_token"}, diff --git a/src/auto_archiver/utils/gsheet.py b/src/auto_archiver/utils/gsheet.py index f84aab2..78f01c5 100644 --- a/src/auto_archiver/utils/gsheet.py +++ b/src/auto_archiver/utils/gsheet.py @@ -16,35 +16,6 @@ class Gsheets(Step): assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}" assert self.sheet is not None or self.sheet_id is not None, "You need to define either a 'sheet' name or a 'sheet_id' in your orchestration file when using gsheets." - @staticmethod - def configs() -> dict: - return { - "sheet": {"default": None, "help": "name of the sheet to archive"}, - "sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"}, - "header": {"default": 1, "help": "index of the header row (starts at 1)"}, - "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"}, - "columns": { - "default": { - 'url': 'link', - 'status': 'archive status', - 'folder': 'destination folder', - 'archive': 'archive location', - 'date': 'archive date', - 'thumbnail': 'thumbnail', - 'timestamp': 'upload timestamp', - 'title': 'upload title', - 'text': 'text content', - 'screenshot': 'screenshot', - 'hash': 'hash', - 'pdq_hash': 'perceptual hashes', - 'wacz': 'wacz', - 'replaywebpage': 'replaywebpage', - }, - "help": "names of columns in the google sheet (stringified JSON object)", - "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val)) - }, - } - def open_sheet(self): if self.sheet: return self.gsheets_client.open(self.sheet)