From 024fe58377582ff13cc5f5ca6617bf94d97ec378 Mon Sep 17 00:00:00 2001
From: erinhmclark <erinhannahmary.clark@gmail.com>
Date: Fri, 24 Jan 2025 13:33:12 +0000
Subject: [PATCH] fix config parsing in manifests, remove module level configs

---
 src/auto_archiver/modules/atlos/atlos.py      |  4 ---
 .../modules/atlos_db/atlos_db.py              |  5 ---
 .../modules/atlos_feeder/atlos_feeder.py      |  5 ---
 .../modules/cli_feeder/cli_feeder.py          | 10 ------
 .../modules/csv_feeder/csv_feeder.py          | 17 ----------
 .../modules/gdrive_storage/gdrive_storage.py  | 10 ------
 .../modules/gsheet_db/__manifest__.py         |  2 +-
 .../modules/gsheet_feeder/gsheet_feeder.py    | 21 -------------
 .../modules/html_formatter/html_formatter.py  |  6 ----
 .../modules/local_storage/local.py            |  9 ------
 src/auto_archiver/modules/s3_storage/s3.py    | 21 -------------
 .../screenshot_enricher.py                    | 15 ---------
 .../timestamping_enricher/__manifest__.py     | 27 ++++++++++++----
 .../timestamping_enricher.py                  | 31 -------------------
 .../twitter_api_extractor/__manifest__.py     |  2 +-
 src/auto_archiver/utils/gsheet.py             | 29 -----------------
 16 files changed, 23 insertions(+), 191 deletions(-)

diff --git a/src/auto_archiver/modules/atlos/atlos.py b/src/auto_archiver/modules/atlos/atlos.py
index 0b16714..6a175d3 100644
--- a/src/auto_archiver/modules/atlos/atlos.py
+++ b/src/auto_archiver/modules/atlos/atlos.py
@@ -15,10 +15,6 @@ class AtlosStorage(Storage):
     def __init__(self, config: dict) -> None:
         super().__init__(config)
 
-    @staticmethod
-    def configs() -> dict:
-        return dict(Storage.configs(), **get_atlos_config_options())
-
     def get_cdn_url(self, _media: Media) -> str:
         # It's not always possible to provide an exact URL, because it's
         # possible that the media once uploaded could have been copied to
diff --git a/src/auto_archiver/modules/atlos_db/atlos_db.py b/src/auto_archiver/modules/atlos_db/atlos_db.py
index c1d20a1..2e24491 100644
--- a/src/auto_archiver/modules/atlos_db/atlos_db.py
+++ b/src/auto_archiver/modules/atlos_db/atlos_db.py
@@ -22,11 +22,6 @@ class AtlosDb(Database):
         # without this STEP.__init__ is not called
         super().__init__(config)
 
-    # TODO
-    @staticmethod
-    def configs() -> dict:
-        return get_atlos_config_options()
-
     def failed(self, item: Metadata, reason: str) -> None:
         """Update DB accordingly for failure"""
         # If the item has no Atlos ID, there's nothing for us to do
diff --git a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py
index 8a4a31a..262f21b 100644
--- a/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py
+++ b/src/auto_archiver/modules/atlos_feeder/atlos_feeder.py
@@ -15,11 +15,6 @@ class AtlosFeeder(Feeder):
         if type(self.api_token) != str:
             raise Exception("Atlos Feeder did not receive an Atlos API token")
 
-    # TODO
-    @staticmethod
-    def configs() -> dict:
-        return get_atlos_config_options()
-
     def __iter__(self) -> Metadata:
         # Get all the urls from the Atlos API
         count = 0
diff --git a/src/auto_archiver/modules/cli_feeder/cli_feeder.py b/src/auto_archiver/modules/cli_feeder/cli_feeder.py
index 3380f90..7d0d01f 100644
--- a/src/auto_archiver/modules/cli_feeder/cli_feeder.py
+++ b/src/auto_archiver/modules/cli_feeder/cli_feeder.py
@@ -13,16 +13,6 @@ class CLIFeeder(Feeder):
         if type(self.urls) != list or len(self.urls) == 0:
             raise Exception("CLI Feeder did not receive any URL to process")
 
-    # @staticmethod
-    # def configs() -> dict:
-    #     return {
-    #         "urls": {
-    #             "default": None,
-    #             "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
-    #             "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))
-    #         },
-    #     }
-
     def __iter__(self) -> Metadata:
         for url in self.urls:
             logger.debug(f"Processing {url}")
diff --git a/src/auto_archiver/modules/csv_feeder/csv_feeder.py b/src/auto_archiver/modules/csv_feeder/csv_feeder.py
index 91a2b97..7bff16e 100644
--- a/src/auto_archiver/modules/csv_feeder/csv_feeder.py
+++ b/src/auto_archiver/modules/csv_feeder/csv_feeder.py
@@ -9,23 +9,6 @@ class CSVFeeder(Feeder):
 
     name = "csv_feeder"
 
-
-    @staticmethod
-    def configs() -> dict:
-        return {
-            "files": {
-                "default": None,
-                "help": "Path to the input file(s) to read the URLs from, comma separated. \
-                        Input files should be formatted with one URL per line",
-                "type": "auto_archiver.utils.parse_csv_to_set",
-            },
-            "column": {
-                "default": None,
-                "help": "Column number or name to read the URLs from, 0-indexed",
-            }
-        }
-    
-
     def __iter__(self) -> Metadata:
         url_column = self.column or 0
         for file in self.files:
diff --git a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
index 652ff91..4bcdb90 100644
--- a/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
+++ b/src/auto_archiver/modules/gdrive_storage/gdrive_storage.py
@@ -58,16 +58,6 @@ class GDriveStorage(Storage):
 
         self.service = build('drive', 'v3', credentials=creds)
 
-    @staticmethod
-    def configs() -> dict:
-        return dict(
-            Storage.configs(),
-            ** {
-                "root_folder_id": {"default": None, "help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"},
-                "oauth_token": {"default": None, "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."},
-                "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account."},
-            })
-
     def get_cdn_url(self, media: Media) -> str:
         """
         only support files saved in a folder for GD
diff --git a/src/auto_archiver/modules/gsheet_db/__manifest__.py b/src/auto_archiver/modules/gsheet_db/__manifest__.py
index 2f4f9b4..edc8d24 100644
--- a/src/auto_archiver/modules/gsheet_db/__manifest__.py
+++ b/src/auto_archiver/modules/gsheet_db/__manifest__.py
@@ -14,7 +14,7 @@
         "block_worksheets": {
             "default": set(),
             "help": "(CSV) explicitly block some worksheets from being processed",
-            "type": auto_archiver.utils.parse_csv_to_set,
+            "type": "auto_archiver.utils.parse_csv_to_set",
         },
         "use_sheet_names_in_stored_paths": {
             "default": True,
diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
index a417615..01cd3b3 100644
--- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
+++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
@@ -26,27 +26,6 @@ class GsheetsFeeder(Gsheets, Feeder):
         super().__init__(config)
         self.gsheets_client = gspread.service_account(filename=self.service_account)
 
-    # @staticmethod
-    # def configs() -> dict:
-    #     return dict(
-    #         Gsheets.configs(),
-    #         ** {
-    #             "allow_worksheets": {
-    #                 "default": set(),
-    #                 "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
-    #                 "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
-    #             },
-    #             "block_worksheets": {
-    #                 "default": set(),
-    #                 "help": "(CSV) explicitly block some worksheets from being processed",
-    #                 "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
-    #             },
-    #             "use_sheet_names_in_stored_paths": {
-    #                 "default": True,
-    #                 "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
-    #             }
-    #         })
-
     def __iter__(self) -> Metadata:
         sh = self.open_sheet()
         for ii, wks in enumerate(sh.worksheets()):
diff --git a/src/auto_archiver/modules/html_formatter/html_formatter.py b/src/auto_archiver/modules/html_formatter/html_formatter.py
index a1951f3..15104b2 100644
--- a/src/auto_archiver/modules/html_formatter/html_formatter.py
+++ b/src/auto_archiver/modules/html_formatter/html_formatter.py
@@ -28,12 +28,6 @@ class HtmlFormatter(Formatter):
         })
         self.template = self.environment.get_template("html_template.html")
 
-    # @staticmethod
-    # def configs() -> dict:
-    #     return {
-    #         "detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
-    #     }
-
     def format(self, item: Metadata) -> Media:
         url = item.get_url()
         if item.is_empty():
diff --git a/src/auto_archiver/modules/local_storage/local.py b/src/auto_archiver/modules/local_storage/local.py
index cac692e..530f111 100644
--- a/src/auto_archiver/modules/local_storage/local.py
+++ b/src/auto_archiver/modules/local_storage/local.py
@@ -15,15 +15,6 @@ class LocalStorage(Storage):
         super().__init__(config)
         os.makedirs(self.save_to, exist_ok=True)
 
-    @staticmethod
-    def configs() -> dict:
-        return dict(
-            Storage.configs(),
-            ** {
-                "save_to": {"default": "./archived", "help": "folder where to save archived content"},
-                "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
-            })
-
     def get_cdn_url(self, media: Media) -> str:
         # TODO: is this viable with Storage.configs on path/filename?
         dest = os.path.join(self.save_to, media.key)
diff --git a/src/auto_archiver/modules/s3_storage/s3.py b/src/auto_archiver/modules/s3_storage/s3.py
index fe221d0..a637259 100644
--- a/src/auto_archiver/modules/s3_storage/s3.py
+++ b/src/auto_archiver/modules/s3_storage/s3.py
@@ -26,27 +26,6 @@ class S3Storage(Storage):
         if self.random_no_duplicate:
             logger.warning("random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`.")
 
-    @staticmethod
-    def configs() -> dict:
-        return dict(
-            Storage.configs(),
-            ** {
-                "bucket": {"default": None, "help": "S3 bucket name"},
-                "region": {"default": None, "help": "S3 region name"},
-                "key": {"default": None, "help": "S3 API key"},
-                "secret": {"default": None, "help": "S3 API secret"},
-                "random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"},
-                "endpoint_url": {
-                    "default": 'https://{region}.digitaloceanspaces.com',
-                    "help": "S3 bucket endpoint, {region} are inserted at runtime"
-                },
-                "cdn_url": {
-                    "default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
-                    "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
-                },
-                "private": {"default": False, "help": "if true S3 files will not be readable online"},
-            })
-
     def get_cdn_url(self, media: Media) -> str:
         return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
 
diff --git a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
index 626cd1f..0140875 100644
--- a/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
+++ b/src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py
@@ -14,21 +14,6 @@ class ScreenshotEnricher(Enricher):
 
     def __init__(self, config: dict) -> None:
         super().__init__(config)
-    #     TODO?
-
-
-
-    # @staticmethod
-    # def configs() -> dict:
-    #     return {
-    #         "width": {"default": 1280, "help": "width of the screenshots"},
-    #         "height": {"default": 720, "help": "height of the screenshots"},
-    #         "timeout": {"default": 60, "help": "timeout for taking the screenshot"},
-    #         "sleep_before_screenshot": {"default": 4, "help": "seconds to wait for the pages to load before taking screenshot"},
-    #         "http_proxy": {"default": "", "help": "http proxy to use for the webdriver, eg http://proxy-user:password@proxy-ip:port"},
-    #         "save_to_pdf": {"default": False, "help": "save the page as pdf along with the screenshot. PDF saving options can be adjusted with the 'print_options' parameter"},
-    #         "print_options": {"default": {}, "help": "options to pass to the pdf printer"}
-    #     }
 
     def enrich(self, to_enrich: Metadata) -> None:
         url = to_enrich.get_url()
diff --git a/src/auto_archiver/modules/timestamping_enricher/__manifest__.py b/src/auto_archiver/modules/timestamping_enricher/__manifest__.py
index 904fde6..e4ac925 100644
--- a/src/auto_archiver/modules/timestamping_enricher/__manifest__.py
+++ b/src/auto_archiver/modules/timestamping_enricher/__manifest__.py
@@ -15,13 +15,28 @@
     "configs": {
         "tsa_urls": {
             "default": [
-                "http://timestamp.digicert.com",
-                "http://timestamp.identrust.com",
-                "http://timestamp.globalsign.com/tsa/r6advanced1",
-                "http://tss.accv.es:8318/tsa"
-            ],
+                    # [Adobe Approved Trust List] and [Windows Cert Store]
+                    "http://timestamp.digicert.com",
+                    "http://timestamp.identrust.com",
+                    # "https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping
+                    # "https://timestamp.sectigo.com", # wait 15 seconds between each request.
+
+                    # [Adobe: European Union Trusted Lists].
+                    # "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request.
+
+                    # [Windows Cert Store]
+                    "http://timestamp.globalsign.com/tsa/r6advanced1",
+                    # [Adobe: European Union Trusted Lists] and [Windows Cert Store]
+                    # "http://ts.quovadisglobal.com/eu", # not valid for timestamping
+                    # "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain
+                    # "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain
+                    # "http://tsa.sep.bg", # self-signed certificate in certificate chain
+                    # "http://tsa.izenpe.com", # unable to get local issuer certificate
+                    # "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate
+                    "http://tss.accv.es:8318/tsa",
+                ],
             "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.",
-            "type": auto_archiver.utils.parse_csv_to_set,
+            "type": "auto_archiver.utils.parse_csv_to_set",
         }
     },
     "description": """
diff --git a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py
index 0e159fa..c90d42c 100644
--- a/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py
+++ b/src/auto_archiver/modules/timestamping_enricher/timestamping_enricher.py
@@ -26,37 +26,6 @@ class TimestampingEnricher(Enricher):
     def __init__(self, config: dict) -> None:
         super().__init__(config)
 
-    # @staticmethod
-    # def configs() -> dict:
-    #     return {
-    #         "tsa_urls": {
-    #             "default": [
-    #                 # [Adobe Approved Trust List] and [Windows Cert Store]
-    #                 "http://timestamp.digicert.com",
-    #                 "http://timestamp.identrust.com",
-    #                 # "https://timestamp.entrust.net/TSS/RFC3161sha2TS", # not valid for timestamping
-    #                 # "https://timestamp.sectigo.com", # wait 15 seconds between each request.
-    #
-    #                 # [Adobe: European Union Trusted Lists].
-    #                 # "https://timestamp.sectigo.com/qualified", # wait 15 seconds between each request.
-    #
-    #                 # [Windows Cert Store]
-    #                 "http://timestamp.globalsign.com/tsa/r6advanced1",
-    #
-    #                 # [Adobe: European Union Trusted Lists] and [Windows Cert Store]
-    #                 # "http://ts.quovadisglobal.com/eu", # not valid for timestamping
-    #                 # "http://tsa.belgium.be/connect", # self-signed certificate in certificate chain
-    #                 # "https://timestamp.aped.gov.gr/qtss", # self-signed certificate in certificate chain
-    #                 # "http://tsa.sep.bg", # self-signed certificate in certificate chain
-    #                 # "http://tsa.izenpe.com", #unable to get local issuer certificate
-    #                 # "http://kstamp.keynectis.com/KSign", # unable to get local issuer certificate
-    #                 "http://tss.accv.es:8318/tsa",
-    #             ],
-    #             "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.",
-    #             "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
-    #         }
-    #     }
-
     def enrich(self, to_enrich: Metadata) -> None:
         url = to_enrich.get_url()
         logger.debug(f"RFC3161 timestamping existing files for {url=}")
diff --git a/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py b/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py
index 239a0bb..6e64269 100644
--- a/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py
+++ b/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py
@@ -12,7 +12,7 @@
     "configs": {
             "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"},
             "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line",
-                              "type": auto_archiver.utils.parse_csv_to_set,},
+                              "type": "auto_archiver.utils.parse_csv_to_set",},
             "consumer_key": {"default": None, "help": "twitter API consumer_key"},
             "consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
             "access_token": {"default": None, "help": "twitter API access_token"},
diff --git a/src/auto_archiver/utils/gsheet.py b/src/auto_archiver/utils/gsheet.py
index f84aab2..78f01c5 100644
--- a/src/auto_archiver/utils/gsheet.py
+++ b/src/auto_archiver/utils/gsheet.py
@@ -16,35 +16,6 @@ class Gsheets(Step):
         assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}"
         assert self.sheet is not None or self.sheet_id is not None, "You need to define either a 'sheet' name or a 'sheet_id' in your orchestration file when using gsheets."
 
-    @staticmethod
-    def configs() -> dict:
-        return {
-            "sheet": {"default": None, "help": "name of the sheet to archive"},
-            "sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"},
-            "header": {"default": 1, "help": "index of the header row (starts at 1)"},
-            "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
-            "columns": {
-                "default": {
-                    'url': 'link',
-                    'status': 'archive status',
-                    'folder': 'destination folder',
-                    'archive': 'archive location',
-                    'date': 'archive date',
-                    'thumbnail': 'thumbnail',
-                    'timestamp': 'upload timestamp',
-                    'title': 'upload title',
-                    'text': 'text content',
-                    'screenshot': 'screenshot',
-                    'hash': 'hash',
-                    'pdq_hash': 'perceptual hashes',
-                    'wacz': 'wacz',
-                    'replaywebpage': 'replaywebpage',
-                },
-                "help": "names of columns in the google sheet (stringified JSON object)",
-                "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
-            },
-        }
-
     def open_sheet(self):
         if self.sheet:
             return self.gsheets_client.open(self.sheet)