From dd402b456f748173e5fe83581ea3e1331f8b9183 Mon Sep 17 00:00:00 2001
From: erinhmclark <erinhannahmary.clark@gmail.com>
Date: Fri, 24 Jan 2025 18:50:11 +0000
Subject: [PATCH] Fix and add types to manifest

---
 README.md                                     |  2 +-
 .../modules/api_db/__manifest__.py            |  8 ++-
 .../modules/cli_feeder/__manifest__.py        |  1 -
 .../modules/csv_feeder/__manifest__.py        |  1 -
 .../modules/gsheet_db/__manifest__.py         |  2 -
 .../modules/gsheet_db/gsheet_db.py            |  2 +-
 .../modules/gsheet_feeder/__init__.py         |  1 +
 .../modules/gsheet_feeder/__manifest__.py     | 31 +++++++++--
 .../modules/gsheet_feeder/gsheet_feeder.py    | 53 ++++++++++++++++---
 .../gsheet_feeder}/gworksheet.py              |  0
 .../timestamping_enricher/__manifest__.py     |  1 -
 .../twitter_api_extractor/__manifest__.py     |  2 +-
 src/auto_archiver/utils/misc.py               |  4 --
 13 files changed, 80 insertions(+), 28 deletions(-)
 rename src/auto_archiver/{utils => modules/gsheet_feeder}/gworksheet.py (100%)

diff --git a/README.md b/README.md
index 1bd6ddd..c52c464 100644
--- a/README.md
+++ b/README.md
@@ -218,7 +218,7 @@ configurations:
 ## Running on Google Sheets Feeder (gsheet_feeder)
 The `--gsheet_feeder.sheet` property is the name of the Google Sheet to check for URLs. 
 This sheet must have been shared with the Google Service account used by `gspread`. 
-This sheet must also have specific columns (case-insensitive) in the `header` as specified in [Gsheet.configs](src/auto_archiver/utils/gsheet.py). The default names of these columns and their purpose is:
+This sheet must also have specific columns (case-insensitive) in the `header` as specified in [`gsheet_feeder.__manifest__.py`](src/auto_archiver/modules/gsheet_feeder/__manifest__.py). The default names of these columns and their purposes are:
 
 Inputs:
 
diff --git a/src/auto_archiver/modules/api_db/__manifest__.py b/src/auto_archiver/modules/api_db/__manifest__.py
index 4c85541..c89165f 100644
--- a/src/auto_archiver/modules/api_db/__manifest__.py
+++ b/src/auto_archiver/modules/api_db/__manifest__.py
@@ -13,11 +13,9 @@
             "public": {"default": False, "help": "whether the URL should be publicly available via the API"},
             "author_id": {"default": None, "help": "which email to assign as author"},
             "group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
-            "allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"},
-            "store_results": {"default": True, "help": "when set, will send the results to the API database."},
-            "tags": {"default": [], "help": "what tags to add to the archived URL",
-                     "type": "auto_archiver.utils.parse_csv_to_set",
-                     }
+            "allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived", "type": "bool",},
+            "store_results": {"default": True, "help": "when set, will send the results to the API database.", "type": "bool",},
+            "tags": {"default": [], "help": "what tags to add to the archived URL",}
         },
     "description": """
      Provides integration with the Auto-Archiver API for querying and storing archival data.
diff --git a/src/auto_archiver/modules/cli_feeder/__manifest__.py b/src/auto_archiver/modules/cli_feeder/__manifest__.py
index 6f62cd2..febebd0 100644
--- a/src/auto_archiver/modules/cli_feeder/__manifest__.py
+++ b/src/auto_archiver/modules/cli_feeder/__manifest__.py
@@ -9,7 +9,6 @@
         "urls": {
             "default": None,
             "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
-            "type": "auto_archiver.utils.parse_csv_to_set",
         },
     },
     "description": """
diff --git a/src/auto_archiver/modules/csv_feeder/__manifest__.py b/src/auto_archiver/modules/csv_feeder/__manifest__.py
index 7e84a43..4d19b70 100644
--- a/src/auto_archiver/modules/csv_feeder/__manifest__.py
+++ b/src/auto_archiver/modules/csv_feeder/__manifest__.py
@@ -11,7 +11,6 @@
                 "default": None,
                 "help": "Path to the input file(s) to read the URLs from, comma separated. \
                         Input files should be formatted with one URL per line",
-                "type": "auto_archiver.utils.parse_csv_to_set",
             },
             "column": {
                 "default": None,
diff --git a/src/auto_archiver/modules/gsheet_db/__manifest__.py b/src/auto_archiver/modules/gsheet_db/__manifest__.py
index edc8d24..8c54fe5 100644
--- a/src/auto_archiver/modules/gsheet_db/__manifest__.py
+++ b/src/auto_archiver/modules/gsheet_db/__manifest__.py
@@ -9,12 +9,10 @@
         "allow_worksheets": {
             "default": set(),
             "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
-            "type": "auto_archiver.utils.parse_csv_to_set",
         },
         "block_worksheets": {
             "default": set(),
             "help": "(CSV) explicitly block some worksheets from being processed",
-            "type": "auto_archiver.utils.parse_csv_to_set",
         },
         "use_sheet_names_in_stored_paths": {
             "default": True,
diff --git a/src/auto_archiver/modules/gsheet_db/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py
index 9ed3642..239bc06 100644
--- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py
+++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py
@@ -7,7 +7,7 @@ from loguru import logger
 
 from auto_archiver.base_processors import Database
 from auto_archiver.core import Metadata, Media, ArchivingContext
-from auto_archiver.utils import GWorksheet
+from auto_archiver.modules.gsheet_feeder import GWorksheet
 
 
 class GsheetsDb(Database):
diff --git a/src/auto_archiver/modules/gsheet_feeder/__init__.py b/src/auto_archiver/modules/gsheet_feeder/__init__.py
index f122bb2..bb4230a 100644
--- a/src/auto_archiver/modules/gsheet_feeder/__init__.py
+++ b/src/auto_archiver/modules/gsheet_feeder/__init__.py
@@ -1 +1,2 @@
+from .gworksheet import GWorksheet
 from .gsheet_feeder import GsheetsFeeder
\ No newline at end of file
diff --git a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py
index cb58035..685a8fd 100644
--- a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py
+++ b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py
@@ -1,25 +1,48 @@
 {
-    "name": "Google Sheets Procesor",
+    "name": "Google Sheets Feeder",
     "type": ["feeder"],
-    "entry_point": "gsheet_feeder::GsheetsFeeder",
+    "entry_point": "GsheetsFeeder",
     "requires_setup": True,
     "external_dependencies": {
         "python": ["loguru", "gspread", "python-slugify"],
     },
     "configs": {
+            "sheet": {"default": None, "help": "name of the sheet to archive"},
+            "sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"},
+            "header": {"default": 1, "help": "index of the header row (starts at 1)"},
+            "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
+            "columns": {
+                "default": {
+                    'url': 'link',
+                    'status': 'archive status',
+                    'folder': 'destination folder',
+                    'archive': 'archive location',
+                    'date': 'archive date',
+                    'thumbnail': 'thumbnail',
+                    'timestamp': 'upload timestamp',
+                    'title': 'upload title',
+                    'text': 'text content',
+                    'screenshot': 'screenshot',
+                    'hash': 'hash',
+                    'pdq_hash': 'perceptual hashes',
+                    'wacz': 'wacz',
+                    'replaywebpage': 'replaywebpage',
+                },
+                "help": "names of columns in the google sheet (stringified JSON object)",
+                "type": "auto_archiver.utils.json_loader",
+            },
         "allow_worksheets": {
             "default": set(),
             "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
-            "type": "auto_archiver.utils.parse_csv_to_set",
         },
         "block_worksheets": {
             "default": set(),
             "help": "(CSV) explicitly block some worksheets from being processed",
-            "type": "auto_archiver.utils.parse_csv_to_set",
         },
         "use_sheet_names_in_stored_paths": {
             "default": True,
             "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
+            "type": "bool",
         }
     },
     "description": """
diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
index 01cd3b3..321711e 100644
--- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
+++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
@@ -8,23 +8,62 @@ The filtered rows are processed into `Metadata` objects.
 - validates the sheet's structure and filters rows based on input configurations.
 - Ensures only rows with valid URLs and unprocessed statuses are included.
 """
-import gspread, os
+import os
+import gspread
 
 from loguru import logger
 from slugify import slugify
 
 from auto_archiver.base_processors import Feeder
 from auto_archiver.core import Metadata, ArchivingContext
-from auto_archiver.utils import Gsheets, GWorksheet
+from . import GWorksheet
 
 
-class GsheetsFeeder(Gsheets, Feeder):
+class GsheetsFeeder(Feeder):
     name = "gsheet_feeder"
 
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-        self.gsheets_client = gspread.service_account(filename=self.service_account)
+    # def __init__(self, config: dict) -> None:
+    #     """
+    #     Initializes the GsheetsFeeder with preloaded configurations.
+    #     """
+    #     super().__init__(config)
+    #     # Initialize the gspread client with the provided service account file
+    #     self.gsheets_client = gspread.service_account(filename=config["service_account"])
+    #
+    #     # Set up feeder-specific configurations from the config
+    #     self.sheet_name = config.get("sheet")
+    #     self.sheet_id = config.get("sheet_id")
+    #     self.header = config.get("header", 1)
+    #     self.columns = config.get("columns", {})
+    #     assert self.sheet_name or self.sheet_id, (
+    #         "You need to define either a 'sheet' name or a 'sheet_id' in your manifest."
+    #     )
+
+
+        # # Configuration attributes
+        # self.sheet = config.get("sheet")
+        # self.sheet_id = config.get("sheet_id")
+        # self.header = config.get("header", 1)
+        # self.columns = config.get("columns", {})
+        # self.allow_worksheets = config.get("allow_worksheets", set())
+        # self.block_worksheets = config.get("block_worksheets", set())
+        # self.use_sheet_names_in_stored_paths = config.get("use_sheet_names_in_stored_paths", True)
+
+        # Ensure the header is an integer
+    #     try:
+    #         self.header = int(self.header)
+    #     except ValueError:
+    #         pass
+    #     assert isinstance(self.header, int), f"Header must be an integer, got {type(self.header)}"
+    #     assert self.sheet or self.sheet_id, "Either 'sheet' or 'sheet_id' must be defined."
+    #
+
+    def open_sheet(self):
+        """Open the spreadsheet by name when `sheet` is set, otherwise by key via `sheet_id`."""
+        if not self.sheet:
+            return self.gsheets_client.open_by_key(self.sheet_id)
+        return self.gsheets_client.open(self.sheet)
+
 
     def __iter__(self) -> Metadata:
         sh = self.open_sheet()
diff --git a/src/auto_archiver/utils/gworksheet.py b/src/auto_archiver/modules/gsheet_feeder/gworksheet.py
similarity index 100%
rename from src/auto_archiver/utils/gworksheet.py
rename to src/auto_archiver/modules/gsheet_feeder/gworksheet.py
diff --git a/src/auto_archiver/modules/timestamping_enricher/__manifest__.py b/src/auto_archiver/modules/timestamping_enricher/__manifest__.py
index e4ac925..496d211 100644
--- a/src/auto_archiver/modules/timestamping_enricher/__manifest__.py
+++ b/src/auto_archiver/modules/timestamping_enricher/__manifest__.py
@@ -36,7 +36,6 @@
                     "http://tss.accv.es:8318/tsa",
                 ],
             "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.",
-            "type": "auto_archiver.utils.parse_csv_to_set",
         }
     },
     "description": """
diff --git a/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py b/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py
index 6e64269..02d0d6c 100644
--- a/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py
+++ b/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py
@@ -12,7 +12,7 @@
     "configs": {
             "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"},
             "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line",
-                              "type": "auto_archiver.utils.parse_csv_to_set",},
+                              },
             "consumer_key": {"default": None, "help": "twitter API consumer_key"},
             "consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
             "access_token": {"default": None, "help": "twitter API access_token"},
diff --git a/src/auto_archiver/utils/misc.py b/src/auto_archiver/utils/misc.py
index ad16401..e985e3e 100644
--- a/src/auto_archiver/utils/misc.py
+++ b/src/auto_archiver/utils/misc.py
@@ -55,9 +55,5 @@ def random_str(length: int = 32) -> str:
     assert length <= 32, "length must be less than 32 as UUID4 is used"
     return str(uuid.uuid4()).replace("-", "")[:length]
 
-
-def parse_csv_to_set(cli_val, cur_val):
-    return set(cli_val.split(","))
-
 def json_loader(cli_val):
     return json.loads(cli_val)