From dd402b456f748173e5fe83581ea3e1331f8b9183 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Fri, 24 Jan 2025 18:50:11 +0000 Subject: [PATCH] Fix and add types to manifest --- README.md | 2 +- .../modules/api_db/__manifest__.py | 8 ++- .../modules/cli_feeder/__manifest__.py | 1 - .../modules/csv_feeder/__manifest__.py | 1 - .../modules/gsheet_db/__manifest__.py | 2 - .../modules/gsheet_db/gsheet_db.py | 2 +- .../modules/gsheet_feeder/__init__.py | 1 + .../modules/gsheet_feeder/__manifest__.py | 31 +++++++++-- .../modules/gsheet_feeder/gsheet_feeder.py | 53 ++++++++++++++++--- .../gsheet_feeder}/gworksheet.py | 0 .../timestamping_enricher/__manifest__.py | 1 - .../twitter_api_extractor/__manifest__.py | 2 +- src/auto_archiver/utils/misc.py | 4 -- 13 files changed, 80 insertions(+), 28 deletions(-) rename src/auto_archiver/{utils => modules/gsheet_feeder}/gworksheet.py (100%) diff --git a/README.md b/README.md index 1bd6ddd..c52c464 100644 --- a/README.md +++ b/README.md @@ -218,7 +218,7 @@ configurations: ## Running on Google Sheets Feeder (gsheet_feeder) The `--gsheet_feeder.sheet` property is the name of the Google Sheet to check for URLs. This sheet must have been shared with the Google Service account used by `gspread`. -This sheet must also have specific columns (case-insensitive) in the `header` as specified in [Gsheet.configs](src/auto_archiver/utils/gsheet.py). The default names of these columns and their purpose is: +This sheet must also have specific columns (case-insensitive) in the `header` as specified in [gsheet_feeder.__manifest__.py](src/auto_archiver/modules/gsheet_feeder/__manifest__.py). The default names of these columns and their purpose is: Inputs: diff --git a/src/auto_archiver/modules/api_db/__manifest__.py b/src/auto_archiver/modules/api_db/__manifest__.py index 4c85541..c89165f 100644 --- a/src/auto_archiver/modules/api_db/__manifest__.py +++ b/src/auto_archiver/modules/api_db/__manifest__.py @@ -13,11 +13,9 @@ "public": {"default": False, "help": "whether the URL should be publicly available via the API"}, "author_id": {"default": None, "help": "which email to assign as author"}, "group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"}, - "allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"}, - "store_results": {"default": True, "help": "when set, will send the results to the API database."}, - "tags": {"default": [], "help": "what tags to add to the archived URL", - "type": "auto_archiver.utils.parse_csv_to_set", - } + "allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived", "type": "bool",}, + "store_results": {"default": True, "help": "when set, will send the results to the API database.", "type": "bool",}, + "tags": {"default": [], "help": "what tags to add to the archived URL",} }, "description": """ Provides integration with the Auto-Archiver API for querying and storing archival data. diff --git a/src/auto_archiver/modules/cli_feeder/__manifest__.py b/src/auto_archiver/modules/cli_feeder/__manifest__.py index 6f62cd2..febebd0 100644 --- a/src/auto_archiver/modules/cli_feeder/__manifest__.py +++ b/src/auto_archiver/modules/cli_feeder/__manifest__.py @@ -9,7 +9,6 @@ "urls": { "default": None, "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml", - "type": "auto_archiver.utils.parse_csv_to_set", }, }, "description": """ diff --git a/src/auto_archiver/modules/csv_feeder/__manifest__.py b/src/auto_archiver/modules/csv_feeder/__manifest__.py index 7e84a43..4d19b70 100644 --- a/src/auto_archiver/modules/csv_feeder/__manifest__.py +++ b/src/auto_archiver/modules/csv_feeder/__manifest__.py @@ -11,7 +11,6 @@ "default": None, "help": "Path to the input file(s) to read the URLs from, comma separated. \ Input files should be formatted with one URL per line", - "type": "auto_archiver.utils.parse_csv_to_set", }, "column": { "default": None, diff --git a/src/auto_archiver/modules/gsheet_db/__manifest__.py b/src/auto_archiver/modules/gsheet_db/__manifest__.py index edc8d24..8c54fe5 100644 --- a/src/auto_archiver/modules/gsheet_db/__manifest__.py +++ b/src/auto_archiver/modules/gsheet_db/__manifest__.py @@ -9,12 +9,10 @@ "allow_worksheets": { "default": set(), "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", - "type": "auto_archiver.utils.parse_csv_to_set", }, "block_worksheets": { "default": set(), "help": "(CSV) explicitly block some worksheets from being processed", - "type": "auto_archiver.utils.parse_csv_to_set", }, "use_sheet_names_in_stored_paths": { "default": True, diff --git a/src/auto_archiver/modules/gsheet_db/gsheet_db.py b/src/auto_archiver/modules/gsheet_db/gsheet_db.py index 9ed3642..239bc06 100644 --- a/src/auto_archiver/modules/gsheet_db/gsheet_db.py +++ b/src/auto_archiver/modules/gsheet_db/gsheet_db.py @@ -7,7 +7,7 @@ from loguru import logger from auto_archiver.base_processors import Database from auto_archiver.core import Metadata, Media, ArchivingContext -from auto_archiver.utils import GWorksheet +from auto_archiver.modules.gsheet_feeder import GWorksheet class GsheetsDb(Database): diff --git a/src/auto_archiver/modules/gsheet_feeder/__init__.py b/src/auto_archiver/modules/gsheet_feeder/__init__.py index f122bb2..bb4230a 100644 --- a/src/auto_archiver/modules/gsheet_feeder/__init__.py +++ b/src/auto_archiver/modules/gsheet_feeder/__init__.py @@ -1 +1,2 @@ +from .gworksheet import GWorksheet from .gsheet_feeder import GsheetsFeeder \ No newline at end of file diff --git a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py index cb58035..685a8fd 100644 --- a/src/auto_archiver/modules/gsheet_feeder/__manifest__.py +++ b/src/auto_archiver/modules/gsheet_feeder/__manifest__.py @@ -1,25 +1,48 @@ { - "name": "Google Sheets Procesor", + "name": "Google Sheets Feeder", "type": ["feeder"], - "entry_point": "gsheet_feeder::GsheetsFeeder", + "entry_point": "GsheetsFeeder", "requires_setup": True, "external_dependencies": { "python": ["loguru", "gspread", "python-slugify"], }, "configs": { + "sheet": {"default": None, "help": "name of the sheet to archive"}, + "sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"}, + "header": {"default": 1, "help": "index of the header row (starts at 1)"}, + "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"}, + "columns": { + "default": { + 'url': 'link', + 'status': 'archive status', + 'folder': 'destination folder', + 'archive': 'archive location', + 'date': 'archive date', + 'thumbnail': 'thumbnail', + 'timestamp': 'upload timestamp', + 'title': 'upload title', + 'text': 'text content', + 'screenshot': 'screenshot', + 'hash': 'hash', + 'pdq_hash': 'perceptual hashes', + 'wacz': 'wacz', + 'replaywebpage': 'replaywebpage', + }, + "help": "names of columns in the google sheet (stringified JSON object)", + "type": "auto_archiver.utils.json_loader", + }, "allow_worksheets": { "default": set(), "help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed", - "type": "auto_archiver.utils.parse_csv_to_set", }, "block_worksheets": { "default": set(), "help": "(CSV) explicitly block some worksheets from being processed", - "type": "auto_archiver.utils.parse_csv_to_set", }, "use_sheet_names_in_stored_paths": { "default": True, "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'", + "type": "bool", } }, "description": """ diff --git a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py index 01cd3b3..321711e 100644 --- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py +++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py @@ -8,23 +8,62 @@ The filtered rows are processed into `Metadata` objects. - validates the sheet's structure and filters rows based on input configurations. - Ensures only rows with valid URLs and unprocessed statuses are included. """ -import gspread, os +import os +import gspread from loguru import logger from slugify import slugify from auto_archiver.base_processors import Feeder from auto_archiver.core import Metadata, ArchivingContext -from auto_archiver.utils import Gsheets, GWorksheet +from . import GWorksheet -class GsheetsFeeder(Gsheets, Feeder): +class GsheetsFeeder(Feeder): name = "gsheet_feeder" - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - self.gsheets_client = gspread.service_account(filename=self.service_account) + # def __init__(self, config: dict) -> None: + # """ + # Initializes the GsheetsFeeder with preloaded configurations. + # """ + # super().__init__(config) + # # Initialize the gspread client with the provided service account file + # self.gsheets_client = gspread.service_account(filename=config["service_account"]) + # + # # Set up feeder-specific configurations from the config + # self.sheet_name = config.get("sheet") + # self.sheet_id = config.get("sheet_id") + # self.header = config.get("header", 1) + # self.columns = config.get("columns", {}) + # assert self.sheet_name or self.sheet_id, ( + # "You need to define either a 'sheet' name or a 'sheet_id' in your manifest." + # ) + + + # # Configuration attributes + # self.sheet = config.get("sheet") + # self.sheet_id = config.get("sheet_id") + # self.header = config.get("header", 1) + # self.columns = config.get("columns", {}) + # self.allow_worksheets = config.get("allow_worksheets", set()) + # self.block_worksheets = config.get("block_worksheets", set()) + # self.use_sheet_names_in_stored_paths = config.get("use_sheet_names_in_stored_paths", True) + + # Ensure the header is an integer + # try: + # self.header = int(self.header) + # except ValueError: + # pass + # assert isinstance(self.header, int), f"Header must be an integer, got {type(self.header)}" + # assert self.sheet or self.sheet_id, "Either 'sheet' or 'sheet_id' must be defined." + # + + def open_sheet(self): + if self.sheet: + return self.gsheets_client.open(self.sheet) + else: # self.sheet_id + return self.gsheets_client.open_by_key(self.sheet_id) + def __iter__(self) -> Metadata: sh = self.open_sheet() diff --git a/src/auto_archiver/utils/gworksheet.py b/src/auto_archiver/modules/gsheet_feeder/gworksheet.py similarity index 100% rename from src/auto_archiver/utils/gworksheet.py rename to src/auto_archiver/modules/gsheet_feeder/gworksheet.py diff --git a/src/auto_archiver/modules/timestamping_enricher/__manifest__.py b/src/auto_archiver/modules/timestamping_enricher/__manifest__.py index e4ac925..496d211 100644 --- a/src/auto_archiver/modules/timestamping_enricher/__manifest__.py +++ b/src/auto_archiver/modules/timestamping_enricher/__manifest__.py @@ -36,7 +36,6 @@ "http://tss.accv.es:8318/tsa", ], "help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.", - "type": "auto_archiver.utils.parse_csv_to_set", } }, "description": """ diff --git a/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py b/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py index 6e64269..02d0d6c 100644 --- a/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py +++ b/src/auto_archiver/modules/twitter_api_extractor/__manifest__.py @@ -12,7 +12,7 @@ "configs": { "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"}, "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line", - "type": "auto_archiver.utils.parse_csv_to_set",}, + }, "consumer_key": {"default": None, "help": "twitter API consumer_key"}, "consumer_secret": {"default": None, "help": "twitter API consumer_secret"}, "access_token": {"default": None, "help": "twitter API access_token"}, diff --git a/src/auto_archiver/utils/misc.py b/src/auto_archiver/utils/misc.py index ad16401..e985e3e 100644 --- a/src/auto_archiver/utils/misc.py +++ b/src/auto_archiver/utils/misc.py @@ -55,9 +55,5 @@ def random_str(length: int = 32) -> str: assert length <= 32, "length must be less than 32 as UUID4 is used" return str(uuid.uuid4()).replace("-", "")[:length] - -def parse_csv_to_set(cli_val, cur_val): - return set(cli_val.split(",")) - def json_loader(cli_val): return json.loads(cli_val)