Fix and add types to manifest

This commit is contained in:
erinhmclark
2025-01-24 18:50:11 +00:00
parent 1942e8b819
commit dd402b456f
13 changed files with 80 additions and 28 deletions

View File

@@ -218,7 +218,7 @@ configurations:
## Running on Google Sheets Feeder (gsheet_feeder)
The `--gsheet_feeder.sheet` property is the name of the Google Sheet to check for URLs.
This sheet must have been shared with the Google Service account used by `gspread`.
This sheet must also have specific columns (case-insensitive) in the `header` as specified in [Gsheet.configs](src/auto_archiver/utils/gsheet.py). The default names of these columns and their purposes are:
This sheet must also have specific columns (case-insensitive) in the `header` as specified in [gsheet_feeder.__manifest__.py](src/auto_archiver/modules/gsheet_feeder/__manifest__.py). The default names of these columns and their purposes are:
Inputs:

View File

@@ -13,11 +13,9 @@
"public": {"default": False, "help": "whether the URL should be publicly available via the API"},
"author_id": {"default": None, "help": "which email to assign as author"},
"group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
"allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"},
"store_results": {"default": True, "help": "when set, will send the results to the API database."},
"tags": {"default": [], "help": "what tags to add to the archived URL",
"type": "auto_archiver.utils.parse_csv_to_set",
}
"allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived", "type": "bool",},
"store_results": {"default": True, "help": "when set, will send the results to the API database.", "type": "bool",},
"tags": {"default": [], "help": "what tags to add to the archived URL",}
},
"description": """
Provides integration with the Auto-Archiver API for querying and storing archival data.

View File

@@ -9,7 +9,6 @@
"urls": {
"default": None,
"help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
"type": "auto_archiver.utils.parse_csv_to_set",
},
},
"description": """

View File

@@ -11,7 +11,6 @@
"default": None,
"help": "Path to the input file(s) to read the URLs from, comma separated. \
Input files should be formatted with one URL per line",
"type": "auto_archiver.utils.parse_csv_to_set",
},
"column": {
"default": None,

View File

@@ -9,12 +9,10 @@
"allow_worksheets": {
"default": set(),
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
"type": "auto_archiver.utils.parse_csv_to_set",
},
"block_worksheets": {
"default": set(),
"help": "(CSV) explicitly block some worksheets from being processed",
"type": "auto_archiver.utils.parse_csv_to_set",
},
"use_sheet_names_in_stored_paths": {
"default": True,

View File

@@ -7,7 +7,7 @@ from loguru import logger
from auto_archiver.base_processors import Database
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.utils import GWorksheet
from auto_archiver.modules.gsheet_feeder import GWorksheet
class GsheetsDb(Database):

View File

@@ -1 +1,2 @@
from .gworksheet import GWorksheet
from .gsheet_feeder import GsheetsFeeder

View File

@@ -1,25 +1,48 @@
{
"name": "Google Sheets Procesor",
"name": "Google Sheets Feeder",
"type": ["feeder"],
"entry_point": "gsheet_feeder::GsheetsFeeder",
"entry_point": "GsheetsFeeder",
"requires_setup": True,
"external_dependencies": {
"python": ["loguru", "gspread", "python-slugify"],
},
"configs": {
"sheet": {"default": None, "help": "name of the sheet to archive"},
"sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"},
"header": {"default": 1, "help": "index of the header row (starts at 1)"},
"service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
"columns": {
"default": {
'url': 'link',
'status': 'archive status',
'folder': 'destination folder',
'archive': 'archive location',
'date': 'archive date',
'thumbnail': 'thumbnail',
'timestamp': 'upload timestamp',
'title': 'upload title',
'text': 'text content',
'screenshot': 'screenshot',
'hash': 'hash',
'pdq_hash': 'perceptual hashes',
'wacz': 'wacz',
'replaywebpage': 'replaywebpage',
},
"help": "names of columns in the google sheet (stringified JSON object)",
"type": "auto_archiver.utils.json_loader",
},
"allow_worksheets": {
"default": set(),
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
"type": "auto_archiver.utils.parse_csv_to_set",
},
"block_worksheets": {
"default": set(),
"help": "(CSV) explicitly block some worksheets from being processed",
"type": "auto_archiver.utils.parse_csv_to_set",
},
"use_sheet_names_in_stored_paths": {
"default": True,
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
"type": "bool",
}
},
"description": """

View File

@@ -8,23 +8,62 @@ The filtered rows are processed into `Metadata` objects.
- validates the sheet's structure and filters rows based on input configurations.
- Ensures only rows with valid URLs and unprocessed statuses are included.
"""
import gspread, os
import os
import gspread
from loguru import logger
from slugify import slugify
from auto_archiver.base_processors import Feeder
from auto_archiver.core import Metadata, ArchivingContext
from auto_archiver.utils import Gsheets, GWorksheet
from . import GWorksheet
class GsheetsFeeder(Gsheets, Feeder):
class GsheetsFeeder(Feeder):
name = "gsheet_feeder"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
self.gsheets_client = gspread.service_account(filename=self.service_account)
# def __init__(self, config: dict) -> None:
# """
# Initializes the GsheetsFeeder with preloaded configurations.
# """
# super().__init__(config)
# # Initialize the gspread client with the provided service account file
# self.gsheets_client = gspread.service_account(filename=config["service_account"])
#
# # Set up feeder-specific configurations from the config
# self.sheet_name = config.get("sheet")
# self.sheet_id = config.get("sheet_id")
# self.header = config.get("header", 1)
# self.columns = config.get("columns", {})
# assert self.sheet_name or self.sheet_id, (
# "You need to define either a 'sheet' name or a 'sheet_id' in your manifest."
# )
# # Configuration attributes
# self.sheet = config.get("sheet")
# self.sheet_id = config.get("sheet_id")
# self.header = config.get("header", 1)
# self.columns = config.get("columns", {})
# self.allow_worksheets = config.get("allow_worksheets", set())
# self.block_worksheets = config.get("block_worksheets", set())
# self.use_sheet_names_in_stored_paths = config.get("use_sheet_names_in_stored_paths", True)
# Ensure the header is an integer
# try:
# self.header = int(self.header)
# except ValueError:
# pass
# assert isinstance(self.header, int), f"Header must be an integer, got {type(self.header)}"
# assert self.sheet or self.sheet_id, "Either 'sheet' or 'sheet_id' must be defined."
#
def open_sheet(self):
    """Open the configured Google spreadsheet.

    The sheet name (``self.sheet``) takes precedence; ``self.sheet_id`` is
    only used when no name is configured.

    Returns:
        Whatever ``self.gsheets_client.open`` / ``open_by_key`` returns.

    Raises:
        ValueError: if neither ``sheet`` nor ``sheet_id`` is set.
    """
    if self.sheet:
        return self.gsheets_client.open(self.sheet)
    if self.sheet_id:
        return self.gsheets_client.open_by_key(self.sheet_id)
    # Fail early with a clear message instead of passing an empty key to
    # the client and surfacing an unrelated error from deeper in the stack.
    raise ValueError("Either 'sheet' or 'sheet_id' must be defined.")
def __iter__(self) -> Metadata:
sh = self.open_sheet()

View File

@@ -36,7 +36,6 @@
"http://tss.accv.es:8318/tsa",
],
"help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.",
"type": "auto_archiver.utils.parse_csv_to_set",
}
},
"description": """

View File

@@ -12,7 +12,7 @@
"configs": {
"bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"},
"bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line",
"type": "auto_archiver.utils.parse_csv_to_set",},
},
"consumer_key": {"default": None, "help": "twitter API consumer_key"},
"consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
"access_token": {"default": None, "help": "twitter API access_token"},

View File

@@ -55,9 +55,5 @@ def random_str(length: int = 32) -> str:
assert length <= 32, "length must be less than 32 as UUID4 is used"
return str(uuid.uuid4()).replace("-", "")[:length]
def parse_csv_to_set(cli_val, cur_val):
    """Split a comma-separated command-line value into a set of strings.

    Surrounding whitespace is stripped from every item and empty items are
    discarded, so ``"a, b,"`` yields ``{"a", "b"}`` and ``""`` yields an
    empty set rather than ``{""}``.

    Args:
        cli_val: the comma-separated string to parse.
        cur_val: unused by this function; kept so the two-argument call
            signature stays unchanged.

    Returns:
        A ``set`` of the non-empty, stripped items.
    """
    stripped_items = (item.strip() for item in cli_val.split(","))
    return {item for item in stripped_items if item}
def json_loader(cli_val):
    """Decode a JSON document supplied as text.

    Args:
        cli_val: a ``str``/``bytes``/``bytearray`` holding JSON, or a value
            that has already been decoded (for example a ``dict``).

    Returns:
        The decoded Python object. Values that are not text are returned
        unchanged, so an already-parsed value does not raise ``TypeError``.

    Raises:
        json.JSONDecodeError: if the text is not valid JSON.
    """
    if isinstance(cli_val, (str, bytes, bytearray)):
        return json.loads(cli_val)
    return cli_val