mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 21:28:29 +03:00
Fix and add types to manifest
This commit is contained in:
@@ -13,11 +13,9 @@
|
||||
"public": {"default": False, "help": "whether the URL should be publicly available via the API"},
|
||||
"author_id": {"default": None, "help": "which email to assign as author"},
|
||||
"group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
|
||||
"allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"},
|
||||
"store_results": {"default": True, "help": "when set, will send the results to the API database."},
|
||||
"tags": {"default": [], "help": "what tags to add to the archived URL",
|
||||
"type": "auto_archiver.utils.parse_csv_to_set",
|
||||
}
|
||||
"allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived", "type": "bool",},
|
||||
"store_results": {"default": True, "help": "when set, will send the results to the API database.", "type": "bool",},
|
||||
"tags": {"default": [], "help": "what tags to add to the archived URL",}
|
||||
},
|
||||
"description": """
|
||||
Provides integration with the Auto-Archiver API for querying and storing archival data.
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
"urls": {
|
||||
"default": None,
|
||||
"help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
|
||||
"type": "auto_archiver.utils.parse_csv_to_set",
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
"default": None,
|
||||
"help": "Path to the input file(s) to read the URLs from, comma separated. \
|
||||
Input files should be formatted with one URL per line",
|
||||
"type": "auto_archiver.utils.parse_csv_to_set",
|
||||
},
|
||||
"column": {
|
||||
"default": None,
|
||||
|
||||
@@ -9,12 +9,10 @@
|
||||
"allow_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
|
||||
"type": "auto_archiver.utils.parse_csv_to_set",
|
||||
},
|
||||
"block_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) explicitly block some worksheets from being processed",
|
||||
"type": "auto_archiver.utils.parse_csv_to_set",
|
||||
},
|
||||
"use_sheet_names_in_stored_paths": {
|
||||
"default": True,
|
||||
|
||||
@@ -7,7 +7,7 @@ from loguru import logger
|
||||
|
||||
from auto_archiver.base_processors import Database
|
||||
from auto_archiver.core import Metadata, Media, ArchivingContext
|
||||
from auto_archiver.utils import GWorksheet
|
||||
from auto_archiver.modules.gsheet_feeder import GWorksheet
|
||||
|
||||
|
||||
class GsheetsDb(Database):
|
||||
|
||||
@@ -1 +1,2 @@
|
||||
from .gworksheet import GWorksheet
|
||||
from .gsheet_feeder import GsheetsFeeder
|
||||
@@ -1,25 +1,48 @@
|
||||
{
|
||||
"name": "Google Sheets Procesor",
|
||||
"name": "Google Sheets Feeder",
|
||||
"type": ["feeder"],
|
||||
"entry_point": "gsheet_feeder::GsheetsFeeder",
|
||||
"entry_point": "GsheetsFeeder",
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "gspread", "python-slugify"],
|
||||
},
|
||||
"configs": {
|
||||
"sheet": {"default": None, "help": "name of the sheet to archive"},
|
||||
"sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"},
|
||||
"header": {"default": 1, "help": "index of the header row (starts at 1)"},
|
||||
"service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
|
||||
"columns": {
|
||||
"default": {
|
||||
'url': 'link',
|
||||
'status': 'archive status',
|
||||
'folder': 'destination folder',
|
||||
'archive': 'archive location',
|
||||
'date': 'archive date',
|
||||
'thumbnail': 'thumbnail',
|
||||
'timestamp': 'upload timestamp',
|
||||
'title': 'upload title',
|
||||
'text': 'text content',
|
||||
'screenshot': 'screenshot',
|
||||
'hash': 'hash',
|
||||
'pdq_hash': 'perceptual hashes',
|
||||
'wacz': 'wacz',
|
||||
'replaywebpage': 'replaywebpage',
|
||||
},
|
||||
"help": "names of columns in the google sheet (stringified JSON object)",
|
||||
"type": "auto_archiver.utils.json_loader",
|
||||
},
|
||||
"allow_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) only worksheets whose name is included in allow are included (overrides worksheet_block), leave empty so all are allowed",
|
||||
"type": "auto_archiver.utils.parse_csv_to_set",
|
||||
},
|
||||
"block_worksheets": {
|
||||
"default": set(),
|
||||
"help": "(CSV) explicitly block some worksheets from being processed",
|
||||
"type": "auto_archiver.utils.parse_csv_to_set",
|
||||
},
|
||||
"use_sheet_names_in_stored_paths": {
|
||||
"default": True,
|
||||
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
|
||||
"type": "bool",
|
||||
}
|
||||
},
|
||||
"description": """
|
||||
|
||||
@@ -8,23 +8,62 @@ The filtered rows are processed into `Metadata` objects.
|
||||
- validates the sheet's structure and filters rows based on input configurations.
|
||||
- Ensures only rows with valid URLs and unprocessed statuses are included.
|
||||
"""
|
||||
import gspread, os
|
||||
import os
|
||||
import gspread
|
||||
|
||||
from loguru import logger
|
||||
from slugify import slugify
|
||||
|
||||
from auto_archiver.base_processors import Feeder
|
||||
from auto_archiver.core import Metadata, ArchivingContext
|
||||
from auto_archiver.utils import Gsheets, GWorksheet
|
||||
from . import GWorksheet
|
||||
|
||||
|
||||
class GsheetsFeeder(Gsheets, Feeder):
|
||||
class GsheetsFeeder(Feeder):
|
||||
name = "gsheet_feeder"
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
self.gsheets_client = gspread.service_account(filename=self.service_account)
|
||||
# def __init__(self, config: dict) -> None:
|
||||
# """
|
||||
# Initializes the GsheetsFeeder with preloaded configurations.
|
||||
# """
|
||||
# super().__init__(config)
|
||||
# # Initialize the gspread client with the provided service account file
|
||||
# self.gsheets_client = gspread.service_account(filename=config["service_account"])
|
||||
#
|
||||
# # Set up feeder-specific configurations from the config
|
||||
# self.sheet_name = config.get("sheet")
|
||||
# self.sheet_id = config.get("sheet_id")
|
||||
# self.header = config.get("header", 1)
|
||||
# self.columns = config.get("columns", {})
|
||||
# assert self.sheet_name or self.sheet_id, (
|
||||
# "You need to define either a 'sheet' name or a 'sheet_id' in your manifest."
|
||||
# )
|
||||
|
||||
|
||||
# # Configuration attributes
|
||||
# self.sheet = config.get("sheet")
|
||||
# self.sheet_id = config.get("sheet_id")
|
||||
# self.header = config.get("header", 1)
|
||||
# self.columns = config.get("columns", {})
|
||||
# self.allow_worksheets = config.get("allow_worksheets", set())
|
||||
# self.block_worksheets = config.get("block_worksheets", set())
|
||||
# self.use_sheet_names_in_stored_paths = config.get("use_sheet_names_in_stored_paths", True)
|
||||
|
||||
# Ensure the header is an integer
|
||||
# try:
|
||||
# self.header = int(self.header)
|
||||
# except ValueError:
|
||||
# pass
|
||||
# assert isinstance(self.header, int), f"Header must be an integer, got {type(self.header)}"
|
||||
# assert self.sheet or self.sheet_id, "Either 'sheet' or 'sheet_id' must be defined."
|
||||
#
|
||||
|
||||
def open_sheet(self):
    """
    Open the configured spreadsheet through the gspread client.

    The sheet name (`self.sheet`) takes precedence; when it is not set,
    the spreadsheet is opened by its key (`self.sheet_id`).

    Returns:
        whatever `self.gsheets_client.open(...)` / `open_by_key(...)` returns.

    Raises:
        ValueError: if neither `sheet` nor `sheet_id` is configured.
    """
    if self.sheet:
        return self.gsheets_client.open(self.sheet)
    if self.sheet_id:
        return self.gsheets_client.open_by_key(self.sheet_id)
    # Fail early with a clear message instead of passing an empty key to the
    # client and surfacing an unrelated error from deeper in the stack.
    raise ValueError("Either 'sheet' or 'sheet_id' must be defined.")
|
||||
|
||||
|
||||
def __iter__(self) -> Metadata:
|
||||
sh = self.open_sheet()
|
||||
|
||||
108
src/auto_archiver/modules/gsheet_feeder/gworksheet.py
Normal file
108
src/auto_archiver/modules/gsheet_feeder/gworksheet.py
Normal file
@@ -0,0 +1,108 @@
|
||||
from gspread import utils
|
||||
|
||||
|
||||
class GWorksheet:
    """
    Makes read/write operations on a worksheet easier.

    Headers can be read from a custom row number, but the row references
    should always include the offset of the header.
    eg: if header=4, row 5 will be the first with data.

    All values are fetched once in the constructor and cached in
    `self.values`; only `get_cell(..., fresh=True)` queries the sheet again.
    """
    # Maps the internal column key to the (case-insensitive) header text
    # expected in the sheet. Kept in line with the default `columns` of the
    # gsheet_feeder manifest, which also defines 'text'.
    COLUMN_NAMES = {
        'url': 'link',
        'status': 'archive status',
        'folder': 'destination folder',
        'archive': 'archive location',
        'date': 'archive date',
        'thumbnail': 'thumbnail',
        'timestamp': 'upload timestamp',
        'title': 'upload title',
        'text': 'text content',
        'screenshot': 'screenshot',
        'hash': 'hash',
        'pdq_hash': 'perceptual hashes',
        'wacz': 'wacz',
        'replaywebpage': 'replaywebpage',
    }

    def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):
        """
        Args:
            worksheet: object exposing `get_values`, `cell`, `update_cell`
                and `batch_update` (presumably a gspread Worksheet).
            columns: mapping of internal column key -> header text.
            header_row: 1-based index of the row holding the headers.
        """
        self.wks = worksheet
        self.columns = columns
        self.values = self.wks.get_values()
        # Only read headers when the header row actually exists in the data;
        # this avoids an IndexError on short sheets and avoids header_row=0
        # silently picking the last row through negative indexing.
        if 1 <= header_row <= len(self.values):
            self.headers = [v.lower() for v in self.values[header_row - 1]]
        else:
            self.headers = []

    def _check_col_exists(self, col: str):
        """Raise if `col` is not one of the configured column keys."""
        if col not in self.columns:
            raise Exception(f'Column {col} is not in the configured column names: {self.columns.keys()}')

    def _col_index(self, col: str):
        """
        Return the 0-based position of `col`'s header in the sheet.

        Raises ValueError (from `list.index`) when the header is configured
        but not present in the sheet.
        """
        self._check_col_exists(col)
        return self.headers.index(self.columns[col].lower())

    def col_exists(self, col: str):
        """Return True when the header configured for `col` is in the sheet."""
        self._check_col_exists(col)
        return self.columns[col].lower() in self.headers

    def count_rows(self):
        """Return the number of cached rows (header rows included)."""
        return len(self.values)

    def get_row(self, row: int):
        """Return the cached list of values for `row` (1-based)."""
        return self.values[row - 1]

    def get_values(self):
        """Return all cached rows."""
        return self.values

    def get_cell(self, row, col: str, fresh=False):
        """
        returns the cell value from (row, col),
        where row can be an index (1-based) OR list of values
        as received from self.get_row(row)
        if fresh=True, the sheet is queried again for this cell
        (in that case `row` is passed straight to the worksheet, so it
        must be the 1-based index, not a list of values)
        """
        col_index = self._col_index(col)

        if fresh:
            return self.wks.cell(row, col_index + 1).value
        if isinstance(row, int):
            row = self.get_row(row)

        # rows can be shorter than the header row; treat missing cells as empty
        if col_index >= len(row):
            return ''
        return row[col_index]

    def get_cell_or_default(self, row, col: str, default: str = None, fresh=False, when_empty_use_default=True):
        """
        return self.get_cell or default value on error (eg: column is missing)

        When `when_empty_use_default` is True, a missing (None) or
        whitespace-only value also yields `default`.
        """
        try:
            val = self.get_cell(row, col, fresh)
            if when_empty_use_default and (val is None or val.strip() == ""):
                return default
            return val
        except Exception:
            # Deliberately best-effort, but do not swallow KeyboardInterrupt /
            # SystemExit the way a bare `except:` would.
            return default

    def set_cell(self, row: int, col: str, val):
        """Write `val` to (row, col) on the sheet; row is 1-based."""
        col_index = self._col_index(col) + 1
        self.wks.update_cell(row, col_index, val)

    def batch_set_cell(self, cell_updates):
        """
        receives a list of [(row:int, col:str, val)] and batch updates it, the parameters are the same as in the self.set_cell() method
        """
        payload = [
            {
                'range': self.to_a1(row, col),
                # values are stringified and truncated to 49999 characters
                # (presumably to stay under the Sheets per-cell size limit)
                'values': [[str(val)[0:49999]]]
            }
            for row, col, val in cell_updates
        ]
        if not payload:
            # nothing to write: skip the remote call entirely
            return
        self.wks.batch_update(payload, value_input_option='USER_ENTERED')

    def to_a1(self, row: int, col: str):
        """Return the A1 notation for (row, col); row is 1-based."""
        return utils.rowcol_to_a1(row, self._col_index(col) + 1)
|
||||
@@ -36,7 +36,6 @@
|
||||
"http://tss.accv.es:8318/tsa",
|
||||
],
|
||||
"help": "List of RFC3161 Time Stamp Authorities to use, separate with commas if passed via the command line.",
|
||||
"type": "auto_archiver.utils.parse_csv_to_set",
|
||||
}
|
||||
},
|
||||
"description": """
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
"configs": {
|
||||
"bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"},
|
||||
"bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line",
|
||||
"type": "auto_archiver.utils.parse_csv_to_set",},
|
||||
},
|
||||
"consumer_key": {"default": None, "help": "twitter API consumer_key"},
|
||||
"consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
|
||||
"access_token": {"default": None, "help": "twitter API access_token"},
|
||||
|
||||
Reference in New Issue
Block a user