mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 05:08:28 +03:00
Context related fixes, some more tests.
This commit is contained in:
@@ -44,14 +44,14 @@ class GsheetsFeeder(Feeder):
|
||||
logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}')
|
||||
gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
|
||||
if len(missing_cols := self.missing_required_columns(gw)):
|
||||
logger.warning(f"SKIPPED worksheet '{wks.title}' due to missing required column(s) for {missing_cols}")
|
||||
logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}")
|
||||
continue
|
||||
|
||||
# process and yield metadata here:
|
||||
yield from self._process_rows(gw)
|
||||
logger.success(f'Finished worksheet {worksheet.title}')
|
||||
|
||||
def _process_rows(self, gw: GWorksheet) -> Metadata:
|
||||
def _process_rows(self, gw: GWorksheet):
|
||||
for row in range(1 + self.header, gw.count_rows() + 1):
|
||||
url = gw.get_cell(row, 'url').strip()
|
||||
if not len(url): continue
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
"type": ["storage"],
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": ["boto3", "loguru"],
|
||||
"python": ["hash_enricher", "boto3", "loguru"],
|
||||
},
|
||||
"configs": {
|
||||
"path_generator": {
|
||||
@@ -49,5 +49,6 @@
|
||||
- Requires S3 credentials (API key and secret) and a bucket name to function.
|
||||
- The `random_no_duplicate` option ensures no duplicate uploads by leveraging hash-based folder structures.
|
||||
- Uses `boto3` for interaction with the S3 API.
|
||||
- Depends on the `HashEnricher` module for hash calculation.
|
||||
"""
|
||||
}
|
||||
|
||||
@@ -9,10 +9,11 @@ from auto_archiver.core import Media
|
||||
from auto_archiver.core import Storage
|
||||
from auto_archiver.modules.hash_enricher import HashEnricher
|
||||
from auto_archiver.utils.misc import random_str
|
||||
from auto_archiver.core.module import get_module
|
||||
|
||||
NO_DUPLICATES_FOLDER = "no-dups/"
|
||||
|
||||
class S3Storage(Storage, HashEnricher):
|
||||
class S3Storage(Storage):
|
||||
|
||||
def setup(self, config: dict) -> None:
|
||||
super().setup(config)
|
||||
@@ -49,7 +50,8 @@ class S3Storage(Storage, HashEnricher):
|
||||
def is_upload_needed(self, media: Media) -> bool:
|
||||
if self.random_no_duplicate:
|
||||
# checks if a folder with the hash already exists, if so it skips the upload
|
||||
hd = self.calculate_hash(media.filename)
|
||||
he = get_module('hash_enricher', self.config)
|
||||
hd = he.calculate_hash(media.filename)
|
||||
path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])
|
||||
|
||||
if existing_key:=self.file_in_folder(path):
|
||||
|
||||
@@ -1,53 +0,0 @@
|
||||
import json, gspread
|
||||
|
||||
from ..core import BaseModule
|
||||
|
||||
|
||||
class Gsheets(BaseModule):
    """Common base for Google Sheets steps.

    Builds a gspread client from a service-account file, validates the
    ``header`` and ``sheet``/``sheet_id`` settings, and exposes
    :meth:`open_sheet` to open the configured spreadsheet.

    NOTE(review): ``self.service_account``, ``self.header``, ``self.sheet``
    and ``self.sheet_id`` are presumably populated by the base initialiser
    from the values declared in :meth:`configs` -- confirm against BaseModule.
    """

    name = "gsheets"

    def __init__(self, config: dict) -> None:
        """Initialise the module and create the gspread client.

        Args:
            config: orchestration configuration forwarded to the base class.

        Raises:
            AssertionError: if ``header`` cannot be interpreted as an integer,
                or if neither ``sheet`` nor ``sheet_id`` is defined.
        """
        # The base initialiser has to be invoked explicitly
        # (original note: without this STEP.__init__ is not called).
        super().__init__(config)
        self.gsheets_client = gspread.service_account(filename=self.service_account)

        # TODO: config should be responsible for conversions
        try:
            self.header = int(self.header)
        except (TypeError, ValueError, OverflowError):
            # Only conversion failures are tolerated here; the value is left
            # untouched so the assertion below reports it with a clear message.
            pass
        assert isinstance(self.header, int), f"header ({self.header}) value must be an integer not {type(self.header)}"
        assert self.sheet is not None or self.sheet_id is not None, "You need to define either a 'sheet' name or a 'sheet_id' in your orchestration file when using gsheets."

    # TODO merge this into gsheets processors manifest
    @staticmethod
    def configs() -> dict:
        """Return the configuration schema (defaults and help texts) for this module."""
        return {
            "sheet": {"default": None, "help": "name of the sheet to archive"},
            "sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"},
            "header": {"default": 1, "help": "index of the header row (starts at 1)"},
            "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
            "columns": {
                # maps internal column keys to the column titles used in the sheet
                "default": {
                    'url': 'link',
                    'status': 'archive status',
                    'folder': 'destination folder',
                    'archive': 'archive location',
                    'date': 'archive date',
                    'thumbnail': 'thumbnail',
                    'timestamp': 'upload timestamp',
                    'title': 'upload title',
                    'text': 'text content',
                    'screenshot': 'screenshot',
                    'hash': 'hash',
                    'pdq_hash': 'perceptual hashes',
                    'wacz': 'wacz',
                    'replaywebpage': 'replaywebpage',
                },
                "help": "names of columns in the google sheet (stringified JSON object)",
                # CLI overrides are merged on top of the current mapping rather than replacing it
                "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
            },
        }

    def open_sheet(self):
        """Open the configured spreadsheet, by name when ``sheet`` is set, otherwise by ``sheet_id``."""
        if self.sheet:
            return self.gsheets_client.open(self.sheet)
        # no (truthy) sheet name configured: fall back to the spreadsheet key
        return self.gsheets_client.open_by_key(self.sheet_id)
|
||||
Reference in New Issue
Block a user