Context-related fixes and some more tests.

This commit is contained in:
erinhmclark
2025-02-06 16:53:00 +00:00
parent 67504a683e
commit 266c7a14e6
9 changed files with 370 additions and 227 deletions

View File

@@ -44,14 +44,14 @@ class GsheetsFeeder(Feeder):
logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}')
gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
if len(missing_cols := self.missing_required_columns(gw)):
logger.warning(f"SKIPPED worksheet '{wks.title}' due to missing required column(s) for {missing_cols}")
logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}")
continue
# process and yield metadata here:
yield from self._process_rows(gw)
logger.success(f'Finished worksheet {worksheet.title}')
def _process_rows(self, gw: GWorksheet) -> Metadata:
def _process_rows(self, gw: GWorksheet):
for row in range(1 + self.header, gw.count_rows() + 1):
url = gw.get_cell(row, 'url').strip()
if not len(url): continue

View File

@@ -3,7 +3,7 @@
"type": ["storage"],
"requires_setup": True,
"dependencies": {
"python": ["boto3", "loguru"],
"python": ["hash_enricher", "boto3", "loguru"],
},
"configs": {
"path_generator": {
@@ -49,5 +49,6 @@
- Requires S3 credentials (API key and secret) and a bucket name to function.
- The `random_no_duplicate` option ensures no duplicate uploads by leveraging hash-based folder structures.
- Uses `boto3` for interaction with the S3 API.
- Depends on the `HashEnricher` module for hash calculation.
"""
}

View File

@@ -9,10 +9,11 @@ from auto_archiver.core import Media
from auto_archiver.core import Storage
from auto_archiver.modules.hash_enricher import HashEnricher
from auto_archiver.utils.misc import random_str
from auto_archiver.core.module import get_module
NO_DUPLICATES_FOLDER = "no-dups/"
class S3Storage(Storage, HashEnricher):
class S3Storage(Storage):
def setup(self, config: dict) -> None:
super().setup(config)
@@ -49,7 +50,8 @@ class S3Storage(Storage, HashEnricher):
def is_upload_needed(self, media: Media) -> bool:
if self.random_no_duplicate:
# checks if a folder with the hash already exists, if so it skips the upload
hd = self.calculate_hash(media.filename)
he = get_module('hash_enricher', self.config)
hd = he.calculate_hash(media.filename)
path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])
if existing_key:=self.file_in_folder(path):