Context-related fixes, some more tests.

2026-06-12 05:08:28 +03:00 · 2025-02-06 16:53:00 +00:00
parent 67504a683e
commit 266c7a14e6
9 changed files with 370 additions and 227 deletions
--- a/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
+++ b/src/auto_archiver/modules/gsheet_feeder/gsheet_feeder.py
@@ -44,14 +44,14 @@ class GsheetsFeeder(Feeder):
            logger.info(f'Opening worksheet {ii=}: {worksheet.title=} header={self.header}')
            gw = GWorksheet(worksheet, header_row=self.header, columns=self.columns)
            if len(missing_cols := self.missing_required_columns(gw)):
-                logger.warning(f"SKIPPED worksheet '{wks.title}' due to missing required column(s) for {missing_cols}")
+                logger.warning(f"SKIPPED worksheet '{worksheet.title}' due to missing required column(s) for {missing_cols}")
                continue

            # process and yield metadata here:
            yield from self._process_rows(gw)
            logger.success(f'Finished worksheet {worksheet.title}')

-    def _process_rows(self, gw: GWorksheet) -> Metadata:
+    def _process_rows(self, gw: GWorksheet):
        for row in range(1 + self.header, gw.count_rows() + 1):
            url = gw.get_cell(row, 'url').strip()
            if not len(url): continue
--- a/src/auto_archiver/modules/s3_storage/manifest.py
+++ b/src/auto_archiver/modules/s3_storage/manifest.py
@@ -3,7 +3,7 @@
    "type": ["storage"],
    "requires_setup": True,
    "dependencies": {
-        "python": ["boto3", "loguru"],
+        "python": ["hash_enricher", "boto3", "loguru"],
    },
    "configs": {
        "path_generator": {
@@ -49,5 +49,6 @@
    - Requires S3 credentials (API key and secret) and a bucket name to function.
    - The `random_no_duplicate` option ensures no duplicate uploads by leveraging hash-based folder structures.
    - Uses `boto3` for interaction with the S3 API.
+    - Depends on the `HashEnricher` module for hash calculation.
    """
 }
--- a/src/auto_archiver/modules/s3_storage/s3_storage.py
+++ b/src/auto_archiver/modules/s3_storage/s3_storage.py
@@ -9,10 +9,11 @@ from auto_archiver.core import Media
 from auto_archiver.core import Storage
 from auto_archiver.modules.hash_enricher import HashEnricher
 from auto_archiver.utils.misc import random_str
+from auto_archiver.core.module import get_module

 NO_DUPLICATES_FOLDER = "no-dups/"

-class S3Storage(Storage, HashEnricher):
+class S3Storage(Storage):

    def setup(self, config: dict) -> None:
        super().setup(config)
@@ -49,7 +50,8 @@ class S3Storage(Storage, HashEnricher):
    def is_upload_needed(self, media: Media) -> bool:
        if self.random_no_duplicate:
            # checks if a folder with the hash already exists, if so it skips the upload
-            hd = self.calculate_hash(media.filename)
+            he = get_module('hash_enricher', self.config)
+            hd = he.calculate_hash(media.filename)
            path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])

            if existing_key:=self.file_in_folder(path):
--- a/src/auto_archiver/utils/gsheet.py
+++ b/src/auto_archiver/utils/gsheet.py
@@ -1,53 +0,0 @@
-import json, gspread
-
-from ..core import BaseModule
-
-
-class Gsheets(BaseModule):
-    name = "gsheets"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-        self.gsheets_client = gspread.service_account(filename=self.service_account)
-        # TODO: config should be responsible for conversions
-        try: self.header = int(self.header)
-        except: pass
-        assert type(self.header) == int, f"header ({self.header}) value must be an integer not {type(self.header)}"
-        assert self.sheet is not None or self.sheet_id is not None, "You need to define either a 'sheet' name or a 'sheet_id' in your orchestration file when using gsheets."
-
-    # TODO merge this into gsheets processors manifest
-    @staticmethod
-    def configs() -> dict:
-        return {
-            "sheet": {"default": None, "help": "name of the sheet to archive"},
-            "sheet_id": {"default": None, "help": "(alternative to sheet name) the id of the sheet to archive"},
-            "header": {"default": 1, "help": "index of the header row (starts at 1)"},
-            "service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path"},
-            "columns": {
-                "default": {
-                    'url': 'link',
-                    'status': 'archive status',
-                    'folder': 'destination folder',
-                    'archive': 'archive location',
-                    'date': 'archive date',
-                    'thumbnail': 'thumbnail',
-                    'timestamp': 'upload timestamp',
-                    'title': 'upload title',
-                    'text': 'text content',
-                    'screenshot': 'screenshot',
-                    'hash': 'hash',
-                    'pdq_hash': 'perceptual hashes',
-                    'wacz': 'wacz',
-                    'replaywebpage': 'replaywebpage',
-                },
-                "help": "names of columns in the google sheet (stringified JSON object)",
-                "cli_set": lambda cli_val, cur_val: dict(cur_val, **json.loads(cli_val))
-            },
-        }
-
-    def open_sheet(self):
-        if self.sheet:
-            return self.gsheets_client.open(self.sheet)
-        else:  # self.sheet_id
-            return self.gsheets_client.open_by_key(self.sheet_id)