From 15abf686b1315b3a35a628df12f687b9aec431d5 Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 10 Feb 2025 15:48:54 +0000 Subject: [PATCH] decouples s3_storage from hash_enricher --- src/auto_archiver/core/base_module.py | 2 +- .../modules/hash_enricher/hash_enricher.py | 8 ++------ src/auto_archiver/modules/s3_storage/s3_storage.py | 8 +++----- src/auto_archiver/utils/misc.py | 12 ++++++++++++ 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/auto_archiver/core/base_module.py b/src/auto_archiver/core/base_module.py index fcfe9ea..5c6ecbb 100644 --- a/src/auto_archiver/core/base_module.py +++ b/src/auto_archiver/core/base_module.py @@ -63,7 +63,7 @@ class BaseModule(ABC): def setup(self, config: dict): authentication = config.get('authentication', {}) - # extract out contatenated sites + # extract out concatenated sites for key, val in copy(authentication).items(): if "," in key: for site in key.split(","): diff --git a/src/auto_archiver/modules/hash_enricher/hash_enricher.py b/src/auto_archiver/modules/hash_enricher/hash_enricher.py index 58c6abe..b3ca8be 100644 --- a/src/auto_archiver/modules/hash_enricher/hash_enricher.py +++ b/src/auto_archiver/modules/hash_enricher/hash_enricher.py @@ -12,6 +12,7 @@ from loguru import logger from auto_archiver.core import Enricher from auto_archiver.core import Metadata +from auto_archiver.utils.misc import calculate_file_hash class HashEnricher(Enricher): @@ -35,9 +36,4 @@ class HashEnricher(Enricher): elif self.algorithm == "SHA3-512": hash = hashlib.sha3_512() else: return "" - with open(filename, "rb") as f: - while True: - buf = f.read(self.chunksize) - if not buf: break - hash.update(buf) - return hash.hexdigest() + return calculate_file_hash(filename, hash, self.chunksize) diff --git a/src/auto_archiver/modules/s3_storage/s3_storage.py b/src/auto_archiver/modules/s3_storage/s3_storage.py index f324d5c..2f85164 100644 --- 
def random_str(length: int = 32) -> str:
    """Return a random lowercase hexadecimal string of ``length`` characters.

    The characters are taken from the 32 hex digits of a freshly generated
    UUID4, which is why ``length`` may not exceed 32.

    Raises:
        AssertionError: if ``length`` is greater than 32.
    """
    assert length <= 32, "length must be less than 32 as UUID4 is used"
    # ``UUID.hex`` is the 32-digit form of the UUID without the dashes.
    return uuid.uuid4().hex[:length]


def json_loader(cli_val):
    """Parse ``cli_val`` as JSON and return the resulting Python object."""
    return json.loads(cli_val)


def calculate_file_hash(filename: str, hash_algo=None, chunksize: int = 16000000) -> str:
    """Return the hexadecimal digest of the contents of ``filename``.

    Args:
        filename: path of the file to hash; it is opened in binary mode.
        hash_algo: either a hashlib-style hash object (anything exposing
            ``update`` and ``hexdigest``), a zero-argument callable returning
            one (e.g. ``hashlib.sha256``), or ``None`` to use a new SHA-256.
            A hash object passed in is updated in place.
        chunksize: number of bytes read from the file per iteration, so the
            whole file never has to be held in memory at once.

    Returns:
        The hex digest string produced by the hash object.
    """
    # A default of ``hashlib.sha256()`` would be evaluated only once, when the
    # function is defined, and every call relying on it would keep feeding the
    # same object -- so later digests would also cover earlier files. Build a
    # fresh hasher per call instead.
    if hash_algo is None:
        hasher = hashlib.sha256()
    elif callable(hash_algo):
        hasher = hash_algo()
    else:
        hasher = hash_algo

    with open(filename, "rb") as f:
        # ``read`` returns b"" at end of file, which ends the loop.
        while chunk := f.read(chunksize):
            hasher.update(chunk)
    return hasher.hexdigest()