decouples s3_storage from hash_enricher

This commit is contained in:
msramalho
2025-02-10 15:48:54 +00:00
parent 8fb3dc754b
commit 15abf686b1
4 changed files with 18 additions and 12 deletions

View File

@@ -63,7 +63,7 @@ class BaseModule(ABC):
def setup(self, config: dict):
authentication = config.get('authentication', {})
# extract out contatenated sites
# extract out concatenated sites
for key, val in copy(authentication).items():
if "," in key:
for site in key.split(","):

View File

@@ -12,6 +12,7 @@ from loguru import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata
from auto_archiver.utils.misc import calculate_file_hash
class HashEnricher(Enricher):
@@ -35,9 +36,4 @@ class HashEnricher(Enricher):
elif self.algorithm == "SHA3-512":
hash = hashlib.sha3_512()
else: return ""
with open(filename, "rb") as f:
while True:
buf = f.read(self.chunksize)
if not buf: break
hash.update(buf)
return hash.hexdigest()
return calculate_file_hash(filename, hash, self.chunksize)

View File

@@ -7,12 +7,11 @@ from loguru import logger
from auto_archiver.core import Media
from auto_archiver.core import Storage
from auto_archiver.modules.hash_enricher import HashEnricher
from auto_archiver.utils.misc import random_str
from auto_archiver.utils.misc import calculate_file_hash, random_str
NO_DUPLICATES_FOLDER = "no-dups/"
class S3Storage(Storage, HashEnricher):
class S3Storage(Storage):
def setup(self, config: dict) -> None:
super().setup(config)
@@ -42,14 +41,13 @@ class S3Storage(Storage, HashEnricher):
extra_args['ContentType'] = media.mimetype
except Exception as e:
logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
return True
def is_upload_needed(self, media: Media) -> bool:
if self.random_no_duplicate:
# checks if a folder with the hash already exists, if so it skips the upload
hd = self.calculate_hash(media.filename)
hd = calculate_file_hash(media.filename)
path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])
if existing_key:=self.file_in_folder(path):

View File

@@ -5,6 +5,7 @@ import json
import uuid
from datetime import datetime
import requests
import hashlib
from loguru import logger
@@ -54,9 +55,20 @@ def update_nested_dict(dictionary, update_dict):
else:
dictionary[key] = value
def random_str(length: int = 32) -> str:
    """Return a random hexadecimal string cut to ``length`` characters.

    The characters come from a freshly generated UUID4 rendered without
    dashes, which is 32 hex digits long; the assertion below therefore
    rejects any ``length`` greater than 32.
    """
    assert length <= 32, "length must be less than 32 as UUID4 is used"
    # ``.hex`` is the 32-character dash-free form of the UUID.
    hex_digits = uuid.uuid4().hex
    return hex_digits[:length]
def json_loader(cli_val):
    """Decode ``cli_val`` with ``json.loads`` and return the resulting object."""
    decoded = json.loads(cli_val)
    return decoded
def calculate_file_hash(filename: str, hash_algo=None, chunksize: int = 16000000) -> str:
    """Return the hexadecimal digest of a file's contents, read in chunks.

    Args:
        filename: Path of the file to hash; it is opened in binary mode.
        hash_algo: A hashlib-style object exposing ``update()`` and
            ``hexdigest()``. It is updated in place. When omitted, a fresh
            ``hashlib.sha256()`` is created for this call.
        chunksize: Maximum number of bytes read from the file per iteration.

    Returns:
        The result of ``hexdigest()`` once the whole file has been fed in.
    """
    # A hash object placed in the signature default is built once, when the
    # function is defined, and then shared by every call that relies on the
    # default: each digest would also cover the bytes of all files hashed
    # before it. Create a new object per call instead.
    if hash_algo is None:
        hash_algo = hashlib.sha256()
    with open(filename, "rb") as f:
        while True:
            buf = f.read(chunksize)
            if not buf:
                break
            hash_algo.update(buf)
    return hash_algo.hexdigest()