From e89a8da3b49670e380a2b01d03eba48090b575e0 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 10 Mar 2025 11:30:15 +0000 Subject: [PATCH 1/5] Unit tests for storage types + fix storage too long issues for local storage --- src/auto_archiver/core/media.py | 4 +- src/auto_archiver/core/storage.py | 34 ++++++-- .../modules/local_storage/local_storage.py | 22 +++++ .../modules/s3_storage/s3_storage.py | 4 + tests/storages/test_local_storage.py | 9 ++- tests/storages/test_storage_base.py | 80 ++++++++++++++++++- 6 files changed, 142 insertions(+), 11 deletions(-) diff --git a/src/auto_archiver/core/media.py b/src/auto_archiver/core/media.py index b6820ab..0370c00 100644 --- a/src/auto_archiver/core/media.py +++ b/src/auto_archiver/core/media.py @@ -6,7 +6,7 @@ nested media retrieval, and type validation. from __future__ import annotations import os import traceback -from typing import Any, List +from typing import Any, List, Iterator from dataclasses import dataclass, field from dataclasses_json import dataclass_json, config import mimetypes @@ -47,7 +47,7 @@ class Media: for any_media in self.all_inner_media(include_self=True): s.store(any_media, url, metadata=metadata) - def all_inner_media(self, include_self=False): + def all_inner_media(self, include_self=False) -> Iterator[Media]: """Retrieves all media, including nested media within properties or transformations on original media. This function returns a generator for all the inner media. diff --git a/src/auto_archiver/core/storage.py b/src/auto_archiver/core/storage.py index 1535eab..2a14073 100644 --- a/src/auto_archiver/core/storage.py +++ b/src/auto_archiver/core/storage.py @@ -27,6 +27,7 @@ class Storage(BaseModule): if media.is_stored(in_storage=self): logger.debug(f"{media.key} already stored, skipping") return + self.set_key(media, url, metadata) self.upload(media, metadata=metadata) media.add_url(self.get_cdn_url(media)) @@ -50,34 +51,55 @@ class Storage(BaseModule): with open(media.filename, 'rb') as f: return self.uploadf(f, media, **kwargs) - def set_key(self, media: Media, url, metadata: Metadata) -> None: + def set_key(self, media: Media, url: str, metadata: Metadata) -> None: """takes the media and optionally item info and generates a key""" if media.key is not None and len(media.key) > 0: return folder = metadata.get_context('folder', '') filename, ext = os.path.splitext(media.filename) # Handle path_generator logic - path_generator = self.config.get("path_generator", "url") + path_generator = self.path_generator if path_generator == "flat": path = "" + # TODO: this is never used filename = slugify(filename) # Ensure filename is slugified elif path_generator == "url": path = slugify(url) elif path_generator == "random": - path = self.config.get("random_path", random_str(24), True) + path = random_str(24) else: raise ValueError(f"Invalid path_generator: {path_generator}") # Handle filename_generator logic - filename_generator = self.config.get("filename_generator", "random") + filename_generator = self.filename_generator if filename_generator == "random": filename = random_str(24) elif filename_generator == "static": # load the hash_enricher module - he = self.module_factory.get_module(HashEnricher, self.config) + he = self.module_factory.get_module("hash_enricher", self.config) hd = he.calculate_hash(media.filename) filename = hd[:24] else: raise ValueError(f"Invalid filename_generator: {filename_generator}") - media.key = os.path.join(folder, path, f"{filename}{ext}") + key = os.path.join(folder, path, f"{filename}{ext}") + if len(key) > self.max_file_length(): + # truncate the path + max_path_length = self.max_file_length() - len(filename) - len(ext) - len(folder) - 1 + path = path[:max_path_length] + logger.warning(f'Filename too long ({len(key)} characters), truncating to {self.max_file_length()} characters') + key = os.path.join(folder, path, f"{filename}{ext}") + + + media.key = key + + + def max_file_length(self) -> int: + """ + Returns the maximum length of a file name that can be stored in the storage service. + + Files are truncated if they exceed this length. + Override this method in subclasses if the storage service has a different maximum file length. + """ + return 255 # safe max file length for most filesystems (macOS, Windows, Linux) + diff --git a/src/auto_archiver/modules/local_storage/local_storage.py b/src/auto_archiver/modules/local_storage/local_storage.py index b995577..c07f682 100644 --- a/src/auto_archiver/modules/local_storage/local_storage.py +++ b/src/auto_archiver/modules/local_storage/local_storage.py @@ -10,6 +10,8 @@ from auto_archiver.core import Storage class LocalStorage(Storage): + MAX_FILE_LENGTH = 255 + def get_cdn_url(self, media: Media) -> str: # TODO: is this viable with Storage.configs on path/filename? dest = os.path.join(self.save_to, media.key) @@ -20,11 +22,31 @@ class LocalStorage(Storage): def upload(self, media: Media, **kwargs) -> bool: # override parent so that we can use shutil.copy2 and keep metadata dest = os.path.join(self.save_to, media.key) + + if len(dest) > self.max_file_length(): + old_dest_length = len(dest) + filename, ext = os.path.splitext(media.key) + dir, filename = os.path.split(filename) + # see whether we should truncate filename or dir + if len(dir) > len(filename): + dir = dir[:self.MAX_FILE_LENGTH - len(self.save_to) - len(ext) - len(filename) - 1] + else: + filename = filename[:self.MAX_FILE_LENGTH - len(self.save_to) - len(ext) - len(filename) - 1] + + # override media.key + media.key = os.path.join(dir, f"{filename}{ext}") + dest = os.path.join(self.save_to, dir, f"{filename}{ext}") + logger.warning(f'Filename too long ({old_dest_length} characters), truncating to {len(dest)} characters') + os.makedirs(os.path.dirname(dest), exist_ok=True) logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}') + res = shutil.copy2(media.filename, dest) logger.info(res) return True # must be implemented even if unused def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass + + def max_file_length(self): + return self.MAX_FILE_LENGTH \ No newline at end of file diff --git a/src/auto_archiver/modules/s3_storage/s3_storage.py b/src/auto_archiver/modules/s3_storage/s3_storage.py index 6590ac9..c8569b6 100644 --- a/src/auto_archiver/modules/s3_storage/s3_storage.py +++ b/src/auto_archiver/modules/s3_storage/s3_storage.py @@ -67,4 +67,8 @@ class S3Storage(Storage): if 'Contents' in resp: return resp['Contents'][0]['Key'] return False + + def max_file_length(self): + # Amazon AWS max file length is 1024, but we will use 1000 to be safe + return 1000 diff --git a/tests/storages/test_local_storage.py b/tests/storages/test_local_storage.py index 7617867..7282e03 100644 --- a/tests/storages/test_local_storage.py +++ b/tests/storages/test_local_storage.py @@ -11,6 +11,7 @@ from auto_archiver.modules.local_storage import LocalStorage @pytest.fixture def local_storage(setup_module, tmp_path) -> LocalStorage: save_to = tmp_path / "local_archive" + save_to.mkdir() configs: dict = { "path_generator": "flat", "filename_generator": "static", @@ -19,7 +20,6 @@ def local_storage(setup_module, tmp_path) -> LocalStorage: } return setup_module("local_storage", configs) - @pytest.fixture def sample_media(tmp_path) -> Media: """Fixture creating a Media object with temporary source file""" @@ -27,6 +27,13 @@ def sample_media(tmp_path) -> Media: src_file.write_text("test content") return Media(key="subdir/test.txt", filename=str(src_file)) +def test_really_long_website_url_save(local_storage, tmp_path): + long_filename = os.path.join(local_storage.save_to, "file"*100 + ".txt") + src_file = tmp_path / "source.txt" + src_file.write_text("test content") + media = Media(key=long_filename, filename=str(src_file)) + assert local_storage.upload(media) is True + assert src_file.read_text() == Path(local_storage.get_cdn_url(media)).read_text() def test_get_cdn_url_relative(local_storage): media = Media(key="test.txt", filename="dummy.txt") diff --git a/tests/storages/test_storage_base.py b/tests/storages/test_storage_base.py index 7578acd..fd07c32 100644 --- a/tests/storages/test_storage_base.py +++ b/tests/storages/test_storage_base.py @@ -2,9 +2,9 @@ from typing import Type import pytest -from auto_archiver.core.metadata import Metadata +from auto_archiver.core.metadata import Metadata, Media from auto_archiver.core.storage import Storage - +from auto_archiver.core.module import ModuleFactory class TestStorageBase(object): @@ -20,3 +20,79 @@ class TestStorageBase(object): self.storage: Type[Storage] = setup_module( self.module_name, self.config ) + + +class TestBaseStorage(Storage): + + name = "test_storage" + + def get_cdn_url(self, media): + return "cdn_url" + + def uploadf(self, file, key, **kwargs): + return True + +@pytest.fixture +def storage_base(): + def _storage_base(config): + storage_base = TestBaseStorage() + storage_base.config_setup({TestBaseStorage.name : config}) + storage_base.module_factory = ModuleFactory() + return storage_base + + return _storage_base + +@pytest.mark.parametrize( + "path_generator, filename_generator, url, expected_key", + [ + ("flat", "static", "https://example.com/file/", "folder/6ae8a75555209fd6c44157c0.txt"), + ("flat", "random", "https://example.com/file/", "folder/pretend-random.txt"), + ("url", "static", "https://example.com/file/", "folder/https-example-com-file/6ae8a75555209fd6c44157c0.txt"), + ("url", "random", "https://example.com/file/", "folder/https-example-com-file/pretend-random.txt"), + ("random", "static", "https://example.com/file/", "folder/pretend-random/6ae8a75555209fd6c44157c0.txt"), + ("random", "random", "https://example.com/file/", "folder/pretend-random/pretend-random.txt"), + + ], +) +def test_storage_setup(storage_base, path_generator, filename_generator, url, expected_key, mocker): + mock_random = mocker.patch("auto_archiver.core.storage.random_str") + mock_random.return_value = "pretend-random" + + # create dummy.txt file + with open("dummy.txt", "w") as f: + f.write("test content") + + config: dict = { + "path_generator": path_generator, + "filename_generator": filename_generator, + } + storage: Storage = storage_base(config) + assert storage.path_generator == path_generator + assert storage.filename_generator == filename_generator + + metadata = Metadata() + metadata.set_context("folder", "folder") + media = Media(filename="dummy.txt") + storage.set_key(media, url, metadata) + print(media.key) + assert media.key == expected_key + + +def test_really_long_name(storage_base): + config: dict = { + "path_generator": "url", + "filename_generator": "static", + } + storage: Storage = storage_base(config) + + # create dummy.txt file + with open("dummy.txt", "w") as f: + f.write("test content") + + url = f"https://example.com/{'file'*100}" + media = Media(filename="dummy.txt") + storage.set_key(media, url, Metadata()) + assert len(media.key) <= storage.max_file_length() + assert media.key == "https-example-com-filefilefilefilefilefilefilefilefilefilefilefile\ +filefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefile\ +filefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefile/6ae8a75555209fd6c44157c0.txt" \ No newline at end of file From 770f4c8a3de0a3bd973dd099113bffb7b74a8cfc Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 10 Mar 2025 16:39:31 +0000 Subject: [PATCH 2/5] Refactoring of storage code: 1. Fix some bugs in local_storage 2. Refactor unit tests to not set Media.key explicitly (unless it's well-known beforehand, which it isn't) 3. Limit length of URL for 'url' type path_generator 4. Throw an error if 'save_to' of local storage is too long 5. A few other tidyups --- src/auto_archiver/core/config.py | 1 - src/auto_archiver/core/consts.py | 2 + src/auto_archiver/core/extractor.py | 3 +- src/auto_archiver/core/media.py | 9 ++-- src/auto_archiver/core/orchestrator.py | 4 +- src/auto_archiver/core/storage.py | 42 ++++++++---------- .../modules/cli_feeder/cli_feeder.py | 1 - .../modules/local_storage/local_storage.py | 43 ++++++++----------- .../modules/s3_storage/s3_storage.py | 13 ++---- tests/enrichers/test_pdq_hash_enricher.py | 4 +- tests/storages/test_S3_storage.py | 8 ++-- tests/storages/test_atlos_storage.py | 2 +- tests/storages/test_gdrive_storage.py | 2 +- tests/storages/test_local_storage.py | 29 ++++++------- tests/storages/test_storage_base.py | 7 +-- 15 files changed, 74 insertions(+), 96 deletions(-) diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 66d2ffb..0282d41 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -14,7 +14,6 @@ from loguru import logger from copy import deepcopy from auto_archiver.core.consts import MODULE_TYPES -from typing import Any, List, Type, Tuple _yaml: YAML = YAML() diff --git a/src/auto_archiver/core/consts.py b/src/auto_archiver/core/consts.py index a49884f..597ce4e 100644 --- a/src/auto_archiver/core/consts.py +++ b/src/auto_archiver/core/consts.py @@ -1,3 +1,5 @@ +class SetupError(ValueError): + pass MODULE_TYPES = [ 'feeder', diff --git a/src/auto_archiver/core/extractor.py b/src/auto_archiver/core/extractor.py index 484a09d..f84be98 100644 --- a/src/auto_archiver/core/extractor.py +++ b/src/auto_archiver/core/extractor.py @@ -81,7 +81,8 @@ class Extractor(BaseModule): if len(to_filename) > 64: to_filename = to_filename[-64:] to_filename = os.path.join(self.tmp_dir, to_filename) - if verbose: logger.debug(f"downloading {url[0:50]=} {to_filename=}") + if verbose: + logger.debug(f"downloading {url[0:50]=} {to_filename=}") headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' } diff --git a/src/auto_archiver/core/media.py b/src/auto_archiver/core/media.py index 0370c00..ecaef19 100644 --- a/src/auto_archiver/core/media.py +++ b/src/auto_archiver/core/media.py @@ -21,14 +21,13 @@ class Media: Represents a media file with associated properties and storage details. Attributes: - - filename: The file path of the media. - - key: An optional identifier for the media. + - filename: The file path of the media as saved locally (temporarily, before uploading to the storage). - urls: A list of URLs where the media is stored or accessible. - properties: Additional metadata or transformations for the media. - _mimetype: The media's mimetype (e.g., image/jpeg, video/mp4). """ filename: str - key: str = None + _key: str = None urls: List[str] = field(default_factory=list) properties: dict = field(default_factory=dict) _mimetype: str = None # eg: image/jpeg @@ -67,6 +66,10 @@ class Media: # checks if the media is already stored in the given storage return len(self.urls) > 0 and len(self.urls) == len(in_storage.config["steps"]["storages"]) + @property + def key(self) -> str: + return self._key + def set(self, key: str, value: Any) -> Media: self.properties[key] = value return self diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index b78c7e7..6a95046 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -23,15 +23,13 @@ from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, is_vali DefaultValidatingParser, UniqueAppendAction, AuthenticationJsonParseAction, DEFAULT_CONFIG_FILE from .module import ModuleFactory, LazyBaseModule from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher -from .consts import MODULE_TYPES +from .consts import MODULE_TYPES, SetupError from auto_archiver.utils.url import check_url_or_raise if TYPE_CHECKING: from .base_module import BaseModule from .module import LazyBaseModule -class SetupError(ValueError): - pass class ArchivingOrchestrator: # instance variables diff --git a/src/auto_archiver/core/storage.py b/src/auto_archiver/core/storage.py index 2a14073..4867146 100644 --- a/src/auto_archiver/core/storage.py +++ b/src/auto_archiver/core/storage.py @@ -6,6 +6,7 @@ from __future__ import annotations from abc import abstractmethod from typing import IO import os +import platform from loguru import logger from slugify import slugify @@ -43,17 +44,30 @@ class Storage(BaseModule): def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: """ Uploads (or saves) a file to the storage service/location. + + This method should not be called directly, but instead through the 'store' method, + which sets up the media for storage. """ pass def upload(self, media: Media, **kwargs) -> bool: + """ + Uploads a media object to the storage service. + + This method should not be called directly, but instead be called through the 'store' method, + which sets up the media for storage. + """ logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}') with open(media.filename, 'rb') as f: return self.uploadf(f, media, **kwargs) def set_key(self, media: Media, url: str, metadata: Metadata) -> None: """takes the media and optionally item info and generates a key""" - if media.key is not None and len(media.key) > 0: return + + if media.key is not None and len(media.key) > 0: + # media key is already set + return + folder = metadata.get_context('folder', '') filename, ext = os.path.splitext(media.filename) @@ -61,10 +75,8 @@ class Storage(BaseModule): path_generator = self.path_generator if path_generator == "flat": path = "" - # TODO: this is never used - filename = slugify(filename) # Ensure filename is slugified elif path_generator == "url": - path = slugify(url) + path = slugify(url)[:70] elif path_generator == "random": path = random_str(24) else: @@ -81,25 +93,7 @@ class Storage(BaseModule): filename = hd[:24] else: raise ValueError(f"Invalid filename_generator: {filename_generator}") - + key = os.path.join(folder, path, f"{filename}{ext}") - if len(key) > self.max_file_length(): - # truncate the path - max_path_length = self.max_file_length() - len(filename) - len(ext) - len(folder) - 1 - path = path[:max_path_length] - logger.warning(f'Filename too long ({len(key)} characters), truncating to {self.max_file_length()} characters') - key = os.path.join(folder, path, f"{filename}{ext}") - - - media.key = key - - - def max_file_length(self) -> int: - """ - Returns the maximum length of a file name that can be stored in the storage service. - - Files are truncated if they exceed this length. - Override this method in subclasses if the storage service has a different maximum file length. - """ - return 255 # safe max file length for most filesystems (macOS, Windows, Linux) + media._key = key \ No newline at end of file diff --git a/src/auto_archiver/modules/cli_feeder/cli_feeder.py b/src/auto_archiver/modules/cli_feeder/cli_feeder.py index 20ca6ae..fac584d 100644 --- a/src/auto_archiver/modules/cli_feeder/cli_feeder.py +++ b/src/auto_archiver/modules/cli_feeder/cli_feeder.py @@ -15,7 +15,6 @@ class CLIFeeder(Feeder): for url in urls: logger.debug(f"Processing {url}") m = Metadata().set_url(url) - m.set_context("folder", "cli") yield m logger.success(f"Processed {len(urls)} URL(s)") \ No newline at end of file diff --git a/src/auto_archiver/modules/local_storage/local_storage.py b/src/auto_archiver/modules/local_storage/local_storage.py index c07f682..6f1e217 100644 --- a/src/auto_archiver/modules/local_storage/local_storage.py +++ b/src/auto_archiver/modules/local_storage/local_storage.py @@ -6,37 +6,34 @@ from loguru import logger from auto_archiver.core import Media from auto_archiver.core import Storage - +from auto_archiver.core.consts import SetupError class LocalStorage(Storage): - MAX_FILE_LENGTH = 255 + + def setup(self) -> None: + if len(self.save_to) > 200: + raise SetupError(f"Your save_to path is long, this will cause issues saving files on your computer. Please use a shorter path") def get_cdn_url(self, media: Media) -> str: - # TODO: is this viable with Storage.configs on path/filename? - dest = os.path.join(self.save_to, media.key) + dest = media.key + if self.save_absolute: dest = os.path.abspath(dest) return dest + def set_key(self, media, url, metadata): + # clarify we want to save the file to the save_to folder + + old_folder = metadata.get('folder', '') + metadata.set_context('folder', os.path.join(self.save_to, metadata.get('folder', ''))) + super().set_key(media, url, metadata) + # don't impact other storages that might want a different 'folder' set + metadata.set_context('folder', old_folder) + def upload(self, media: Media, **kwargs) -> bool: # override parent so that we can use shutil.copy2 and keep metadata - dest = os.path.join(self.save_to, media.key) - - if len(dest) > self.max_file_length(): - old_dest_length = len(dest) - filename, ext = os.path.splitext(media.key) - dir, filename = os.path.split(filename) - # see whether we should truncate filename or dir - if len(dir) > len(filename): - dir = dir[:self.MAX_FILE_LENGTH - len(self.save_to) - len(ext) - len(filename) - 1] - else: - filename = filename[:self.MAX_FILE_LENGTH - len(self.save_to) - len(ext) - len(filename) - 1] - - # override media.key - media.key = os.path.join(dir, f"{filename}{ext}") - dest = os.path.join(self.save_to, dir, f"{filename}{ext}") - logger.warning(f'Filename too long ({old_dest_length} characters), truncating to {len(dest)} characters') + dest = media.key os.makedirs(os.path.dirname(dest), exist_ok=True) logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}') @@ -46,7 +43,5 @@ class LocalStorage(Storage): return True # must be implemented even if unused - def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass - - def max_file_length(self): - return self.MAX_FILE_LENGTH \ No newline at end of file + def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: + pass \ No newline at end of file diff --git a/src/auto_archiver/modules/s3_storage/s3_storage.py b/src/auto_archiver/modules/s3_storage/s3_storage.py index c8569b6..183a944 100644 --- a/src/auto_archiver/modules/s3_storage/s3_storage.py +++ b/src/auto_archiver/modules/s3_storage/s3_storage.py @@ -42,7 +42,7 @@ class S3Storage(Storage): logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}") self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args) return True - + def is_upload_needed(self, media: Media) -> bool: if self.random_no_duplicate: # checks if a folder with the hash already exists, if so it skips the upload @@ -50,13 +50,13 @@ class S3Storage(Storage): path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24]) if existing_key:=self.file_in_folder(path): - media.key = existing_key + media._key = existing_key media.set("previously archived", True) logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}") return False _, ext = os.path.splitext(media.key) - media.key = os.path.join(path, f"{random_str(24)}{ext}") + media._key = os.path.join(path, f"{random_str(24)}{ext}") return True def file_in_folder(self, path:str) -> str: @@ -66,9 +66,4 @@ class S3Storage(Storage): resp = self.s3.list_objects(Bucket=self.bucket, Prefix=path, Delimiter='/', MaxKeys=1) if 'Contents' in resp: return resp['Contents'][0]['Key'] - return False - - def max_file_length(self): - # Amazon AWS max file length is 1024, but we will use 1000 to be safe - return 1000 - + return False \ No newline at end of file diff --git a/tests/enrichers/test_pdq_hash_enricher.py b/tests/enrichers/test_pdq_hash_enricher.py index a8470fb..61fa778 100644 --- a/tests/enrichers/test_pdq_hash_enricher.py +++ b/tests/enrichers/test_pdq_hash_enricher.py @@ -14,8 +14,8 @@ def enricher(setup_module): def metadata_with_images(): m = Metadata() m.set_url("https://example.com") - m.add_media(Media(filename="image1.jpg", key="image1")) - m.add_media(Media(filename="image2.jpg", key="image2")) + m.add_media(Media(filename="image1.jpg", _key="image1")) + m.add_media(Media(filename="image2.jpg", _key="image2")) return m diff --git a/tests/storages/test_S3_storage.py b/tests/storages/test_S3_storage.py index fe60329..abf9763 100644 --- a/tests/storages/test_S3_storage.py +++ b/tests/storages/test_S3_storage.py @@ -37,10 +37,10 @@ class TestS3Storage: def test_get_cdn_url_generation(self): """Test CDN URL formatting """ media = Media("test.txt") - media.key = "path/to/file.txt" + media._key = "path/to/file.txt" url = self.storage.get_cdn_url(media) assert url == "https://cdn.example.com/path/to/file.txt" - media.key = "another/path.jpg" + media._key = "another/path.jpg" assert self.storage.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg" def test_uploadf_sets_acl_public(self, mocker): @@ -72,7 +72,7 @@ class TestS3Storage: self.storage.random_no_duplicate = True mock_file_in_folder = mocker.patch.object(S3Storage, 'file_in_folder', return_value="existing_folder/existing_file.txt") media = Media("test.txt") - media.key = "original_path.txt" + media._key = "original_path.txt" mock_calculate_hash = mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value="beepboop123beepboop123beepboop123") assert self.storage.is_upload_needed(media) is False assert media.key == "existing_folder/existing_file.txt" @@ -84,7 +84,7 @@ class TestS3Storage: def test_uploads_with_correct_parameters(self, mocker): media = Media("test.txt") - media.key = "original_key.txt" + media._key = "original_key.txt" mocker.patch.object(S3Storage, 'is_upload_needed', return_value=True) media.mimetype = 'image/png' mock_file = mocker.MagicMock() diff --git a/tests/storages/test_atlos_storage.py b/tests/storages/test_atlos_storage.py index bcd8f18..d33ff2f 100644 --- a/tests/storages/test_atlos_storage.py +++ b/tests/storages/test_atlos_storage.py @@ -44,7 +44,7 @@ def media(tmp_path) -> Media: file_path.write_bytes(content) media = Media(filename=str(file_path)) media.properties = {"something": "Title"} - media.key = "key" + media._key = "key" return media diff --git a/tests/storages/test_gdrive_storage.py b/tests/storages/test_gdrive_storage.py index f5ff87c..d87f5e8 100644 --- a/tests/storages/test_gdrive_storage.py +++ b/tests/storages/test_gdrive_storage.py @@ -50,7 +50,7 @@ def test_get_id_from_parent_and_name(gdrive_storage, mocker): def test_path_parts(): media = Media(filename="test.jpg") - media.key = "folder1/folder2/test.jpg" + media._key = "folder1/folder2/test.jpg" diff --git a/tests/storages/test_local_storage.py b/tests/storages/test_local_storage.py index 7282e03..1fd6774 100644 --- a/tests/storages/test_local_storage.py +++ b/tests/storages/test_local_storage.py @@ -4,9 +4,9 @@ from pathlib import Path import pytest -from auto_archiver.core import Media +from auto_archiver.core import Media, Metadata from auto_archiver.modules.local_storage import LocalStorage - +from auto_archiver.core.consts import SetupError @pytest.fixture def local_storage(setup_module, tmp_path) -> LocalStorage: @@ -25,37 +25,32 @@ def sample_media(tmp_path) -> Media: """Fixture creating a Media object with temporary source file""" src_file = tmp_path / "source.txt" src_file.write_text("test content") - return Media(key="subdir/test.txt", filename=str(src_file)) + return Media(filename=str(src_file)) -def test_really_long_website_url_save(local_storage, tmp_path): - long_filename = os.path.join(local_storage.save_to, "file"*100 + ".txt") - src_file = tmp_path / "source.txt" - src_file.write_text("test content") - media = Media(key=long_filename, filename=str(src_file)) - assert local_storage.upload(media) is True - assert src_file.read_text() == Path(local_storage.get_cdn_url(media)).read_text() +def test_too_long_save_path(setup_module): + with pytest.raises(SetupError): + setup_module("local_storage", {"save_to": "long"*100}) def test_get_cdn_url_relative(local_storage): - media = Media(key="test.txt", filename="dummy.txt") + media = Media(filename="dummy.txt") + local_storage.set_key(media, "https://example.com", Metadata()) expected = os.path.join(local_storage.save_to, media.key) assert local_storage.get_cdn_url(media) == expected - - def test_get_cdn_url_absolute(local_storage): - media = Media(key="test.txt", filename="dummy.txt") + media = Media(filename="dummy.txt") local_storage.save_absolute = True + local_storage.set_key(media, "https://example.com", Metadata()) expected = os.path.abspath(os.path.join(local_storage.save_to, media.key)) assert local_storage.get_cdn_url(media) == expected def test_upload_file_contents_and_metadata(local_storage, sample_media): + local_storage.store(sample_media, "https://example.com", Metadata()) dest = os.path.join(local_storage.save_to, sample_media.key) - assert local_storage.upload(sample_media) is True assert Path(sample_media.filename).read_text() == Path(dest).read_text() - def test_upload_nonexistent_source(local_storage): - media = Media(key="missing.txt", filename="nonexistent.txt") + media = Media(_key="missing.txt", filename="nonexistent.txt") with pytest.raises(FileNotFoundError): local_storage.upload(media) diff --git a/tests/storages/test_storage_base.py b/tests/storages/test_storage_base.py index fd07c32..30a91b9 100644 --- a/tests/storages/test_storage_base.py +++ b/tests/storages/test_storage_base.py @@ -54,7 +54,7 @@ def storage_base(): ], ) -def test_storage_setup(storage_base, path_generator, filename_generator, url, expected_key, mocker): +def test_storage_name_generation(storage_base, path_generator, filename_generator, url, expected_key, mocker): mock_random = mocker.patch("auto_archiver.core.storage.random_str") mock_random.return_value = "pretend-random" @@ -92,7 +92,4 @@ def test_really_long_name(storage_base): url = f"https://example.com/{'file'*100}" media = Media(filename="dummy.txt") storage.set_key(media, url, Metadata()) - assert len(media.key) <= storage.max_file_length() - assert media.key == "https-example-com-filefilefilefilefilefilefilefilefilefilefilefile\ -filefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefile\ -filefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefile/6ae8a75555209fd6c44157c0.txt" \ No newline at end of file + assert media.key == f"https-example-com-{'file'*13}/6ae8a75555209fd6c44157c0.txt" \ No newline at end of file From a9c34772896e1971e74005cfb9bb95b7ef39dd92 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 10 Mar 2025 16:43:14 +0000 Subject: [PATCH 3/5] Improve docs on the path_generator and filename_generator config options --- src/auto_archiver/core/storage.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/auto_archiver/core/storage.py b/src/auto_archiver/core/storage.py index 4867146..63ccf8d 100644 --- a/src/auto_archiver/core/storage.py +++ b/src/auto_archiver/core/storage.py @@ -1,5 +1,22 @@ """ Base module for Storage modules – modular components that store media objects in various locations. + +If you are looking to implement a new storage module, you should subclass the `Storage` class and +implement the `get_cdn_url` and `uploadf` methods. + +Your module **must** also have two config variables 'path_generator' and 'filename_generator' which +determine how the key is generated for the media object. The 'path_generator' and 'filename_generator' +variables can be set to one of the following values: +- 'flat': A flat structure with no subfolders +- 'url': A structure based on the URL of the media object +- 'random': A random structure + +The 'filename_generator' variable can be set to one of the following values: +- 'random': A random string +- 'static': A replicable strategy such as a hash + +If you don't want to use this naming convention, you can override the `set_key` method in your subclass. + """ from __future__ import annotations From 2b91dc95146a860e55f9b9003d0969527431f473 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 10 Mar 2025 16:51:16 +0000 Subject: [PATCH 4/5] Fix up unit tests --- tests/storages/test_local_storage.py | 3 +++ tests/storages/test_storage_base.py | 24 ++++++++++++------------ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/tests/storages/test_local_storage.py b/tests/storages/test_local_storage.py index 1fd6774..c3581df 100644 --- a/tests/storages/test_local_storage.py +++ b/tests/storages/test_local_storage.py @@ -32,12 +32,15 @@ def test_too_long_save_path(setup_module): setup_module("local_storage", {"save_to": "long"*100}) def test_get_cdn_url_relative(local_storage): + local_storage.filename_generator = "random" media = Media(filename="dummy.txt") local_storage.set_key(media, "https://example.com", Metadata()) expected = os.path.join(local_storage.save_to, media.key) assert local_storage.get_cdn_url(media) == expected def test_get_cdn_url_absolute(local_storage): + local_storage.filename_generator = "random" + media = Media(filename="dummy.txt") local_storage.save_absolute = True local_storage.set_key(media, "https://example.com", Metadata()) diff --git a/tests/storages/test_storage_base.py b/tests/storages/test_storage_base.py index 30a91b9..53dfbd7 100644 --- a/tests/storages/test_storage_base.py +++ b/tests/storages/test_storage_base.py @@ -32,6 +32,13 @@ class TestBaseStorage(Storage): def uploadf(self, file, key, **kwargs): return True +@pytest.fixture +def dummy_file(tmp_path): + # create dummy.txt file + dummy_file = tmp_path / "dummy.txt" + dummy_file.write_text("test content") + return str(dummy_file) + @pytest.fixture def storage_base(): def _storage_base(config): @@ -54,14 +61,11 @@ def storage_base(): ], ) -def test_storage_name_generation(storage_base, path_generator, filename_generator, url, expected_key, mocker): +def test_storage_name_generation(storage_base, path_generator, filename_generator, url, + expected_key, mocker, tmp_path, dummy_file): mock_random = mocker.patch("auto_archiver.core.storage.random_str") mock_random.return_value = "pretend-random" - # create dummy.txt file - with open("dummy.txt", "w") as f: - f.write("test content") - config: dict = { "path_generator": path_generator, "filename_generator": filename_generator, @@ -72,24 +76,20 @@ def test_storage_name_generation(storage_base, path_generator, filename_generato metadata = Metadata() metadata.set_context("folder", "folder") - media = Media(filename="dummy.txt") + media = Media(filename=dummy_file) storage.set_key(media, url, metadata) print(media.key) assert media.key == expected_key -def test_really_long_name(storage_base): +def test_really_long_name(storage_base, dummy_file): config: dict = { "path_generator": "url", "filename_generator": "static", } storage: Storage = storage_base(config) - # create dummy.txt file - with open("dummy.txt", "w") as f: - f.write("test content") - url = f"https://example.com/{'file'*100}" - media = Media(filename="dummy.txt") + media = Media(filename=dummy_file) storage.set_key(media, url, Metadata()) assert media.key == f"https-example-com-{'file'*13}/6ae8a75555209fd6c44157c0.txt" \ No newline at end of file From 3fcec57492b9383535b1bc5960ba9e3d40d1e030 Mon Sep 17 00:00:00 2001 From: Miguel Sozinho Ramalho <19508417+msramalho@users.noreply.github.com> Date: Mon, 10 Mar 2025 17:17:59 +0000 Subject: [PATCH 5/5] minor string fix --- src/auto_archiver/modules/local_storage/local_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/auto_archiver/modules/local_storage/local_storage.py b/src/auto_archiver/modules/local_storage/local_storage.py index 6f1e217..2b1a101 100644 --- a/src/auto_archiver/modules/local_storage/local_storage.py +++ b/src/auto_archiver/modules/local_storage/local_storage.py @@ -13,7 +13,7 @@ class LocalStorage(Storage): def setup(self) -> None: if len(self.save_to) > 200: - raise SetupError(f"Your save_to path is long, this will cause issues saving files on your computer. Please use a shorter path") + raise SetupError(f"Your save_to path is too long, this will cause issues saving files on your computer. Please use a shorter path.") def get_cdn_url(self, media: Media) -> str: dest = media.key