diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 66d2ffb..0282d41 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -14,7 +14,6 @@ from loguru import logger from copy import deepcopy from auto_archiver.core.consts import MODULE_TYPES -from typing import Any, List, Type, Tuple _yaml: YAML = YAML() diff --git a/src/auto_archiver/core/consts.py b/src/auto_archiver/core/consts.py index a49884f..597ce4e 100644 --- a/src/auto_archiver/core/consts.py +++ b/src/auto_archiver/core/consts.py @@ -1,3 +1,5 @@ +class SetupError(ValueError): + pass MODULE_TYPES = [ 'feeder', diff --git a/src/auto_archiver/core/extractor.py b/src/auto_archiver/core/extractor.py index 484a09d..f84be98 100644 --- a/src/auto_archiver/core/extractor.py +++ b/src/auto_archiver/core/extractor.py @@ -81,7 +81,8 @@ class Extractor(BaseModule): if len(to_filename) > 64: to_filename = to_filename[-64:] to_filename = os.path.join(self.tmp_dir, to_filename) - if verbose: logger.debug(f"downloading {url[0:50]=} {to_filename=}") + if verbose: + logger.debug(f"downloading {url[0:50]=} {to_filename=}") headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' } diff --git a/src/auto_archiver/core/media.py b/src/auto_archiver/core/media.py index b6820ab..ecaef19 100644 --- a/src/auto_archiver/core/media.py +++ b/src/auto_archiver/core/media.py @@ -6,7 +6,7 @@ nested media retrieval, and type validation. from __future__ import annotations import os import traceback -from typing import Any, List +from typing import Any, List, Iterator from dataclasses import dataclass, field from dataclasses_json import dataclass_json, config import mimetypes @@ -21,14 +21,13 @@ class Media: Represents a media file with associated properties and storage details. Attributes: - - filename: The file path of the media. - - key: An optional identifier for the media. + - filename: The file path of the media as saved locally (temporarily, before uploading to the storage). - urls: A list of URLs where the media is stored or accessible. - properties: Additional metadata or transformations for the media. - _mimetype: The media's mimetype (e.g., image/jpeg, video/mp4). """ filename: str - key: str = None + _key: str = None urls: List[str] = field(default_factory=list) properties: dict = field(default_factory=dict) _mimetype: str = None # eg: image/jpeg @@ -47,7 +46,7 @@ class Media: for any_media in self.all_inner_media(include_self=True): s.store(any_media, url, metadata=metadata) - def all_inner_media(self, include_self=False): + def all_inner_media(self, include_self=False) -> Iterator[Media]: """Retrieves all media, including nested media within properties or transformations on original media. This function returns a generator for all the inner media. @@ -67,6 +66,10 @@ class Media: # checks if the media is already stored in the given storage return len(self.urls) > 0 and len(self.urls) == len(in_storage.config["steps"]["storages"]) + @property + def key(self) -> str: + return self._key + def set(self, key: str, value: Any) -> Media: self.properties[key] = value return self diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index b78c7e7..6a95046 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -23,15 +23,13 @@ from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, is_vali DefaultValidatingParser, UniqueAppendAction, AuthenticationJsonParseAction, DEFAULT_CONFIG_FILE from .module import ModuleFactory, LazyBaseModule from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher -from .consts import MODULE_TYPES +from .consts import MODULE_TYPES, SetupError from auto_archiver.utils.url import check_url_or_raise if TYPE_CHECKING: from .base_module import BaseModule from .module import LazyBaseModule -class SetupError(ValueError): - pass class ArchivingOrchestrator: # instance variables diff --git a/src/auto_archiver/core/storage.py b/src/auto_archiver/core/storage.py index 1535eab..63ccf8d 100644 --- a/src/auto_archiver/core/storage.py +++ b/src/auto_archiver/core/storage.py @@ -1,11 +1,29 @@ """ Base module for Storage modules – modular components that store media objects in various locations. + +If you are looking to implement a new storage module, you should subclass the `Storage` class and +implement the `get_cdn_url` and `uploadf` methods. + +Your module **must** also have two config variables 'path_generator' and 'filename_generator' which +determine how the key is generated for the media object. The 'path_generator' and 'filename_generator' +variables can be set to one of the following values: +- 'flat': A flat structure with no subfolders +- 'url': A structure based on the URL of the media object +- 'random': A random structure + +The 'filename_generator' variable can be set to one of the following values: +- 'random': A random string +- 'static': A replicable strategy such as a hash + +If you don't want to use this naming convention, you can override the `set_key` method in your subclass. + """ from __future__ import annotations from abc import abstractmethod from typing import IO import os +import platform from loguru import logger from slugify import slugify @@ -27,6 +45,7 @@ class Storage(BaseModule): if media.is_stored(in_storage=self): logger.debug(f"{media.key} already stored, skipping") return + self.set_key(media, url, metadata) self.upload(media, metadata=metadata) media.add_url(self.get_cdn_url(media)) @@ -42,42 +61,56 @@ class Storage(BaseModule): def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: """ Uploads (or saves) a file to the storage service/location. + + This method should not be called directly, but instead through the 'store' method, + which sets up the media for storage. """ pass def upload(self, media: Media, **kwargs) -> bool: + """ + Uploads a media object to the storage service. + + This method should not be called directly, but instead be called through the 'store' method, + which sets up the media for storage. + """ logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}') with open(media.filename, 'rb') as f: return self.uploadf(f, media, **kwargs) - def set_key(self, media: Media, url, metadata: Metadata) -> None: + def set_key(self, media: Media, url: str, metadata: Metadata) -> None: """takes the media and optionally item info and generates a key""" - if media.key is not None and len(media.key) > 0: return + + if media.key is not None and len(media.key) > 0: + # media key is already set + return + folder = metadata.get_context('folder', '') filename, ext = os.path.splitext(media.filename) # Handle path_generator logic - path_generator = self.config.get("path_generator", "url") + path_generator = self.path_generator if path_generator == "flat": path = "" - filename = slugify(filename) # Ensure filename is slugified elif path_generator == "url": - path = slugify(url) + path = slugify(url)[:70] elif path_generator == "random": - path = self.config.get("random_path", random_str(24), True) + path = random_str(24) else: raise ValueError(f"Invalid path_generator: {path_generator}") # Handle filename_generator logic - filename_generator = self.config.get("filename_generator", "random") + filename_generator = self.filename_generator if filename_generator == "random": filename = random_str(24) elif filename_generator == "static": # load the hash_enricher module - he = self.module_factory.get_module(HashEnricher, self.config) + he = self.module_factory.get_module("hash_enricher", self.config) hd = he.calculate_hash(media.filename) filename = hd[:24] else: raise ValueError(f"Invalid filename_generator: {filename_generator}") + + key = os.path.join(folder, path, f"{filename}{ext}") - media.key = os.path.join(folder, path, f"{filename}{ext}") + media._key = key \ No newline at end of file diff --git a/src/auto_archiver/modules/cli_feeder/cli_feeder.py b/src/auto_archiver/modules/cli_feeder/cli_feeder.py index 20ca6ae..fac584d 100644 --- a/src/auto_archiver/modules/cli_feeder/cli_feeder.py +++ b/src/auto_archiver/modules/cli_feeder/cli_feeder.py @@ -15,7 +15,6 @@ class CLIFeeder(Feeder): for url in urls: logger.debug(f"Processing {url}") m = Metadata().set_url(url) - m.set_context("folder", "cli") yield m logger.success(f"Processed {len(urls)} URL(s)") \ No newline at end of file diff --git a/src/auto_archiver/modules/local_storage/local_storage.py b/src/auto_archiver/modules/local_storage/local_storage.py index b995577..2b1a101 100644 --- a/src/auto_archiver/modules/local_storage/local_storage.py +++ b/src/auto_archiver/modules/local_storage/local_storage.py @@ -6,25 +6,42 @@ from loguru import logger from auto_archiver.core import Media from auto_archiver.core import Storage - +from auto_archiver.core.consts import SetupError class LocalStorage(Storage): + + def setup(self) -> None: + if len(self.save_to) > 200: + raise SetupError(f"Your save_to path is too long, this will cause issues saving files on your computer. Please use a shorter path.") + def get_cdn_url(self, media: Media) -> str: - # TODO: is this viable with Storage.configs on path/filename? - dest = os.path.join(self.save_to, media.key) + dest = media.key + if self.save_absolute: dest = os.path.abspath(dest) return dest + def set_key(self, media, url, metadata): + # clarify we want to save the file to the save_to folder + + old_folder = metadata.get('folder', '') + metadata.set_context('folder', os.path.join(self.save_to, metadata.get('folder', ''))) + super().set_key(media, url, metadata) + # don't impact other storages that might want a different 'folder' set + metadata.set_context('folder', old_folder) + def upload(self, media: Media, **kwargs) -> bool: # override parent so that we can use shutil.copy2 and keep metadata - dest = os.path.join(self.save_to, media.key) + dest = media.key + os.makedirs(os.path.dirname(dest), exist_ok=True) logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}') + res = shutil.copy2(media.filename, dest) logger.info(res) return True # must be implemented even if unused - def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass + def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: + pass \ No newline at end of file diff --git a/src/auto_archiver/modules/s3_storage/s3_storage.py b/src/auto_archiver/modules/s3_storage/s3_storage.py index 6590ac9..183a944 100644 --- a/src/auto_archiver/modules/s3_storage/s3_storage.py +++ b/src/auto_archiver/modules/s3_storage/s3_storage.py @@ -42,7 +42,7 @@ class S3Storage(Storage): logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}") self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args) return True - + def is_upload_needed(self, media: Media) -> bool: if self.random_no_duplicate: # checks if a folder with the hash already exists, if so it skips the upload @@ -50,13 +50,13 @@ class S3Storage(Storage): path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24]) if existing_key:=self.file_in_folder(path): - media.key = existing_key + media._key = existing_key media.set("previously archived", True) logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}") return False _, ext = os.path.splitext(media.key) - media.key = os.path.join(path, f"{random_str(24)}{ext}") + media._key = os.path.join(path, f"{random_str(24)}{ext}") return True def file_in_folder(self, path:str) -> str: @@ -66,5 +66,4 @@ class S3Storage(Storage): resp = self.s3.list_objects(Bucket=self.bucket, Prefix=path, Delimiter='/', MaxKeys=1) if 'Contents' in resp: return resp['Contents'][0]['Key'] - return False - + return False \ No newline at end of file diff --git a/tests/enrichers/test_pdq_hash_enricher.py b/tests/enrichers/test_pdq_hash_enricher.py index a8470fb..61fa778 100644 --- a/tests/enrichers/test_pdq_hash_enricher.py +++ b/tests/enrichers/test_pdq_hash_enricher.py @@ -14,8 +14,8 @@ def enricher(setup_module): def metadata_with_images(): m = Metadata() m.set_url("https://example.com") - m.add_media(Media(filename="image1.jpg", key="image1")) - m.add_media(Media(filename="image2.jpg", key="image2")) + m.add_media(Media(filename="image1.jpg", _key="image1")) + m.add_media(Media(filename="image2.jpg", _key="image2")) return m diff --git a/tests/storages/test_S3_storage.py b/tests/storages/test_S3_storage.py index fe60329..abf9763 100644 --- a/tests/storages/test_S3_storage.py +++ b/tests/storages/test_S3_storage.py @@ -37,10 +37,10 @@ class TestS3Storage: def test_get_cdn_url_generation(self): """Test CDN URL formatting """ media = Media("test.txt") - media.key = "path/to/file.txt" + media._key = "path/to/file.txt" url = self.storage.get_cdn_url(media) assert url == "https://cdn.example.com/path/to/file.txt" - media.key = "another/path.jpg" + media._key = "another/path.jpg" assert self.storage.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg" def test_uploadf_sets_acl_public(self, mocker): @@ -72,7 +72,7 @@ class TestS3Storage: self.storage.random_no_duplicate = True mock_file_in_folder = mocker.patch.object(S3Storage, 'file_in_folder', return_value="existing_folder/existing_file.txt") media = Media("test.txt") - media.key = "original_path.txt" + media._key = "original_path.txt" mock_calculate_hash = mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value="beepboop123beepboop123beepboop123") assert self.storage.is_upload_needed(media) is False assert media.key == "existing_folder/existing_file.txt" @@ -84,7 +84,7 @@ class TestS3Storage: def test_uploads_with_correct_parameters(self, mocker): media = Media("test.txt") - media.key = "original_key.txt" + media._key = "original_key.txt" mocker.patch.object(S3Storage, 'is_upload_needed', return_value=True) media.mimetype = 'image/png' mock_file = mocker.MagicMock() diff --git a/tests/storages/test_atlos_storage.py b/tests/storages/test_atlos_storage.py index bcd8f18..d33ff2f 100644 --- a/tests/storages/test_atlos_storage.py +++ b/tests/storages/test_atlos_storage.py @@ -44,7 +44,7 @@ def media(tmp_path) -> Media: file_path.write_bytes(content) media = Media(filename=str(file_path)) media.properties = {"something": "Title"} - media.key = "key" + media._key = "key" return media diff --git a/tests/storages/test_gdrive_storage.py b/tests/storages/test_gdrive_storage.py index f5ff87c..d87f5e8 100644 --- a/tests/storages/test_gdrive_storage.py +++ b/tests/storages/test_gdrive_storage.py @@ -50,7 +50,7 @@ def test_get_id_from_parent_and_name(gdrive_storage, mocker): def test_path_parts(): media = Media(filename="test.jpg") - media.key = "folder1/folder2/test.jpg" + media._key = "folder1/folder2/test.jpg" diff --git a/tests/storages/test_local_storage.py b/tests/storages/test_local_storage.py index 7617867..c3581df 100644 --- a/tests/storages/test_local_storage.py +++ b/tests/storages/test_local_storage.py @@ -4,13 +4,14 @@ from pathlib import Path import pytest -from auto_archiver.core import Media +from auto_archiver.core import Media, Metadata from auto_archiver.modules.local_storage import LocalStorage - +from auto_archiver.core.consts import SetupError @pytest.fixture def local_storage(setup_module, tmp_path) -> LocalStorage: save_to = tmp_path / "local_archive" + save_to.mkdir() configs: dict = { "path_generator": "flat", "filename_generator": "static", @@ -19,36 +20,40 @@ def local_storage(setup_module, tmp_path) -> LocalStorage: } return setup_module("local_storage", configs) - @pytest.fixture def sample_media(tmp_path) -> Media: """Fixture creating a Media object with temporary source file""" src_file = tmp_path / "source.txt" src_file.write_text("test content") - return Media(key="subdir/test.txt", filename=str(src_file)) + return Media(filename=str(src_file)) +def test_too_long_save_path(setup_module): + with pytest.raises(SetupError): + setup_module("local_storage", {"save_to": "long"*100}) def test_get_cdn_url_relative(local_storage): - media = Media(key="test.txt", filename="dummy.txt") + local_storage.filename_generator = "random" + media = Media(filename="dummy.txt") + local_storage.set_key(media, "https://example.com", Metadata()) expected = os.path.join(local_storage.save_to, media.key) assert local_storage.get_cdn_url(media) == expected - - def test_get_cdn_url_absolute(local_storage): - media = Media(key="test.txt", filename="dummy.txt") + local_storage.filename_generator = "random" + + media = Media(filename="dummy.txt") local_storage.save_absolute = True + local_storage.set_key(media, "https://example.com", Metadata()) expected = os.path.abspath(os.path.join(local_storage.save_to, media.key)) assert local_storage.get_cdn_url(media) == expected def test_upload_file_contents_and_metadata(local_storage, sample_media): + local_storage.store(sample_media, "https://example.com", Metadata()) dest = os.path.join(local_storage.save_to, sample_media.key) - assert local_storage.upload(sample_media) is True assert Path(sample_media.filename).read_text() == Path(dest).read_text() - def test_upload_nonexistent_source(local_storage): - media = Media(key="missing.txt", filename="nonexistent.txt") + media = Media(_key="missing.txt", filename="nonexistent.txt") with pytest.raises(FileNotFoundError): local_storage.upload(media) diff --git a/tests/storages/test_storage_base.py b/tests/storages/test_storage_base.py index 7578acd..53dfbd7 100644 --- a/tests/storages/test_storage_base.py +++ b/tests/storages/test_storage_base.py @@ -2,9 +2,9 @@ from typing import Type import pytest -from auto_archiver.core.metadata import Metadata +from auto_archiver.core.metadata import Metadata, Media from auto_archiver.core.storage import Storage - +from auto_archiver.core.module import ModuleFactory class TestStorageBase(object): @@ -20,3 +20,76 @@ class TestStorageBase(object): self.storage: Type[Storage] = setup_module( self.module_name, self.config ) + + +class TestBaseStorage(Storage): + + name = "test_storage" + + def get_cdn_url(self, media): + return "cdn_url" + + def uploadf(self, file, key, **kwargs): + return True + +@pytest.fixture +def dummy_file(tmp_path): + # create dummy.txt file + dummy_file = tmp_path / "dummy.txt" + dummy_file.write_text("test content") + return str(dummy_file) + +@pytest.fixture +def storage_base(): + def _storage_base(config): + storage_base = TestBaseStorage() + storage_base.config_setup({TestBaseStorage.name : config}) + storage_base.module_factory = ModuleFactory() + return storage_base + + return _storage_base + +@pytest.mark.parametrize( + "path_generator, filename_generator, url, expected_key", + [ + ("flat", "static", "https://example.com/file/", "folder/6ae8a75555209fd6c44157c0.txt"), + ("flat", "random", "https://example.com/file/", "folder/pretend-random.txt"), + ("url", "static", "https://example.com/file/", "folder/https-example-com-file/6ae8a75555209fd6c44157c0.txt"), + ("url", "random", "https://example.com/file/", "folder/https-example-com-file/pretend-random.txt"), + ("random", "static", "https://example.com/file/", "folder/pretend-random/6ae8a75555209fd6c44157c0.txt"), + ("random", "random", "https://example.com/file/", "folder/pretend-random/pretend-random.txt"), + + ], +) +def test_storage_name_generation(storage_base, path_generator, filename_generator, url, + expected_key, mocker, tmp_path, dummy_file): + mock_random = mocker.patch("auto_archiver.core.storage.random_str") + mock_random.return_value = "pretend-random" + + config: dict = { + "path_generator": path_generator, + "filename_generator": filename_generator, + } + storage: Storage = storage_base(config) + assert storage.path_generator == path_generator + assert storage.filename_generator == filename_generator + + metadata = Metadata() + metadata.set_context("folder", "folder") + media = Media(filename=dummy_file) + storage.set_key(media, url, metadata) + print(media.key) + assert media.key == expected_key + + +def test_really_long_name(storage_base, dummy_file): + config: dict = { + "path_generator": "url", + "filename_generator": "static", + } + storage: Storage = storage_base(config) + + url = f"https://example.com/{'file'*100}" + media = Media(filename=dummy_file) + storage.set_key(media, url, Metadata()) + assert media.key == f"https-example-com-{'file'*13}/6ae8a75555209fd6c44157c0.txt" \ No newline at end of file