diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 66d2ffb..0282d41 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -14,7 +14,6 @@ from loguru import logger from copy import deepcopy from auto_archiver.core.consts import MODULE_TYPES -from typing import Any, List, Type, Tuple _yaml: YAML = YAML() diff --git a/src/auto_archiver/core/consts.py b/src/auto_archiver/core/consts.py index a49884f..597ce4e 100644 --- a/src/auto_archiver/core/consts.py +++ b/src/auto_archiver/core/consts.py @@ -1,3 +1,5 @@ +class SetupError(ValueError): + pass MODULE_TYPES = [ 'feeder', diff --git a/src/auto_archiver/core/extractor.py b/src/auto_archiver/core/extractor.py index 484a09d..f84be98 100644 --- a/src/auto_archiver/core/extractor.py +++ b/src/auto_archiver/core/extractor.py @@ -81,7 +81,8 @@ class Extractor(BaseModule): if len(to_filename) > 64: to_filename = to_filename[-64:] to_filename = os.path.join(self.tmp_dir, to_filename) - if verbose: logger.debug(f"downloading {url[0:50]=} {to_filename=}") + if verbose: + logger.debug(f"downloading {url[0:50]=} {to_filename=}") headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36' } diff --git a/src/auto_archiver/core/media.py b/src/auto_archiver/core/media.py index 0370c00..ecaef19 100644 --- a/src/auto_archiver/core/media.py +++ b/src/auto_archiver/core/media.py @@ -21,14 +21,13 @@ class Media: Represents a media file with associated properties and storage details. Attributes: - - filename: The file path of the media. - - key: An optional identifier for the media. + - filename: The file path of the media as saved locally (temporarily, before uploading to the storage). - urls: A list of URLs where the media is stored or accessible. - properties: Additional metadata or transformations for the media. - _mimetype: The media's mimetype (e.g., image/jpeg, video/mp4). """ filename: str - key: str = None + _key: str = None urls: List[str] = field(default_factory=list) properties: dict = field(default_factory=dict) _mimetype: str = None # eg: image/jpeg @@ -67,6 +66,10 @@ class Media: # checks if the media is already stored in the given storage return len(self.urls) > 0 and len(self.urls) == len(in_storage.config["steps"]["storages"]) + @property + def key(self) -> str: + return self._key + def set(self, key: str, value: Any) -> Media: self.properties[key] = value return self diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index b78c7e7..6a95046 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -23,15 +23,13 @@ from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, is_vali DefaultValidatingParser, UniqueAppendAction, AuthenticationJsonParseAction, DEFAULT_CONFIG_FILE from .module import ModuleFactory, LazyBaseModule from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher -from .consts import MODULE_TYPES +from .consts import MODULE_TYPES, SetupError from auto_archiver.utils.url import check_url_or_raise if TYPE_CHECKING: from .base_module import BaseModule from .module import LazyBaseModule -class SetupError(ValueError): - pass class ArchivingOrchestrator: # instance variables diff --git a/src/auto_archiver/core/storage.py b/src/auto_archiver/core/storage.py index 2a14073..4867146 100644 --- a/src/auto_archiver/core/storage.py +++ b/src/auto_archiver/core/storage.py @@ -6,6 +6,7 @@ from __future__ import annotations from abc import abstractmethod from typing import IO import os +import platform from loguru import logger from slugify import slugify @@ -43,17 +44,30 @@ class Storage(BaseModule): def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: """ Uploads (or saves) a file to the storage service/location. + + This method should not be called directly, but instead through the 'store' method, + which sets up the media for storage. """ pass def upload(self, media: Media, **kwargs) -> bool: + """ + Uploads a media object to the storage service. + + This method should not be called directly, but instead be called through the 'store' method, + which sets up the media for storage. + """ logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}') with open(media.filename, 'rb') as f: return self.uploadf(f, media, **kwargs) def set_key(self, media: Media, url: str, metadata: Metadata) -> None: """takes the media and optionally item info and generates a key""" - if media.key is not None and len(media.key) > 0: return + + if media.key is not None and len(media.key) > 0: + # media key is already set + return + folder = metadata.get_context('folder', '') filename, ext = os.path.splitext(media.filename) @@ -61,10 +75,8 @@ class Storage(BaseModule): path_generator = self.path_generator if path_generator == "flat": path = "" - # TODO: this is never used - filename = slugify(filename) # Ensure filename is slugified elif path_generator == "url": - path = slugify(url) + path = slugify(url)[:70] elif path_generator == "random": path = random_str(24) else: @@ -81,25 +93,7 @@ class Storage(BaseModule): filename = hd[:24] else: raise ValueError(f"Invalid filename_generator: {filename_generator}") - + key = os.path.join(folder, path, f"{filename}{ext}") - if len(key) > self.max_file_length(): - # truncate the path - max_path_length = self.max_file_length() - len(filename) - len(ext) - len(folder) - 1 - path = path[:max_path_length] - logger.warning(f'Filename too long ({len(key)} characters), truncating to {self.max_file_length()} characters') - key = os.path.join(folder, path, f"{filename}{ext}") - - - media.key = key - - - def max_file_length(self) -> int: - """ - Returns the maximum length of a file name that can be stored in the storage service. - - Files are truncated if they exceed this length. - Override this method in subclasses if the storage service has a different maximum file length. - """ - return 255 # safe max file length for most filesystems (macOS, Windows, Linux) + media._key = key \ No newline at end of file diff --git a/src/auto_archiver/modules/cli_feeder/cli_feeder.py b/src/auto_archiver/modules/cli_feeder/cli_feeder.py index 20ca6ae..fac584d 100644 --- a/src/auto_archiver/modules/cli_feeder/cli_feeder.py +++ b/src/auto_archiver/modules/cli_feeder/cli_feeder.py @@ -15,7 +15,6 @@ class CLIFeeder(Feeder): for url in urls: logger.debug(f"Processing {url}") m = Metadata().set_url(url) - m.set_context("folder", "cli") yield m logger.success(f"Processed {len(urls)} URL(s)") \ No newline at end of file diff --git a/src/auto_archiver/modules/local_storage/local_storage.py b/src/auto_archiver/modules/local_storage/local_storage.py index c07f682..6f1e217 100644 --- a/src/auto_archiver/modules/local_storage/local_storage.py +++ b/src/auto_archiver/modules/local_storage/local_storage.py @@ -6,37 +6,34 @@ from loguru import logger from auto_archiver.core import Media from auto_archiver.core import Storage - +from auto_archiver.core.consts import SetupError class LocalStorage(Storage): - MAX_FILE_LENGTH = 255 + + def setup(self) -> None: + if len(self.save_to) > 200: + raise SetupError(f"Your save_to path is long, this will cause issues saving files on your computer. Please use a shorter path") def get_cdn_url(self, media: Media) -> str: - # TODO: is this viable with Storage.configs on path/filename? - dest = os.path.join(self.save_to, media.key) + dest = media.key + if self.save_absolute: dest = os.path.abspath(dest) return dest + def set_key(self, media, url, metadata): + # clarify we want to save the file to the save_to folder + + old_folder = metadata.get('folder', '') + metadata.set_context('folder', os.path.join(self.save_to, metadata.get('folder', ''))) + super().set_key(media, url, metadata) + # don't impact other storages that might want a different 'folder' set + metadata.set_context('folder', old_folder) + def upload(self, media: Media, **kwargs) -> bool: # override parent so that we can use shutil.copy2 and keep metadata - dest = os.path.join(self.save_to, media.key) - - if len(dest) > self.max_file_length(): - old_dest_length = len(dest) - filename, ext = os.path.splitext(media.key) - dir, filename = os.path.split(filename) - # see whether we should truncate filename or dir - if len(dir) > len(filename): - dir = dir[:self.MAX_FILE_LENGTH - len(self.save_to) - len(ext) - len(filename) - 1] - else: - filename = filename[:self.MAX_FILE_LENGTH - len(self.save_to) - len(ext) - len(filename) - 1] - - # override media.key - media.key = os.path.join(dir, f"{filename}{ext}") - dest = os.path.join(self.save_to, dir, f"{filename}{ext}") - logger.warning(f'Filename too long ({old_dest_length} characters), truncating to {len(dest)} characters') + dest = media.key os.makedirs(os.path.dirname(dest), exist_ok=True) logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}') @@ -46,7 +43,5 @@ class LocalStorage(Storage): return True # must be implemented even if unused - def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass - - def max_file_length(self): - return self.MAX_FILE_LENGTH \ No newline at end of file + def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: + pass \ No newline at end of file diff --git a/src/auto_archiver/modules/s3_storage/s3_storage.py b/src/auto_archiver/modules/s3_storage/s3_storage.py index c8569b6..183a944 100644 --- a/src/auto_archiver/modules/s3_storage/s3_storage.py +++ b/src/auto_archiver/modules/s3_storage/s3_storage.py @@ -42,7 +42,7 @@ class S3Storage(Storage): logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}") self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args) return True - + def is_upload_needed(self, media: Media) -> bool: if self.random_no_duplicate: # checks if a folder with the hash already exists, if so it skips the upload @@ -50,13 +50,13 @@ class S3Storage(Storage): path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24]) if existing_key:=self.file_in_folder(path): - media.key = existing_key + media._key = existing_key media.set("previously archived", True) logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}") return False _, ext = os.path.splitext(media.key) - media.key = os.path.join(path, f"{random_str(24)}{ext}") + media._key = os.path.join(path, f"{random_str(24)}{ext}") return True def file_in_folder(self, path:str) -> str: @@ -66,9 +66,4 @@ class S3Storage(Storage): resp = self.s3.list_objects(Bucket=self.bucket, Prefix=path, Delimiter='/', MaxKeys=1) if 'Contents' in resp: return resp['Contents'][0]['Key'] - return False - - def max_file_length(self): - # Amazon AWS max file length is 1024, but we will use 1000 to be safe - return 1000 - + return False \ No newline at end of file diff --git a/tests/enrichers/test_pdq_hash_enricher.py b/tests/enrichers/test_pdq_hash_enricher.py index a8470fb..61fa778 100644 --- a/tests/enrichers/test_pdq_hash_enricher.py +++ b/tests/enrichers/test_pdq_hash_enricher.py @@ -14,8 +14,8 @@ def enricher(setup_module): def metadata_with_images(): m = Metadata() m.set_url("https://example.com") - m.add_media(Media(filename="image1.jpg", key="image1")) - m.add_media(Media(filename="image2.jpg", key="image2")) + m.add_media(Media(filename="image1.jpg", _key="image1")) + m.add_media(Media(filename="image2.jpg", _key="image2")) return m diff --git a/tests/storages/test_S3_storage.py b/tests/storages/test_S3_storage.py index fe60329..abf9763 100644 --- a/tests/storages/test_S3_storage.py +++ b/tests/storages/test_S3_storage.py @@ -37,10 +37,10 @@ class TestS3Storage: def test_get_cdn_url_generation(self): """Test CDN URL formatting """ media = Media("test.txt") - media.key = "path/to/file.txt" + media._key = "path/to/file.txt" url = self.storage.get_cdn_url(media) assert url == "https://cdn.example.com/path/to/file.txt" - media.key = "another/path.jpg" + media._key = "another/path.jpg" assert self.storage.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg" def test_uploadf_sets_acl_public(self, mocker): @@ -72,7 +72,7 @@ class TestS3Storage: self.storage.random_no_duplicate = True mock_file_in_folder = mocker.patch.object(S3Storage, 'file_in_folder', return_value="existing_folder/existing_file.txt") media = Media("test.txt") - media.key = "original_path.txt" + media._key = "original_path.txt" mock_calculate_hash = mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value="beepboop123beepboop123beepboop123") assert self.storage.is_upload_needed(media) is False assert media.key == "existing_folder/existing_file.txt" @@ -84,7 +84,7 @@ class TestS3Storage: def test_uploads_with_correct_parameters(self, mocker): media = Media("test.txt") - media.key = "original_key.txt" + media._key = "original_key.txt" mocker.patch.object(S3Storage, 'is_upload_needed', return_value=True) media.mimetype = 'image/png' mock_file = mocker.MagicMock() diff --git a/tests/storages/test_atlos_storage.py b/tests/storages/test_atlos_storage.py index bcd8f18..d33ff2f 100644 --- a/tests/storages/test_atlos_storage.py +++ b/tests/storages/test_atlos_storage.py @@ -44,7 +44,7 @@ def media(tmp_path) -> Media: file_path.write_bytes(content) media = Media(filename=str(file_path)) media.properties = {"something": "Title"} - media.key = "key" + media._key = "key" return media diff --git a/tests/storages/test_gdrive_storage.py b/tests/storages/test_gdrive_storage.py index f5ff87c..d87f5e8 100644 --- a/tests/storages/test_gdrive_storage.py +++ b/tests/storages/test_gdrive_storage.py @@ -50,7 +50,7 @@ def test_get_id_from_parent_and_name(gdrive_storage, mocker): def test_path_parts(): media = Media(filename="test.jpg") - media.key = "folder1/folder2/test.jpg" + media._key = "folder1/folder2/test.jpg" diff --git a/tests/storages/test_local_storage.py b/tests/storages/test_local_storage.py index 7282e03..1fd6774 100644 --- a/tests/storages/test_local_storage.py +++ b/tests/storages/test_local_storage.py @@ -4,9 +4,9 @@ from pathlib import Path import pytest -from auto_archiver.core import Media +from auto_archiver.core import Media, Metadata from auto_archiver.modules.local_storage import LocalStorage - +from auto_archiver.core.consts import SetupError @pytest.fixture def local_storage(setup_module, tmp_path) -> LocalStorage: @@ -25,37 +25,32 @@ def sample_media(tmp_path) -> Media: """Fixture creating a Media object with temporary source file""" src_file = tmp_path / "source.txt" src_file.write_text("test content") - return Media(key="subdir/test.txt", filename=str(src_file)) + return Media(filename=str(src_file)) -def test_really_long_website_url_save(local_storage, tmp_path): - long_filename = os.path.join(local_storage.save_to, "file"*100 + ".txt") - src_file = tmp_path / "source.txt" - src_file.write_text("test content") - media = Media(key=long_filename, filename=str(src_file)) - assert local_storage.upload(media) is True - assert src_file.read_text() == Path(local_storage.get_cdn_url(media)).read_text() +def test_too_long_save_path(setup_module): + with pytest.raises(SetupError): + setup_module("local_storage", {"save_to": "long"*100}) def test_get_cdn_url_relative(local_storage): - media = Media(key="test.txt", filename="dummy.txt") + media = Media(filename="dummy.txt") + local_storage.set_key(media, "https://example.com", Metadata()) expected = os.path.join(local_storage.save_to, media.key) assert local_storage.get_cdn_url(media) == expected - - def test_get_cdn_url_absolute(local_storage): - media = Media(key="test.txt", filename="dummy.txt") + media = Media(filename="dummy.txt") local_storage.save_absolute = True + local_storage.set_key(media, "https://example.com", Metadata()) expected = os.path.abspath(os.path.join(local_storage.save_to, media.key)) assert local_storage.get_cdn_url(media) == expected def test_upload_file_contents_and_metadata(local_storage, sample_media): + local_storage.store(sample_media, "https://example.com", Metadata()) dest = os.path.join(local_storage.save_to, sample_media.key) - assert local_storage.upload(sample_media) is True assert Path(sample_media.filename).read_text() == Path(dest).read_text() - def test_upload_nonexistent_source(local_storage): - media = Media(key="missing.txt", filename="nonexistent.txt") + media = Media(_key="missing.txt", filename="nonexistent.txt") with pytest.raises(FileNotFoundError): local_storage.upload(media) diff --git a/tests/storages/test_storage_base.py b/tests/storages/test_storage_base.py index fd07c32..30a91b9 100644 --- a/tests/storages/test_storage_base.py +++ b/tests/storages/test_storage_base.py @@ -54,7 +54,7 @@ def storage_base(): ], ) -def test_storage_setup(storage_base, path_generator, filename_generator, url, expected_key, mocker): +def test_storage_name_generation(storage_base, path_generator, filename_generator, url, expected_key, mocker): mock_random = mocker.patch("auto_archiver.core.storage.random_str") mock_random.return_value = "pretend-random" @@ -92,7 +92,4 @@ def test_really_long_name(storage_base): url = f"https://example.com/{'file'*100}" media = Media(filename="dummy.txt") storage.set_key(media, url, Metadata()) - assert len(media.key) <= storage.max_file_length() - assert media.key == "https-example-com-filefilefilefilefilefilefilefilefilefilefilefile\ -filefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefile\ -filefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefile/6ae8a75555209fd6c44157c0.txt" \ No newline at end of file + assert media.key == f"https-example-com-{'file'*13}/6ae8a75555209fd6c44157c0.txt" \ No newline at end of file