mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-11 04:38:29 +03:00
Refactoring of storage code:
1. Fix some bugs in local_storage 2. Refactor unit tests to not set Media.key explicitly (unless it's well-known beforehand, which it isn't) 3. Limit length of URL for 'url' type path_generator 4. Throw an error if 'save_to' of local storage is too long 5. A few other tidyups
This commit is contained in:
@@ -14,7 +14,6 @@ from loguru import logger
|
||||
from copy import deepcopy
|
||||
from auto_archiver.core.consts import MODULE_TYPES
|
||||
|
||||
from typing import Any, List, Type, Tuple
|
||||
|
||||
_yaml: YAML = YAML()
|
||||
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
class SetupError(ValueError):
|
||||
pass
|
||||
|
||||
MODULE_TYPES = [
|
||||
'feeder',
|
||||
|
||||
@@ -81,7 +81,8 @@ class Extractor(BaseModule):
|
||||
if len(to_filename) > 64:
|
||||
to_filename = to_filename[-64:]
|
||||
to_filename = os.path.join(self.tmp_dir, to_filename)
|
||||
if verbose: logger.debug(f"downloading {url[0:50]=} {to_filename=}")
|
||||
if verbose:
|
||||
logger.debug(f"downloading {url[0:50]=} {to_filename=}")
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
|
||||
}
|
||||
|
||||
@@ -21,14 +21,13 @@ class Media:
|
||||
Represents a media file with associated properties and storage details.
|
||||
|
||||
Attributes:
|
||||
- filename: The file path of the media.
|
||||
- key: An optional identifier for the media.
|
||||
- filename: The file path of the media as saved locally (temporarily, before uploading to the storage).
|
||||
- urls: A list of URLs where the media is stored or accessible.
|
||||
- properties: Additional metadata or transformations for the media.
|
||||
- _mimetype: The media's mimetype (e.g., image/jpeg, video/mp4).
|
||||
"""
|
||||
filename: str
|
||||
key: str = None
|
||||
_key: str = None
|
||||
urls: List[str] = field(default_factory=list)
|
||||
properties: dict = field(default_factory=dict)
|
||||
_mimetype: str = None # eg: image/jpeg
|
||||
@@ -67,6 +66,10 @@ class Media:
|
||||
# checks if the media is already stored in the given storage
|
||||
return len(self.urls) > 0 and len(self.urls) == len(in_storage.config["steps"]["storages"])
|
||||
|
||||
@property
|
||||
def key(self) -> str:
|
||||
return self._key
|
||||
|
||||
def set(self, key: str, value: Any) -> Media:
|
||||
self.properties[key] = value
|
||||
return self
|
||||
|
||||
@@ -23,15 +23,13 @@ from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, is_vali
|
||||
DefaultValidatingParser, UniqueAppendAction, AuthenticationJsonParseAction, DEFAULT_CONFIG_FILE
|
||||
from .module import ModuleFactory, LazyBaseModule
|
||||
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
|
||||
from .consts import MODULE_TYPES
|
||||
from .consts import MODULE_TYPES, SetupError
|
||||
from auto_archiver.utils.url import check_url_or_raise
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .base_module import BaseModule
|
||||
from .module import LazyBaseModule
|
||||
|
||||
class SetupError(ValueError):
|
||||
pass
|
||||
class ArchivingOrchestrator:
|
||||
|
||||
# instance variables
|
||||
|
||||
@@ -6,6 +6,7 @@ from __future__ import annotations
|
||||
from abc import abstractmethod
|
||||
from typing import IO
|
||||
import os
|
||||
import platform
|
||||
|
||||
from loguru import logger
|
||||
from slugify import slugify
|
||||
@@ -43,17 +44,30 @@ class Storage(BaseModule):
|
||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
|
||||
"""
|
||||
Uploads (or saves) a file to the storage service/location.
|
||||
|
||||
This method should not be called directly, but instead through the 'store' method,
|
||||
which sets up the media for storage.
|
||||
"""
|
||||
pass
|
||||
|
||||
def upload(self, media: Media, **kwargs) -> bool:
|
||||
"""
|
||||
Uploads a media object to the storage service.
|
||||
|
||||
This method should not be called directly, but instead be called through the 'store' method,
|
||||
which sets up the media for storage.
|
||||
"""
|
||||
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key}')
|
||||
with open(media.filename, 'rb') as f:
|
||||
return self.uploadf(f, media, **kwargs)
|
||||
|
||||
def set_key(self, media: Media, url: str, metadata: Metadata) -> None:
|
||||
"""takes the media and optionally item info and generates a key"""
|
||||
if media.key is not None and len(media.key) > 0: return
|
||||
|
||||
if media.key is not None and len(media.key) > 0:
|
||||
# media key is already set
|
||||
return
|
||||
|
||||
folder = metadata.get_context('folder', '')
|
||||
filename, ext = os.path.splitext(media.filename)
|
||||
|
||||
@@ -61,10 +75,8 @@ class Storage(BaseModule):
|
||||
path_generator = self.path_generator
|
||||
if path_generator == "flat":
|
||||
path = ""
|
||||
# TODO: this is never used
|
||||
filename = slugify(filename) # Ensure filename is slugified
|
||||
elif path_generator == "url":
|
||||
path = slugify(url)
|
||||
path = slugify(url)[:70]
|
||||
elif path_generator == "random":
|
||||
path = random_str(24)
|
||||
else:
|
||||
@@ -81,25 +93,7 @@ class Storage(BaseModule):
|
||||
filename = hd[:24]
|
||||
else:
|
||||
raise ValueError(f"Invalid filename_generator: {filename_generator}")
|
||||
|
||||
|
||||
key = os.path.join(folder, path, f"{filename}{ext}")
|
||||
if len(key) > self.max_file_length():
|
||||
# truncate the path
|
||||
max_path_length = self.max_file_length() - len(filename) - len(ext) - len(folder) - 1
|
||||
path = path[:max_path_length]
|
||||
logger.warning(f'Filename too long ({len(key)} characters), truncating to {self.max_file_length()} characters')
|
||||
key = os.path.join(folder, path, f"{filename}{ext}")
|
||||
|
||||
|
||||
media.key = key
|
||||
|
||||
|
||||
def max_file_length(self) -> int:
|
||||
"""
|
||||
Returns the maximum length of a file name that can be stored in the storage service.
|
||||
|
||||
Files are truncated if they exceed this length.
|
||||
Override this method in subclasses if the storage service has a different maximum file length.
|
||||
"""
|
||||
return 255 # safe max file length for most filesystems (macOS, Windows, Linux)
|
||||
|
||||
media._key = key
|
||||
@@ -15,7 +15,6 @@ class CLIFeeder(Feeder):
|
||||
for url in urls:
|
||||
logger.debug(f"Processing {url}")
|
||||
m = Metadata().set_url(url)
|
||||
m.set_context("folder", "cli")
|
||||
yield m
|
||||
|
||||
logger.success(f"Processed {len(urls)} URL(s)")
|
||||
@@ -6,37 +6,34 @@ from loguru import logger
|
||||
|
||||
from auto_archiver.core import Media
|
||||
from auto_archiver.core import Storage
|
||||
|
||||
from auto_archiver.core.consts import SetupError
|
||||
|
||||
class LocalStorage(Storage):
|
||||
|
||||
MAX_FILE_LENGTH = 255
|
||||
|
||||
def setup(self) -> None:
|
||||
if len(self.save_to) > 200:
|
||||
raise SetupError(f"Your save_to path is long, this will cause issues saving files on your computer. Please use a shorter path")
|
||||
|
||||
def get_cdn_url(self, media: Media) -> str:
|
||||
# TODO: is this viable with Storage.configs on path/filename?
|
||||
dest = os.path.join(self.save_to, media.key)
|
||||
dest = media.key
|
||||
|
||||
if self.save_absolute:
|
||||
dest = os.path.abspath(dest)
|
||||
return dest
|
||||
|
||||
def set_key(self, media, url, metadata):
|
||||
# clarify we want to save the file to the save_to folder
|
||||
|
||||
old_folder = metadata.get('folder', '')
|
||||
metadata.set_context('folder', os.path.join(self.save_to, metadata.get('folder', '')))
|
||||
super().set_key(media, url, metadata)
|
||||
# don't impact other storages that might want a different 'folder' set
|
||||
metadata.set_context('folder', old_folder)
|
||||
|
||||
def upload(self, media: Media, **kwargs) -> bool:
|
||||
# override parent so that we can use shutil.copy2 and keep metadata
|
||||
dest = os.path.join(self.save_to, media.key)
|
||||
|
||||
if len(dest) > self.max_file_length():
|
||||
old_dest_length = len(dest)
|
||||
filename, ext = os.path.splitext(media.key)
|
||||
dir, filename = os.path.split(filename)
|
||||
# see whether we should truncate filename or dir
|
||||
if len(dir) > len(filename):
|
||||
dir = dir[:self.MAX_FILE_LENGTH - len(self.save_to) - len(ext) - len(filename) - 1]
|
||||
else:
|
||||
filename = filename[:self.MAX_FILE_LENGTH - len(self.save_to) - len(ext) - len(filename) - 1]
|
||||
|
||||
# override media.key
|
||||
media.key = os.path.join(dir, f"{filename}{ext}")
|
||||
dest = os.path.join(self.save_to, dir, f"{filename}{ext}")
|
||||
logger.warning(f'Filename too long ({old_dest_length} characters), truncating to {len(dest)} characters')
|
||||
dest = media.key
|
||||
|
||||
os.makedirs(os.path.dirname(dest), exist_ok=True)
|
||||
logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}')
|
||||
@@ -46,7 +43,5 @@ class LocalStorage(Storage):
|
||||
return True
|
||||
|
||||
# must be implemented even if unused
|
||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
|
||||
|
||||
def max_file_length(self):
|
||||
return self.MAX_FILE_LENGTH
|
||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
|
||||
pass
|
||||
@@ -42,7 +42,7 @@ class S3Storage(Storage):
|
||||
logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
|
||||
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
|
||||
return True
|
||||
|
||||
|
||||
def is_upload_needed(self, media: Media) -> bool:
|
||||
if self.random_no_duplicate:
|
||||
# checks if a folder with the hash already exists, if so it skips the upload
|
||||
@@ -50,13 +50,13 @@ class S3Storage(Storage):
|
||||
path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])
|
||||
|
||||
if existing_key:=self.file_in_folder(path):
|
||||
media.key = existing_key
|
||||
media._key = existing_key
|
||||
media.set("previously archived", True)
|
||||
logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}")
|
||||
return False
|
||||
|
||||
_, ext = os.path.splitext(media.key)
|
||||
media.key = os.path.join(path, f"{random_str(24)}{ext}")
|
||||
media._key = os.path.join(path, f"{random_str(24)}{ext}")
|
||||
return True
|
||||
|
||||
def file_in_folder(self, path:str) -> str:
|
||||
@@ -66,9 +66,4 @@ class S3Storage(Storage):
|
||||
resp = self.s3.list_objects(Bucket=self.bucket, Prefix=path, Delimiter='/', MaxKeys=1)
|
||||
if 'Contents' in resp:
|
||||
return resp['Contents'][0]['Key']
|
||||
return False
|
||||
|
||||
def max_file_length(self):
|
||||
# Amazon AWS max file length is 1024, but we will use 1000 to be safe
|
||||
return 1000
|
||||
|
||||
return False
|
||||
@@ -14,8 +14,8 @@ def enricher(setup_module):
|
||||
def metadata_with_images():
|
||||
m = Metadata()
|
||||
m.set_url("https://example.com")
|
||||
m.add_media(Media(filename="image1.jpg", key="image1"))
|
||||
m.add_media(Media(filename="image2.jpg", key="image2"))
|
||||
m.add_media(Media(filename="image1.jpg", _key="image1"))
|
||||
m.add_media(Media(filename="image2.jpg", _key="image2"))
|
||||
return m
|
||||
|
||||
|
||||
|
||||
@@ -37,10 +37,10 @@ class TestS3Storage:
|
||||
def test_get_cdn_url_generation(self):
|
||||
"""Test CDN URL formatting """
|
||||
media = Media("test.txt")
|
||||
media.key = "path/to/file.txt"
|
||||
media._key = "path/to/file.txt"
|
||||
url = self.storage.get_cdn_url(media)
|
||||
assert url == "https://cdn.example.com/path/to/file.txt"
|
||||
media.key = "another/path.jpg"
|
||||
media._key = "another/path.jpg"
|
||||
assert self.storage.get_cdn_url(media) == "https://cdn.example.com/another/path.jpg"
|
||||
|
||||
def test_uploadf_sets_acl_public(self, mocker):
|
||||
@@ -72,7 +72,7 @@ class TestS3Storage:
|
||||
self.storage.random_no_duplicate = True
|
||||
mock_file_in_folder = mocker.patch.object(S3Storage, 'file_in_folder', return_value="existing_folder/existing_file.txt")
|
||||
media = Media("test.txt")
|
||||
media.key = "original_path.txt"
|
||||
media._key = "original_path.txt"
|
||||
mock_calculate_hash = mocker.patch('auto_archiver.modules.s3_storage.s3_storage.calculate_file_hash', return_value="beepboop123beepboop123beepboop123")
|
||||
assert self.storage.is_upload_needed(media) is False
|
||||
assert media.key == "existing_folder/existing_file.txt"
|
||||
@@ -84,7 +84,7 @@ class TestS3Storage:
|
||||
|
||||
def test_uploads_with_correct_parameters(self, mocker):
|
||||
media = Media("test.txt")
|
||||
media.key = "original_key.txt"
|
||||
media._key = "original_key.txt"
|
||||
mocker.patch.object(S3Storage, 'is_upload_needed', return_value=True)
|
||||
media.mimetype = 'image/png'
|
||||
mock_file = mocker.MagicMock()
|
||||
|
||||
@@ -44,7 +44,7 @@ def media(tmp_path) -> Media:
|
||||
file_path.write_bytes(content)
|
||||
media = Media(filename=str(file_path))
|
||||
media.properties = {"something": "Title"}
|
||||
media.key = "key"
|
||||
media._key = "key"
|
||||
return media
|
||||
|
||||
|
||||
|
||||
@@ -50,7 +50,7 @@ def test_get_id_from_parent_and_name(gdrive_storage, mocker):
|
||||
|
||||
def test_path_parts():
|
||||
media = Media(filename="test.jpg")
|
||||
media.key = "folder1/folder2/test.jpg"
|
||||
media._key = "folder1/folder2/test.jpg"
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -4,9 +4,9 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from auto_archiver.core import Media
|
||||
from auto_archiver.core import Media, Metadata
|
||||
from auto_archiver.modules.local_storage import LocalStorage
|
||||
|
||||
from auto_archiver.core.consts import SetupError
|
||||
|
||||
@pytest.fixture
|
||||
def local_storage(setup_module, tmp_path) -> LocalStorage:
|
||||
@@ -25,37 +25,32 @@ def sample_media(tmp_path) -> Media:
|
||||
"""Fixture creating a Media object with temporary source file"""
|
||||
src_file = tmp_path / "source.txt"
|
||||
src_file.write_text("test content")
|
||||
return Media(key="subdir/test.txt", filename=str(src_file))
|
||||
return Media(filename=str(src_file))
|
||||
|
||||
def test_really_long_website_url_save(local_storage, tmp_path):
|
||||
long_filename = os.path.join(local_storage.save_to, "file"*100 + ".txt")
|
||||
src_file = tmp_path / "source.txt"
|
||||
src_file.write_text("test content")
|
||||
media = Media(key=long_filename, filename=str(src_file))
|
||||
assert local_storage.upload(media) is True
|
||||
assert src_file.read_text() == Path(local_storage.get_cdn_url(media)).read_text()
|
||||
def test_too_long_save_path(setup_module):
|
||||
with pytest.raises(SetupError):
|
||||
setup_module("local_storage", {"save_to": "long"*100})
|
||||
|
||||
def test_get_cdn_url_relative(local_storage):
|
||||
media = Media(key="test.txt", filename="dummy.txt")
|
||||
media = Media(filename="dummy.txt")
|
||||
local_storage.set_key(media, "https://example.com", Metadata())
|
||||
expected = os.path.join(local_storage.save_to, media.key)
|
||||
assert local_storage.get_cdn_url(media) == expected
|
||||
|
||||
|
||||
|
||||
def test_get_cdn_url_absolute(local_storage):
|
||||
media = Media(key="test.txt", filename="dummy.txt")
|
||||
media = Media(filename="dummy.txt")
|
||||
local_storage.save_absolute = True
|
||||
local_storage.set_key(media, "https://example.com", Metadata())
|
||||
expected = os.path.abspath(os.path.join(local_storage.save_to, media.key))
|
||||
assert local_storage.get_cdn_url(media) == expected
|
||||
|
||||
def test_upload_file_contents_and_metadata(local_storage, sample_media):
|
||||
local_storage.store(sample_media, "https://example.com", Metadata())
|
||||
dest = os.path.join(local_storage.save_to, sample_media.key)
|
||||
assert local_storage.upload(sample_media) is True
|
||||
assert Path(sample_media.filename).read_text() == Path(dest).read_text()
|
||||
|
||||
|
||||
def test_upload_nonexistent_source(local_storage):
|
||||
media = Media(key="missing.txt", filename="nonexistent.txt")
|
||||
media = Media(_key="missing.txt", filename="nonexistent.txt")
|
||||
with pytest.raises(FileNotFoundError):
|
||||
local_storage.upload(media)
|
||||
|
||||
|
||||
@@ -54,7 +54,7 @@ def storage_base():
|
||||
|
||||
],
|
||||
)
|
||||
def test_storage_setup(storage_base, path_generator, filename_generator, url, expected_key, mocker):
|
||||
def test_storage_name_generation(storage_base, path_generator, filename_generator, url, expected_key, mocker):
|
||||
mock_random = mocker.patch("auto_archiver.core.storage.random_str")
|
||||
mock_random.return_value = "pretend-random"
|
||||
|
||||
@@ -92,7 +92,4 @@ def test_really_long_name(storage_base):
|
||||
url = f"https://example.com/{'file'*100}"
|
||||
media = Media(filename="dummy.txt")
|
||||
storage.set_key(media, url, Metadata())
|
||||
assert len(media.key) <= storage.max_file_length()
|
||||
assert media.key == "https-example-com-filefilefilefilefilefilefilefilefilefilefilefile\
|
||||
filefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefile\
|
||||
filefilefilefilefilefilefilefilefilefilefilefilefilefilefilefilefile/6ae8a75555209fd6c44157c0.txt"
|
||||
assert media.key == f"https-example-com-{'file'*13}/6ae8a75555209fd6c44157c0.txt"
|
||||
Reference in New Issue
Block a user