Merge pull request #243 from bellingcat/fix-long-path-names

Unit tests for storage types + fix path-too-long issues for local storage
This commit is contained in:
Patrick Robertson
2025-03-11 10:05:09 +00:00
committed by GitHub
15 changed files with 180 additions and 51 deletions

View File

@@ -15,7 +15,6 @@ class CLIFeeder(Feeder):
for url in urls:
logger.debug(f"Processing {url}")
m = Metadata().set_url(url)
m.set_context("folder", "cli")
yield m
logger.success(f"Processed {len(urls)} URL(s)")

View File

@@ -6,25 +6,42 @@ from loguru import logger
from auto_archiver.core import Media
from auto_archiver.core import Storage
from auto_archiver.core.consts import SetupError
class LocalStorage(Storage):
    """Storage backend that copies media files to a path on the local disk.

    The methods read ``self.save_to`` (the destination folder) and
    ``self.save_absolute`` (whether reported paths are made absolute).
    NOTE(review): both attributes are presumably populated from the module
    configuration before ``setup`` runs -- confirm against the base class.
    """

    def setup(self) -> None:
        """Validate the configured destination folder.

        Raises:
            SetupError: if ``self.save_to`` is longer than 200 characters.
        """
        if len(self.save_to) > 200:
            raise SetupError("Your save_to path is too long, this will cause issues saving files on your computer. Please use a shorter path.")

    def get_cdn_url(self, media: Media) -> str:
        """Return the location of ``media`` as a filesystem path.

        Returns ``media.key`` unchanged, or its ``os.path.abspath`` form when
        ``self.save_absolute`` is truthy.
        """
        # TODO: is this viable with Storage.configs on path/filename?
        # The key is used as-is; it is not joined with save_to here.
        # NOTE(review): presumably set_key() already folded save_to into the
        # key via the 'folder' context -- confirm against Storage.set_key.
        dest = media.key
        if self.save_absolute:
            dest = os.path.abspath(dest)
        return dest

    def set_key(self, media, url, metadata):
        """Compute the key for ``media`` with 'folder' prefixed by ``save_to``.

        The 'folder' context is rewritten only for the duration of the parent
        ``set_key`` call and is then put back to the value read beforehand.
        """
        # clarify we want to save the file to the save_to folder
        old_folder = metadata.get('folder', '')
        metadata.set_context('folder', os.path.join(self.save_to, old_folder))
        try:
            super().set_key(media, url, metadata)
        finally:
            # don't impact other storages that might want a different 'folder' set;
            # restoring in `finally` keeps this true even if the parent call raises
            metadata.set_context('folder', old_folder)

    def upload(self, media: Media, **kwargs) -> bool:
        """Copy ``media.filename`` to ``media.key`` and return ``True``.

        Missing parent directories of the destination are created first.
        """
        # override parent so that we can use shutil.copy2 and keep metadata
        dest = media.key
        parent_dir = os.path.dirname(dest)
        # os.makedirs('') raises FileNotFoundError, so skip it for a bare filename
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        logger.debug(f'[{self.__class__.__name__}] storing file {media.filename} with key {media.key} to {dest}')

        res = shutil.copy2(media.filename, dest)
        logger.info(res)
        return True

    # must be implemented even if unused
    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
        """Do nothing; ``upload`` above performs the copy itself."""
        pass

View File

@@ -42,7 +42,7 @@ class S3Storage(Storage):
logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
return True
def is_upload_needed(self, media: Media) -> bool:
if self.random_no_duplicate:
# checks if a folder with the hash already exists, if so it skips the upload
@@ -50,13 +50,13 @@ class S3Storage(Storage):
path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])
if existing_key:=self.file_in_folder(path):
media.key = existing_key
media._key = existing_key
media.set("previously archived", True)
logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}")
return False
_, ext = os.path.splitext(media.key)
media.key = os.path.join(path, f"{random_str(24)}{ext}")
media._key = os.path.join(path, f"{random_str(24)}{ext}")
return True
def file_in_folder(self, path:str) -> str:
@@ -66,5 +66,4 @@ class S3Storage(Storage):
resp = self.s3.list_objects(Bucket=self.bucket, Prefix=path, Delimiter='/', MaxKeys=1)
if 'Contents' in resp:
return resp['Contents'][0]['Key']
return False
return False