mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 13:18:28 +03:00
More manifests, base modules and rename from archiver to extractor.
This commit is contained in:
0
src/auto_archiver/modules/s3/__init__.py
Normal file
0
src/auto_archiver/modules/s3/__init__.py
Normal file
40
src/auto_archiver/modules/s3/__manifest__.py
Normal file
40
src/auto_archiver/modules/s3/__manifest__.py
Normal file
@@ -0,0 +1,40 @@
|
||||
m = {
|
||||
"name": "S3 Storage",
|
||||
"type": ["storage"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": ["boto3", "loguru"],
|
||||
},
|
||||
"configs": {
|
||||
# TODO: get base storage configs
|
||||
"bucket": {"default": None, "help": "S3 bucket name"},
|
||||
"region": {"default": None, "help": "S3 region name"},
|
||||
"key": {"default": None, "help": "S3 API key"},
|
||||
"secret": {"default": None, "help": "S3 API secret"},
|
||||
"random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"},
|
||||
"endpoint_url": {
|
||||
"default": 'https://{region}.digitaloceanspaces.com',
|
||||
"help": "S3 bucket endpoint, {region} are inserted at runtime"
|
||||
},
|
||||
"cdn_url": {
|
||||
"default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
|
||||
"help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
|
||||
},
|
||||
"private": {"default": False, "help": "if true S3 files will not be readable online"},
|
||||
},
|
||||
"description": """
|
||||
S3Storage: A storage module for saving media files to an S3-compatible object storage.
|
||||
|
||||
### Features
|
||||
- Uploads media files to an S3 bucket with customizable configurations.
|
||||
- Supports `random_no_duplicate` mode to avoid duplicate uploads by checking existing files based on SHA-256 hashes.
|
||||
- Automatically generates unique paths for files when duplicates are found.
|
||||
- Configurable endpoint and CDN URL for different S3-compatible providers.
|
||||
- Supports both private and public file storage, with public files being readable online.
|
||||
|
||||
### Notes
|
||||
- Requires S3 credentials (API key and secret) and a bucket name to function.
|
||||
- The `random_no_duplicate` option ensures no duplicate uploads by leveraging hash-based folder structures.
|
||||
- Uses `boto3` for interaction with the S3 API.
|
||||
"""
|
||||
}
|
||||
96
src/auto_archiver/modules/s3/s3.py
Normal file
96
src/auto_archiver/modules/s3/s3.py
Normal file
@@ -0,0 +1,96 @@
|
||||
|
||||
from typing import IO
|
||||
import boto3, os
|
||||
|
||||
from auto_archiver.utils.misc import random_str
|
||||
from auto_archiver.core import Media
|
||||
from auto_archiver.base_modules import Storage
|
||||
# TODO
|
||||
from auto_archiver.modules.hash_enricher import HashEnricher
|
||||
from loguru import logger
|
||||
|
||||
NO_DUPLICATES_FOLDER = "no-dups/"
|
||||
class S3Storage(Storage):
|
||||
name = "s3_storage"
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
super().__init__(config)
|
||||
self.s3 = boto3.client(
|
||||
's3',
|
||||
region_name=self.region,
|
||||
endpoint_url=self.endpoint_url.format(region=self.region),
|
||||
aws_access_key_id=self.key,
|
||||
aws_secret_access_key=self.secret
|
||||
)
|
||||
self.random_no_duplicate = bool(self.random_no_duplicate)
|
||||
if self.random_no_duplicate:
|
||||
logger.warning("random_no_duplicate is set to True, this will override `path_generator`, `filename_generator` and `folder`.")
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return dict(
|
||||
Storage.configs(),
|
||||
** {
|
||||
"bucket": {"default": None, "help": "S3 bucket name"},
|
||||
"region": {"default": None, "help": "S3 region name"},
|
||||
"key": {"default": None, "help": "S3 API key"},
|
||||
"secret": {"default": None, "help": "S3 API secret"},
|
||||
"random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"},
|
||||
"endpoint_url": {
|
||||
"default": 'https://{region}.digitaloceanspaces.com',
|
||||
"help": "S3 bucket endpoint, {region} are inserted at runtime"
|
||||
},
|
||||
"cdn_url": {
|
||||
"default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
|
||||
"help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
|
||||
},
|
||||
"private": {"default": False, "help": "if true S3 files will not be readable online"},
|
||||
})
|
||||
|
||||
def get_cdn_url(self, media: Media) -> str:
|
||||
return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
|
||||
|
||||
def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> None:
|
||||
if not self.is_upload_needed(media): return True
|
||||
|
||||
extra_args = kwargs.get("extra_args", {})
|
||||
if not self.private and 'ACL' not in extra_args:
|
||||
extra_args['ACL'] = 'public-read'
|
||||
|
||||
if 'ContentType' not in extra_args:
|
||||
try:
|
||||
if media.mimetype:
|
||||
extra_args['ContentType'] = media.mimetype
|
||||
except Exception as e:
|
||||
logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
|
||||
|
||||
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
|
||||
return True
|
||||
|
||||
def is_upload_needed(self, media: Media) -> bool:
|
||||
if self.random_no_duplicate:
|
||||
# checks if a folder with the hash already exists, if so it skips the upload
|
||||
he = HashEnricher({"hash_enricher": {"algorithm": "SHA-256", "chunksize": 1.6e7}})
|
||||
hd = he.calculate_hash(media.filename)
|
||||
path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])
|
||||
|
||||
if existing_key:=self.file_in_folder(path):
|
||||
media.key = existing_key
|
||||
media.set("previously archived", True)
|
||||
logger.debug(f"skipping upload of {media.filename} because it already exists in {media.key}")
|
||||
return False
|
||||
|
||||
_, ext = os.path.splitext(media.key)
|
||||
media.key = os.path.join(path, f"{random_str(24)}{ext}")
|
||||
return True
|
||||
|
||||
|
||||
def file_in_folder(self, path:str) -> str:
|
||||
# checks if path exists and is not an empty folder
|
||||
if not path.endswith('/'):
|
||||
path = path + '/'
|
||||
resp = self.s3.list_objects(Bucket=self.bucket, Prefix=path, Delimiter='/', MaxKeys=1)
|
||||
if 'Contents' in resp:
|
||||
return resp['Contents'][0]['Key']
|
||||
return False
|
||||
|
||||
Reference in New Issue
Block a user