mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 13:18:28 +03:00
Move storage configs into individual manifests, assert format on usage.
This commit is contained in:
@@ -15,29 +15,6 @@ from slugify import slugify
|
||||
@dataclass
|
||||
class Storage(Step):
|
||||
name = "storage"
|
||||
PATH_GENERATOR_OPTIONS = ["flat", "url", "random"]
|
||||
FILENAME_GENERATOR_CHOICES = ["random", "static"]
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
assert self.path_generator in Storage.PATH_GENERATOR_OPTIONS, f"path_generator must be one of {Storage.PATH_GENERATOR_OPTIONS}"
|
||||
assert self.filename_generator in Storage.FILENAME_GENERATOR_CHOICES, f"filename_generator must be one of {Storage.FILENAME_GENERATOR_CHOICES}"
|
||||
|
||||
@staticmethod
|
||||
def configs() -> dict:
|
||||
return {
|
||||
"path_generator": {
|
||||
"default": "url",
|
||||
"help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
|
||||
"choices": Storage.PATH_GENERATOR_OPTIONS
|
||||
},
|
||||
"filename_generator": {
|
||||
"default": "random",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
|
||||
"choices": Storage.FILENAME_GENERATOR_CHOICES
|
||||
}
|
||||
}
|
||||
|
||||
def init(name: str, config: dict) -> Storage:
|
||||
# only for typing...
|
||||
@@ -68,19 +45,27 @@ class Storage(Step):
|
||||
folder = ArchivingContext.get("folder", "")
|
||||
filename, ext = os.path.splitext(media.filename)
|
||||
|
||||
# path_generator logic
|
||||
if self.path_generator == "flat":
|
||||
# Handle path_generator logic
|
||||
path_generator = ArchivingContext.get("path_generator", "url")
|
||||
if path_generator == "flat":
|
||||
path = ""
|
||||
filename = slugify(filename) # in case it comes with os.sep
|
||||
elif self.path_generator == "url": path = slugify(url)
|
||||
elif self.path_generator == "random":
|
||||
filename = slugify(filename) # Ensure filename is slugified
|
||||
elif path_generator == "url":
|
||||
path = slugify(url)
|
||||
elif path_generator == "random":
|
||||
path = ArchivingContext.get("random_path", random_str(24), True)
|
||||
else:
|
||||
raise ValueError(f"Invalid path_generator: {path_generator}")
|
||||
|
||||
# filename_generator logic
|
||||
if self.filename_generator == "random": filename = random_str(24)
|
||||
elif self.filename_generator == "static":
|
||||
# Handle filename_generator logic
|
||||
filename_generator = ArchivingContext.get("filename_generator", "random")
|
||||
if filename_generator == "random":
|
||||
filename = random_str(24)
|
||||
elif filename_generator == "static":
|
||||
he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
|
||||
hd = he.calculate_hash(media.filename)
|
||||
filename = hd[:24]
|
||||
else:
|
||||
raise ValueError(f"Invalid filename_generator: {filename_generator}")
|
||||
|
||||
media.key = os.path.join(folder, path, f"{filename}{ext}")
|
||||
|
||||
@@ -2,23 +2,25 @@
|
||||
"name": "atlos_storage",
|
||||
"type": ["storage"],
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": ["loguru", "requests"],
|
||||
"bin": [""]
|
||||
},
|
||||
"external_dependencies": {"python": ["loguru", "requests"], "bin": [""]},
|
||||
"configs": {
|
||||
# TODO: get base storage configs
|
||||
# TODO also? get_atlos_config_options()
|
||||
|
||||
"path_generator": {
|
||||
"default": "url",
|
||||
"help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
|
||||
},
|
||||
"filename_generator": {
|
||||
"default": "random",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
|
||||
},
|
||||
"api_token": {
|
||||
"default": None,
|
||||
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
|
||||
"cli_set": lambda cli_val, _: cli_val
|
||||
"cli_set": lambda cli_val, _: cli_val,
|
||||
},
|
||||
"atlos_url": {
|
||||
"default": "https://platform.atlos.org",
|
||||
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
|
||||
"cli_set": lambda cli_val, _: cli_val
|
||||
"cli_set": lambda cli_val, _: cli_val,
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
@@ -34,5 +36,5 @@
|
||||
### Notes
|
||||
- Requires Atlos API configuration, including `atlos_url` and `api_token`.
|
||||
- Files are linked to an `atlos_id` in the metadata, ensuring proper association with Atlos source materials.
|
||||
"""
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -12,6 +12,14 @@ m = {
|
||||
],
|
||||
},
|
||||
"configs": {
|
||||
"path_generator": {
|
||||
"default": "url",
|
||||
"help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
|
||||
},
|
||||
"filename_generator": {
|
||||
"default": "random",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
|
||||
},
|
||||
# TODO: get base storage configs
|
||||
"root_folder_id": {"default": None, "help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"},
|
||||
"oauth_token": {"default": None, "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."},
|
||||
|
||||
@@ -6,7 +6,14 @@ m = {
|
||||
"python": ["loguru"],
|
||||
},
|
||||
"configs": {
|
||||
# TODO: get base storage configs
|
||||
"path_generator": {
|
||||
"default": "url",
|
||||
"help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
|
||||
},
|
||||
"filename_generator": {
|
||||
"default": "random",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
|
||||
},
|
||||
"save_to": {"default": "./archived", "help": "folder where to save archived content"},
|
||||
"save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
|
||||
},
|
||||
|
||||
@@ -6,7 +6,14 @@ m = {
|
||||
"python": ["boto3", "loguru"],
|
||||
},
|
||||
"configs": {
|
||||
# TODO: get base storage configs
|
||||
"path_generator": {
|
||||
"default": "url",
|
||||
"help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
|
||||
},
|
||||
"filename_generator": {
|
||||
"default": "random",
|
||||
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
|
||||
},
|
||||
"bucket": {"default": None, "help": "S3 bucket name"},
|
||||
"region": {"default": None, "help": "S3 region name"},
|
||||
"key": {"default": None, "help": "S3 API key"},
|
||||
|
||||
Reference in New Issue
Block a user