Initial Atlos merge

This commit is contained in:
erinhmclark
2025-03-05 10:24:54 +00:00
parent 22932645aa
commit 6cb7afefdc
3 changed files with 93 additions and 34 deletions

View File

@@ -1,7 +1,7 @@
{
"name": "Atlos Feeder Database",
"type": ["feeder", "database"],
"entry_point": "atlos_feeder_db::AtlosFeederDb",
"name": "Atlos Feeder Database Storage",
"type": ["feeder", "database", "storage"],
"entry_point": "atlos_feeder_db_storage::AtlosFeederDbStorage",
"requires_setup": True,
"dependencies": {
"python": ["loguru", "requests"],
@@ -19,11 +19,9 @@
},
},
"description": """
AtlosFeederDb: A feeder module that integrates with the Atlos API to fetch source material URLs for archival,
AtlosFeederDbStorage: A module that integrates with the Atlos API to fetch source material URLs for archival, upload extracted media,
along with a database option to output archival results.
Feeder: A feeder module that integrates with the Atlos API to fetch source material URLs for archival.
### Features
- Connects to the Atlos API to retrieve a list of source material URLs.
- Filters source materials based on visibility, processing status, and metadata.
@@ -33,6 +31,7 @@
- Updates failure status with error details when archiving fails.
- Processes and formats metadata, including ISO formatting for datetime fields.
- Skips processing for items without an Atlos ID.
- Saves media files to Atlos, organizing them into folders based on the provided path structure.
### Notes
- Requires an Atlos API endpoint and a valid API token for authentication.

View File

@@ -1,15 +1,19 @@
import requests
import hashlib
import os
from typing import IO, Optional
from typing import Union
import requests
from loguru import logger
from auto_archiver.core import Database
from auto_archiver.core import Feeder
from auto_archiver.core import Media
from auto_archiver.core import Metadata
from auto_archiver.core import Storage
class AtlosFeederDb(Feeder, Database):
class AtlosFeederDbStorage(Feeder, Database, Storage):
def __iter__(self) -> Metadata:
# Get all the urls from the Atlos API
@@ -98,3 +102,59 @@ class AtlosFeederDb(Feeder, Database):
logger.info(
f"Stored success for {item.get_url()} (ID {item.metadata['atlos_id']}) on Atlos"
)
def get_cdn_url(self, _media: Media) -> str:
    """Return the URL under which uploaded media can be found on Atlos.

    The media argument is ignored: once uploaded, media could have been
    copied to another project, so an exact per-file URL cannot always be
    provided and the configured Atlos base URL is returned instead.
    """
    base_url = self.atlos_url
    return base_url
def _hash(self, media: Media) -> str:
    """Return the hex-encoded SHA-256 digest of the file at ``media.filename``.

    The digest is computed here instead of reusing the auto archiver's
    existing hash, because there is no guarantee that the configured
    algorithm is SHA-256, which is how Atlos hashes files.
    """
    digest = hashlib.sha256()
    with open(media.filename, "rb") as stream:
        # Read in 4096-byte chunks; the empty-bytes sentinel marks end of file.
        for chunk in iter(lambda: stream.read(4096), b""):
            digest.update(chunk)
    return digest.hexdigest()
def upload(self, media: Media, metadata: Optional[Metadata] = None, **_kwargs) -> bool:
    """Upload a media file to the Atlos source material it belongs to.

    Args:
        media: The media whose file (``media.filename``) is uploaded.
        metadata: Metadata expected to carry the ``atlos_id`` of the target
            source material.
        **_kwargs: Ignored.

    Returns:
        True if the file was uploaded, or if an artifact with the same
        SHA-256 hash is already attached to the source material; False if
        no Atlos ID is available.

    Raises:
        requests.HTTPError: If Atlos answers either request with an error
            status.
    """
    # ``metadata`` defaults to None, so guard before calling ``.get`` on it.
    atlos_id = metadata.get("atlos_id") if metadata is not None else None
    if atlos_id is None:
        logger.error(f"No Atlos ID found in metadata; can't store {media.filename} on Atlos")
        return False

    media_hash = self._hash(media)
    auth_headers = {"Authorization": f"Bearer {self.api_token}"}

    # Check whether the media has already been uploaded
    response = requests.get(
        f"{self.atlos_url}/api/v2/source_material/{atlos_id}",
        headers=auth_headers,
    )
    # Surface HTTP failures directly instead of a KeyError on an error payload.
    response.raise_for_status()
    source_material = response.json()["result"]
    existing_hashes = {
        artifact.get("file_hash_sha256")
        for artifact in source_material.get("artifacts", [])
    }
    if media_hash in existing_hashes:
        logger.info(f"{media.filename} with SHA256 {media_hash} already uploaded to Atlos")
        return True

    # Upload the media to the Atlos API; the context manager guarantees the
    # file handle is closed even when the request fails.
    with open(media.filename, "rb") as file_handle:
        requests.post(
            f"{self.atlos_url}/api/v2/source_material/upload/{atlos_id}",
            headers=auth_headers,
            # NOTE(review): the title sent is media.properties while the log
            # line below reports media.key -- confirm which one is intended.
            params={"title": media.properties},
            files={"file": (os.path.basename(media.filename), file_handle)},
        ).raise_for_status()

    logger.info(f"Uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}")
    return True
# must be implemented even if unused
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
    """Placeholder for file-object uploads; performs no upload.

    Args:
        file: Binary file object; not read.
        key: Storage key; not used.
        **kwargs: Ignored.

    Returns:
        Always False, because nothing is uploaded. This honours the declared
        ``bool`` return type while staying falsy, like the previous implicit
        ``None``.
    """
    return False