From 6cb7afefdc38db53296fc9ccd9b20d9fbd3c5704 Mon Sep 17 00:00:00 2001 From: erinhmclark Date: Wed, 5 Mar 2025 10:24:54 +0000 Subject: [PATCH] Initial Atlos merge --- poetry.lock | 50 +++++++------- .../__manifest__.py | 11 ++-- .../atlos_feeder_db_storage.py} | 66 ++++++++++++++++++- 3 files changed, 93 insertions(+), 34 deletions(-) rename src/auto_archiver/modules/{atlos_feeder_db => atlos_feeder_db_storage}/__manifest__.py (79%) rename src/auto_archiver/modules/{atlos_feeder_db/atlos_feeder_database.py => atlos_feeder_db_storage/atlos_feeder_db_storage.py} (58%) diff --git a/poetry.lock b/poetry.lock index 2855bb5..16ec2f8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -103,14 +103,14 @@ tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] [[package]] name = "authlib" -version = "1.5.0" +version = "1.5.1" description = "The ultimate Python library in building OAuth and OpenID Connect servers and clients." optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "Authlib-1.5.0-py2.py3-none-any.whl", hash = "sha256:b3cc5ccfc19cf87678046b6e7cb19d402d8a631a33c40e36385232203227953a"}, - {file = "authlib-1.5.0.tar.gz", hash = "sha256:8fd8bd8f806485a532ac39a17b579982cf54688f956174f995cc938a91725423"}, + {file = "authlib-1.5.1-py2.py3-none-any.whl", hash = "sha256:8408861cbd9b4ea2ff759b00b6f02fd7d81ac5a56d0b2b22c08606c6049aae11"}, + {file = "authlib-1.5.1.tar.gz", hash = "sha256:5cbc85ecb0667312c1cdc2f9095680bb735883b123fb509fde1e65b1c5df972e"}, ] [package.dependencies] @@ -172,18 +172,18 @@ lxml = ["lxml"] [[package]] name = "boto3" -version = "1.37.0" +version = "1.37.5" description = "The AWS SDK for Python" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "boto3-1.37.0-py3-none-any.whl", hash = "sha256:03bd8c93b226f07d944fd6b022e11a307bff94ab6a21d51675d7e3ea81ee8424"}, - {file = "boto3-1.37.0.tar.gz", hash = "sha256:01015b38017876d79efd7273f35d9a4adfba505237159621365bed21b9b65eca"}, + {file = "boto3-1.37.5-py3-none-any.whl", hash = "sha256:12166353519aca0cc8d9dcfbbb0d38f8915955a5912b8cb241b2b2314f0dbc14"}, + {file = "boto3-1.37.5.tar.gz", hash = "sha256:ae6e7048beeaa4478368e554a4b290e3928beb0ae8d8767d108d72381a81af30"}, ] [package.dependencies] -botocore = ">=1.37.0,<1.38.0" +botocore = ">=1.37.5,<1.38.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.11.0,<0.12.0" @@ -192,14 +192,14 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.37.0" +version = "1.37.5" description = "Low-level, data-driven core of boto 3." optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "botocore-1.37.0-py3-none-any.whl", hash = "sha256:d01661f38c0edac87424344cdf4169f3ab9bc1bf1b677c8b230d025eb66c54a3"}, - {file = "botocore-1.37.0.tar.gz", hash = "sha256:b129d091a8360b4152ab65327186bf4e250de827c4a9b7ddf40a72b1acf1f3c1"}, + {file = "botocore-1.37.5-py3-none-any.whl", hash = "sha256:e5cfbb8026d5b4fadd9b3a18b61d238a41a8b8f620ab75873dc1467d456150d6"}, + {file = "botocore-1.37.5.tar.gz", hash = "sha256:f8f526d33ae74d242c577e0440b57b9ec7d53edd41db211155ec8087fe7a5a21"}, ] [package.dependencies] @@ -781,14 +781,14 @@ grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] [[package]] name = "google-api-python-client" -version = "2.161.0" +version = "2.162.0" description = "Google API Client Library for Python" optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "google_api_python_client-2.161.0-py2.py3-none-any.whl", hash = "sha256:9476a5a4f200bae368140453df40f9cda36be53fa7d0e9a9aac4cdb859a26448"}, - {file = "google_api_python_client-2.161.0.tar.gz", hash = "sha256:324c0cce73e9ea0a0d2afd5937e01b7c2d6a4d7e2579cdb6c384f9699d6c9f37"}, + {file = "google_api_python_client-2.162.0-py2.py3-none-any.whl", hash = "sha256:49365fa4f7795fe81a747f5544d6528ea94314fa59664e0ea1005f603facf1ec"}, + {file = "google_api_python_client-2.162.0.tar.gz", hash = "sha256:5f8bc934a5b6eea73a7d12d999e6585c1823179f48340234acb385e2502e735a"}, ] [package.dependencies] @@ -860,14 +860,14 @@ tool = ["click (>=6.0.0)"] [[package]] name = "googleapis-common-protos" -version = "1.68.0" +version = "1.69.0" description = "Common protobufs used in Google APIs" optional = false python-versions = ">=3.7" groups = ["main"] files = [ - {file = "googleapis_common_protos-1.68.0-py2.py3-none-any.whl", hash = "sha256:aaf179b2f81df26dfadac95def3b16a95064c76a5f45f07e4c68a21bb371c4ac"}, - {file = "googleapis_common_protos-1.68.0.tar.gz", hash = "sha256:95d38161f4f9af0d9423eed8fb7b64ffd2568c3464eb542ff02c5bfa1953ab3c"}, + {file = "googleapis_common_protos-1.69.0-py2.py3-none-any.whl", hash = "sha256:17835fdc4fa8da1d61cfe2d4d5d57becf7c61d4112f8d81c67eaa9d7ce43042d"}, + {file = "googleapis_common_protos-1.69.0.tar.gz", hash = "sha256:5a46d58af72846f59009b9c4710425b9af2139555c71837081706b213b298187"}, ] [package.dependencies] @@ -878,14 +878,14 @@ grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"] [[package]] name = "gspread" -version = "6.1.4" +version = "6.2.0" description = "Google Spreadsheets Python API" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "gspread-6.1.4-py3-none-any.whl", hash = "sha256:c34781c426031a243ad154952b16f21ac56a5af90687885fbee3d1fba5280dcd"}, - {file = "gspread-6.1.4.tar.gz", hash = "sha256:b8eec27de7cadb338bb1b9f14a9be168372dee8965c0da32121816b5050ac1de"}, + {file = "gspread-6.2.0-py3-none-any.whl", hash = "sha256:7fa1a11e1ecacc6c5946fa016be05941baca8540404314f59aec963dd8ae5db3"}, + {file = "gspread-6.2.0.tar.gz", hash = "sha256:bc3d02d1c39e0b40bfc8035b4fec407aa71a17f343fc81cc7e3f75bfa6555de6"}, ] [package.dependencies] @@ -1777,14 +1777,14 @@ files = [ [[package]] name = "pytest" -version = "8.3.4" +version = "8.3.5" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.8" groups = ["dev"] files = [ - {file = "pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6"}, - {file = "pytest-8.3.4.tar.gz", hash = "sha256:965370d062bce11e73868e0335abac31b4d3de0e82f4007408d242b4f8610761"}, + {file = "pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820"}, + {file = "pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845"}, ] [package.dependencies] @@ -2248,14 +2248,14 @@ files = [ [[package]] name = "s3transfer" -version = "0.11.2" +version = "0.11.3" description = "An Amazon S3 Transfer Manager" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "s3transfer-0.11.2-py3-none-any.whl", hash = "sha256:be6ecb39fadd986ef1701097771f87e4d2f821f27f6071c872143884d2950fbc"}, - {file = "s3transfer-0.11.2.tar.gz", hash = "sha256:3b39185cb72f5acc77db1a58b6e25b977f28d20496b6e58d6813d75f464d632f"}, + {file = "s3transfer-0.11.3-py3-none-any.whl", hash = "sha256:ca855bdeb885174b5ffa95b9913622459d4ad8e331fc98eb01e6d5eb6a30655d"}, + {file = "s3transfer-0.11.3.tar.gz", hash = "sha256:edae4977e3a122445660c7c114bba949f9d191bae3b34a096f18a1c8c354527a"}, ] [package.dependencies] diff --git a/src/auto_archiver/modules/atlos_feeder_db/__manifest__.py b/src/auto_archiver/modules/atlos_feeder_db_storage/__manifest__.py similarity index 79% rename from src/auto_archiver/modules/atlos_feeder_db/__manifest__.py rename to src/auto_archiver/modules/atlos_feeder_db_storage/__manifest__.py index 54222f6..3920246 100644 --- a/src/auto_archiver/modules/atlos_feeder_db/__manifest__.py +++ b/src/auto_archiver/modules/atlos_feeder_db_storage/__manifest__.py @@ -1,7 +1,7 @@ { - "name": "Atlos Feeder Database", - "type": ["feeder", "database"], -"entry_point": "atlos_feeder_db::AtlosFeederDb", + "name": "Atlos Feeder Database Storage", + "type": ["feeder", "database", "storage"], +"entry_point": "atlos_feeder_db_storage::AtlosFeederDbStorage", "requires_setup": True, "dependencies": { "python": ["loguru", "requests"], @@ -19,11 +19,9 @@ }, }, "description": """ - AtlosFeederDb: A feeder module that integrates with the Atlos API to fetch source material URLs for archival, + AtlosFeederDbStorage: A module that integrates with the Atlos API to fetch source material URLs for archival, uplaod extracted media, along with a database option to output archival results. - Feeder: A feeder module that integrates with the Atlos API to fetch source material URLs for archival. - ### Features - Connects to the Atlos API to retrieve a list of source material URLs. - Filters source materials based on visibility, processing status, and metadata. @@ -33,6 +31,7 @@ - Updates failure status with error details when archiving fails. - Processes and formats metadata, including ISO formatting for datetime fields. - Skips processing for items without an Atlos ID. + - Saves media files to Atlos, organizing them into folders based on the provided path structure. ### Notes - Requires an Atlos API endpoint and a valid API token for authentication. diff --git a/src/auto_archiver/modules/atlos_feeder_db/atlos_feeder_database.py b/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py similarity index 58% rename from src/auto_archiver/modules/atlos_feeder_db/atlos_feeder_database.py rename to src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py index 4bd3368..7bcd74e 100644 --- a/src/auto_archiver/modules/atlos_feeder_db/atlos_feeder_database.py +++ b/src/auto_archiver/modules/atlos_feeder_db_storage/atlos_feeder_db_storage.py @@ -1,15 +1,19 @@ -import requests +import hashlib +import os +from typing import IO, Optional from typing import Union +import requests from loguru import logger from auto_archiver.core import Database - from auto_archiver.core import Feeder +from auto_archiver.core import Media from auto_archiver.core import Metadata +from auto_archiver.core import Storage -class AtlosFeederDb(Feeder, Database): +class AtlosFeederDbStorage(Feeder, Database, Storage): def __iter__(self) -> Metadata: # Get all the urls from the Atlos API @@ -98,3 +102,59 @@ class AtlosFeederDb(Feeder, Database): logger.info( f"Stored success for {item.get_url()} (ID {item.metadata['atlos_id']}) on Atlos" ) + + def get_cdn_url(self, _media: Media) -> str: + # It's not always possible to provide an exact URL, because it's + # possible that the media once uploaded could have been copied to + # another project. + return self.atlos_url + + def _hash(self, media: Media) -> str: + # Hash the media file using sha-256. We don't use the existing auto archiver + # hash because there's no guarantee that the configuerer is using sha-256, which + # is how Atlos hashes files. + + sha256 = hashlib.sha256() + with open(media.filename, "rb") as f: + while True: + buf = f.read(4096) + if not buf: break + sha256.update(buf) + return sha256.hexdigest() + + def upload(self, media: Media, metadata: Optional[Metadata] = None, **_kwargs) -> bool: + atlos_id = metadata.get("atlos_id") + if atlos_id is None: + logger.error(f"No Atlos ID found in metadata; can't store {media.filename} on Atlos") + return False + + media_hash = self._hash(media) + # media_hash = calculate_file_hash(media.filename, hash_algo=hashlib.sha256, chunksize=4096) + + # Check whether the media has already been uploaded + source_material = requests.get( + f"{self.atlos_url}/api/v2/source_material/{atlos_id}", + headers={"Authorization": f"Bearer {self.api_token}"}, + ).json()["result"] + existing_media = [x["file_hash_sha256"] for x in source_material.get("artifacts", [])] + if media_hash in existing_media: + logger.info(f"{media.filename} with SHA256 {media_hash} already uploaded to Atlos") + return True + + # Upload the media to the Atlos API + requests.post( + f"{self.atlos_url}/api/v2/source_material/upload/{atlos_id}", + headers={"Authorization": f"Bearer {self.api_token}"}, + params={ + "title": media.properties + }, + files={"file": (os.path.basename(media.filename), open(media.filename, "rb"))}, + ).raise_for_status() + + logger.info(f"Uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}") + + return True + + # must be implemented even if unused + def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: + pass