Initial Atlos merge

This commit is contained in:
erinhmclark
2025-03-05 10:24:54 +00:00
parent 22932645aa
commit 6cb7afefdc
3 changed files with 93 additions and 34 deletions

50
poetry.lock generated
View File

@@ -103,14 +103,14 @@ tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"]
[[package]] [[package]]
name = "authlib" name = "authlib"
version = "1.5.0" version = "1.5.1"
description = "The ultimate Python library in building OAuth and OpenID Connect servers and clients." description = "The ultimate Python library in building OAuth and OpenID Connect servers and clients."
optional = false optional = false
python-versions = ">=3.9" python-versions = ">=3.9"
groups = ["main"] groups = ["main"]
files = [ files = [
{file = "Authlib-1.5.0-py2.py3-none-any.whl", hash = "sha256:b3cc5ccfc19cf87678046b6e7cb19d402d8a631a33c40e36385232203227953a"}, {file = "authlib-1.5.1-py2.py3-none-any.whl", hash = "sha256:8408861cbd9b4ea2ff759b00b6f02fd7d81ac5a56d0b2b22c08606c6049aae11"},
{file = "authlib-1.5.0.tar.gz", hash = "sha256:8fd8bd8f806485a532ac39a17b579982cf54688f956174f995cc938a91725423"}, {file = "authlib-1.5.1.tar.gz", hash = "sha256:5cbc85ecb0667312c1cdc2f9095680bb735883b123fb509fde1e65b1c5df972e"},
] ]
[package.dependencies] [package.dependencies]
@@ -172,18 +172,18 @@ lxml = ["lxml"]
[[package]] [[package]]
name = "boto3" name = "boto3"
version = "1.37.0" version = "1.37.5"
description = "The AWS SDK for Python" description = "The AWS SDK for Python"
optional = false optional = false
python-versions = ">=3.8" python-versions = ">=3.8"
groups = ["main"] groups = ["main"]
files = [ files = [
{file = "boto3-1.37.0-py3-none-any.whl", hash = "sha256:03bd8c93b226f07d944fd6b022e11a307bff94ab6a21d51675d7e3ea81ee8424"}, {file = "boto3-1.37.5-py3-none-any.whl", hash = "sha256:12166353519aca0cc8d9dcfbbb0d38f8915955a5912b8cb241b2b2314f0dbc14"},
{file = "boto3-1.37.0.tar.gz", hash = "sha256:01015b38017876d79efd7273f35d9a4adfba505237159621365bed21b9b65eca"}, {file = "boto3-1.37.5.tar.gz", hash = "sha256:ae6e7048beeaa4478368e554a4b290e3928beb0ae8d8767d108d72381a81af30"},
] ]
[package.dependencies] [package.dependencies]
botocore = ">=1.37.0,<1.38.0" botocore = ">=1.37.5,<1.38.0"
jmespath = ">=0.7.1,<2.0.0" jmespath = ">=0.7.1,<2.0.0"
s3transfer = ">=0.11.0,<0.12.0" s3transfer = ">=0.11.0,<0.12.0"
@@ -192,14 +192,14 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
[[package]] [[package]]
name = "botocore" name = "botocore"
version = "1.37.0" version = "1.37.5"
description = "Low-level, data-driven core of boto 3." description = "Low-level, data-driven core of boto 3."
optional = false optional = false
python-versions = ">=3.8" python-versions = ">=3.8"
groups = ["main"] groups = ["main"]
files = [ files = [
{file = "botocore-1.37.0-py3-none-any.whl", hash = "sha256:d01661f38c0edac87424344cdf4169f3ab9bc1bf1b677c8b230d025eb66c54a3"}, {file = "botocore-1.37.5-py3-none-any.whl", hash = "sha256:e5cfbb8026d5b4fadd9b3a18b61d238a41a8b8f620ab75873dc1467d456150d6"},
{file = "botocore-1.37.0.tar.gz", hash = "sha256:b129d091a8360b4152ab65327186bf4e250de827c4a9b7ddf40a72b1acf1f3c1"}, {file = "botocore-1.37.5.tar.gz", hash = "sha256:f8f526d33ae74d242c577e0440b57b9ec7d53edd41db211155ec8087fe7a5a21"},
] ]
[package.dependencies] [package.dependencies]
@@ -781,14 +781,14 @@ grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"]
[[package]] [[package]]
name = "google-api-python-client" name = "google-api-python-client"
version = "2.161.0" version = "2.162.0"
description = "Google API Client Library for Python" description = "Google API Client Library for Python"
optional = false optional = false
python-versions = ">=3.7" python-versions = ">=3.7"
groups = ["main"] groups = ["main"]
files = [ files = [
{file = "google_api_python_client-2.161.0-py2.py3-none-any.whl", hash = "sha256:9476a5a4f200bae368140453df40f9cda36be53fa7d0e9a9aac4cdb859a26448"}, {file = "google_api_python_client-2.162.0-py2.py3-none-any.whl", hash = "sha256:49365fa4f7795fe81a747f5544d6528ea94314fa59664e0ea1005f603facf1ec"},
{file = "google_api_python_client-2.161.0.tar.gz", hash = "sha256:324c0cce73e9ea0a0d2afd5937e01b7c2d6a4d7e2579cdb6c384f9699d6c9f37"}, {file = "google_api_python_client-2.162.0.tar.gz", hash = "sha256:5f8bc934a5b6eea73a7d12d999e6585c1823179f48340234acb385e2502e735a"},
] ]
[package.dependencies] [package.dependencies]
@@ -860,14 +860,14 @@ tool = ["click (>=6.0.0)"]
[[package]] [[package]]
name = "googleapis-common-protos" name = "googleapis-common-protos"
version = "1.68.0" version = "1.69.0"
description = "Common protobufs used in Google APIs" description = "Common protobufs used in Google APIs"
optional = false optional = false
python-versions = ">=3.7" python-versions = ">=3.7"
groups = ["main"] groups = ["main"]
files = [ files = [
{file = "googleapis_common_protos-1.68.0-py2.py3-none-any.whl", hash = "sha256:aaf179b2f81df26dfadac95def3b16a95064c76a5f45f07e4c68a21bb371c4ac"}, {file = "googleapis_common_protos-1.69.0-py2.py3-none-any.whl", hash = "sha256:17835fdc4fa8da1d61cfe2d4d5d57becf7c61d4112f8d81c67eaa9d7ce43042d"},
{file = "googleapis_common_protos-1.68.0.tar.gz", hash = "sha256:95d38161f4f9af0d9423eed8fb7b64ffd2568c3464eb542ff02c5bfa1953ab3c"}, {file = "googleapis_common_protos-1.69.0.tar.gz", hash = "sha256:5a46d58af72846f59009b9c4710425b9af2139555c71837081706b213b298187"},
] ]
[package.dependencies] [package.dependencies]
@@ -878,14 +878,14 @@ grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"]
[[package]] [[package]]
name = "gspread" name = "gspread"
version = "6.1.4" version = "6.2.0"
description = "Google Spreadsheets Python API" description = "Google Spreadsheets Python API"
optional = false optional = false
python-versions = ">=3.8" python-versions = ">=3.8"
groups = ["main"] groups = ["main"]
files = [ files = [
{file = "gspread-6.1.4-py3-none-any.whl", hash = "sha256:c34781c426031a243ad154952b16f21ac56a5af90687885fbee3d1fba5280dcd"}, {file = "gspread-6.2.0-py3-none-any.whl", hash = "sha256:7fa1a11e1ecacc6c5946fa016be05941baca8540404314f59aec963dd8ae5db3"},
{file = "gspread-6.1.4.tar.gz", hash = "sha256:b8eec27de7cadb338bb1b9f14a9be168372dee8965c0da32121816b5050ac1de"}, {file = "gspread-6.2.0.tar.gz", hash = "sha256:bc3d02d1c39e0b40bfc8035b4fec407aa71a17f343fc81cc7e3f75bfa6555de6"},
] ]
[package.dependencies] [package.dependencies]
@@ -1777,14 +1777,14 @@ files = [
[[package]] [[package]]
name = "pytest" name = "pytest"
version = "8.3.4" version = "8.3.5"
description = "pytest: simple powerful testing with Python" description = "pytest: simple powerful testing with Python"
optional = false optional = false
python-versions = ">=3.8" python-versions = ">=3.8"
groups = ["dev"] groups = ["dev"]
files = [ files = [
{file = "pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6"}, {file = "pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820"},
{file = "pytest-8.3.4.tar.gz", hash = "sha256:965370d062bce11e73868e0335abac31b4d3de0e82f4007408d242b4f8610761"}, {file = "pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845"},
] ]
[package.dependencies] [package.dependencies]
@@ -2248,14 +2248,14 @@ files = [
[[package]] [[package]]
name = "s3transfer" name = "s3transfer"
version = "0.11.2" version = "0.11.3"
description = "An Amazon S3 Transfer Manager" description = "An Amazon S3 Transfer Manager"
optional = false optional = false
python-versions = ">=3.8" python-versions = ">=3.8"
groups = ["main"] groups = ["main"]
files = [ files = [
{file = "s3transfer-0.11.2-py3-none-any.whl", hash = "sha256:be6ecb39fadd986ef1701097771f87e4d2f821f27f6071c872143884d2950fbc"}, {file = "s3transfer-0.11.3-py3-none-any.whl", hash = "sha256:ca855bdeb885174b5ffa95b9913622459d4ad8e331fc98eb01e6d5eb6a30655d"},
{file = "s3transfer-0.11.2.tar.gz", hash = "sha256:3b39185cb72f5acc77db1a58b6e25b977f28d20496b6e58d6813d75f464d632f"}, {file = "s3transfer-0.11.3.tar.gz", hash = "sha256:edae4977e3a122445660c7c114bba949f9d191bae3b34a096f18a1c8c354527a"},
] ]
[package.dependencies] [package.dependencies]

View File

@@ -1,7 +1,7 @@
{ {
"name": "Atlos Feeder Database", "name": "Atlos Feeder Database Storage",
"type": ["feeder", "database"], "type": ["feeder", "database", "storage"],
"entry_point": "atlos_feeder_db::AtlosFeederDb", "entry_point": "atlos_feeder_db_storage::AtlosFeederDbStorage",
"requires_setup": True, "requires_setup": True,
"dependencies": { "dependencies": {
"python": ["loguru", "requests"], "python": ["loguru", "requests"],
@@ -19,11 +19,9 @@
}, },
}, },
"description": """ "description": """
AtlosFeederDb: A feeder module that integrates with the Atlos API to fetch source material URLs for archival, AtlosFeederDbStorage: A module that integrates with the Atlos API to fetch source material URLs for archival, uplaod extracted media,
along with a database option to output archival results. along with a database option to output archival results.
Feeder: A feeder module that integrates with the Atlos API to fetch source material URLs for archival.
### Features ### Features
- Connects to the Atlos API to retrieve a list of source material URLs. - Connects to the Atlos API to retrieve a list of source material URLs.
- Filters source materials based on visibility, processing status, and metadata. - Filters source materials based on visibility, processing status, and metadata.
@@ -33,6 +31,7 @@
- Updates failure status with error details when archiving fails. - Updates failure status with error details when archiving fails.
- Processes and formats metadata, including ISO formatting for datetime fields. - Processes and formats metadata, including ISO formatting for datetime fields.
- Skips processing for items without an Atlos ID. - Skips processing for items without an Atlos ID.
- Saves media files to Atlos, organizing them into folders based on the provided path structure.
### Notes ### Notes
- Requires an Atlos API endpoint and a valid API token for authentication. - Requires an Atlos API endpoint and a valid API token for authentication.

View File

@@ -1,15 +1,19 @@
import requests import hashlib
import os
from typing import IO, Optional
from typing import Union from typing import Union
import requests
from loguru import logger from loguru import logger
from auto_archiver.core import Database from auto_archiver.core import Database
from auto_archiver.core import Feeder from auto_archiver.core import Feeder
from auto_archiver.core import Media
from auto_archiver.core import Metadata from auto_archiver.core import Metadata
from auto_archiver.core import Storage
class AtlosFeederDb(Feeder, Database): class AtlosFeederDbStorage(Feeder, Database, Storage):
def __iter__(self) -> Metadata: def __iter__(self) -> Metadata:
# Get all the urls from the Atlos API # Get all the urls from the Atlos API
@@ -98,3 +102,59 @@ class AtlosFeederDb(Feeder, Database):
logger.info( logger.info(
f"Stored success for {item.get_url()} (ID {item.metadata['atlos_id']}) on Atlos" f"Stored success for {item.get_url()} (ID {item.metadata['atlos_id']}) on Atlos"
) )
def get_cdn_url(self, _media: Media) -> str:
# It's not always possible to provide an exact URL, because it's
# possible that the media once uploaded could have been copied to
# another project.
return self.atlos_url
def _hash(self, media: Media) -> str:
# Hash the media file using sha-256. We don't use the existing auto archiver
# hash because there's no guarantee that the configuerer is using sha-256, which
# is how Atlos hashes files.
sha256 = hashlib.sha256()
with open(media.filename, "rb") as f:
while True:
buf = f.read(4096)
if not buf: break
sha256.update(buf)
return sha256.hexdigest()
def upload(self, media: Media, metadata: Optional[Metadata] = None, **_kwargs) -> bool:
atlos_id = metadata.get("atlos_id")
if atlos_id is None:
logger.error(f"No Atlos ID found in metadata; can't store {media.filename} on Atlos")
return False
media_hash = self._hash(media)
# media_hash = calculate_file_hash(media.filename, hash_algo=hashlib.sha256, chunksize=4096)
# Check whether the media has already been uploaded
source_material = requests.get(
f"{self.atlos_url}/api/v2/source_material/{atlos_id}",
headers={"Authorization": f"Bearer {self.api_token}"},
).json()["result"]
existing_media = [x["file_hash_sha256"] for x in source_material.get("artifacts", [])]
if media_hash in existing_media:
logger.info(f"{media.filename} with SHA256 {media_hash} already uploaded to Atlos")
return True
# Upload the media to the Atlos API
requests.post(
f"{self.atlos_url}/api/v2/source_material/upload/{atlos_id}",
headers={"Authorization": f"Bearer {self.api_token}"},
params={
"title": media.properties
},
files={"file": (os.path.basename(media.filename), open(media.filename, "rb"))},
).raise_for_status()
logger.info(f"Uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}")
return True
# must be implemented even if unused
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
pass