mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-11 20:58:29 +03:00
Initial Atlos merge
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"name": "Atlos Feeder Database",
|
||||
"type": ["feeder", "database"],
|
||||
"entry_point": "atlos_feeder_db::AtlosFeederDb",
|
||||
"name": "Atlos Feeder Database Storage",
|
||||
"type": ["feeder", "database", "storage"],
|
||||
"entry_point": "atlos_feeder_db_storage::AtlosFeederDbStorage",
|
||||
"requires_setup": True,
|
||||
"dependencies": {
|
||||
"python": ["loguru", "requests"],
|
||||
@@ -19,11 +19,9 @@
|
||||
},
|
||||
},
|
||||
"description": """
|
||||
AtlosFeederDb: A feeder module that integrates with the Atlos API to fetch source material URLs for archival,
|
||||
AtlosFeederDbStorage: A module that integrates with the Atlos API to fetch source material URLs for archival, uplaod extracted media,
|
||||
along with a database option to output archival results.
|
||||
|
||||
Feeder: A feeder module that integrates with the Atlos API to fetch source material URLs for archival.
|
||||
|
||||
### Features
|
||||
- Connects to the Atlos API to retrieve a list of source material URLs.
|
||||
- Filters source materials based on visibility, processing status, and metadata.
|
||||
@@ -33,6 +31,7 @@
|
||||
- Updates failure status with error details when archiving fails.
|
||||
- Processes and formats metadata, including ISO formatting for datetime fields.
|
||||
- Skips processing for items without an Atlos ID.
|
||||
- Saves media files to Atlos, organizing them into folders based on the provided path structure.
|
||||
|
||||
### Notes
|
||||
- Requires an Atlos API endpoint and a valid API token for authentication.
|
||||
@@ -1,15 +1,19 @@
|
||||
import requests
|
||||
import hashlib
|
||||
import os
|
||||
from typing import IO, Optional
|
||||
from typing import Union
|
||||
|
||||
import requests
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.core import Database
|
||||
|
||||
from auto_archiver.core import Feeder
|
||||
from auto_archiver.core import Media
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.core import Storage
|
||||
|
||||
|
||||
class AtlosFeederDb(Feeder, Database):
|
||||
class AtlosFeederDbStorage(Feeder, Database, Storage):
|
||||
|
||||
def __iter__(self) -> Metadata:
|
||||
# Get all the urls from the Atlos API
|
||||
@@ -98,3 +102,59 @@ class AtlosFeederDb(Feeder, Database):
|
||||
logger.info(
|
||||
f"Stored success for {item.get_url()} (ID {item.metadata['atlos_id']}) on Atlos"
|
||||
)
|
||||
|
||||
def get_cdn_url(self, _media: Media) -> str:
|
||||
# It's not always possible to provide an exact URL, because it's
|
||||
# possible that the media once uploaded could have been copied to
|
||||
# another project.
|
||||
return self.atlos_url
|
||||
|
||||
def _hash(self, media: Media) -> str:
|
||||
# Hash the media file using sha-256. We don't use the existing auto archiver
|
||||
# hash because there's no guarantee that the configuerer is using sha-256, which
|
||||
# is how Atlos hashes files.
|
||||
|
||||
sha256 = hashlib.sha256()
|
||||
with open(media.filename, "rb") as f:
|
||||
while True:
|
||||
buf = f.read(4096)
|
||||
if not buf: break
|
||||
sha256.update(buf)
|
||||
return sha256.hexdigest()
|
||||
|
||||
def upload(self, media: Media, metadata: Optional[Metadata] = None, **_kwargs) -> bool:
|
||||
atlos_id = metadata.get("atlos_id")
|
||||
if atlos_id is None:
|
||||
logger.error(f"No Atlos ID found in metadata; can't store {media.filename} on Atlos")
|
||||
return False
|
||||
|
||||
media_hash = self._hash(media)
|
||||
# media_hash = calculate_file_hash(media.filename, hash_algo=hashlib.sha256, chunksize=4096)
|
||||
|
||||
# Check whether the media has already been uploaded
|
||||
source_material = requests.get(
|
||||
f"{self.atlos_url}/api/v2/source_material/{atlos_id}",
|
||||
headers={"Authorization": f"Bearer {self.api_token}"},
|
||||
).json()["result"]
|
||||
existing_media = [x["file_hash_sha256"] for x in source_material.get("artifacts", [])]
|
||||
if media_hash in existing_media:
|
||||
logger.info(f"{media.filename} with SHA256 {media_hash} already uploaded to Atlos")
|
||||
return True
|
||||
|
||||
# Upload the media to the Atlos API
|
||||
requests.post(
|
||||
f"{self.atlos_url}/api/v2/source_material/upload/{atlos_id}",
|
||||
headers={"Authorization": f"Bearer {self.api_token}"},
|
||||
params={
|
||||
"title": media.properties
|
||||
},
|
||||
files={"file": (os.path.basename(media.filename), open(media.filename, "rb"))},
|
||||
).raise_for_status()
|
||||
|
||||
logger.info(f"Uploaded {media.filename} to Atlos with ID {atlos_id} and title {media.key}")
|
||||
|
||||
return True
|
||||
|
||||
# must be implemented even if unused
|
||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
|
||||
pass
|
||||
Reference in New Issue
Block a user