mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 13:18:28 +03:00
Set up feeder manifests (not merged by source yet)
This commit is contained in:
0
src/auto_archiver/modules/api_db/__init__.py
Normal file
0
src/auto_archiver/modules/api_db/__init__.py
Normal file
33
src/auto_archiver/modules/api_db/__manifest__.py
Normal file
33
src/auto_archiver/modules/api_db/__manifest__.py
Normal file
@@ -0,0 +1,33 @@
|
||||
{
|
||||
"name": "Auto-Archiver API Database",
|
||||
"type": ["database"],
|
||||
"entry_point": "api_db:AAApiDb",
|
||||
"requires_setup": True,
|
||||
"external_dependencies": {
|
||||
"python": ["requests",
|
||||
"loguru"],
|
||||
},
|
||||
"configs": {
|
||||
"api_endpoint": {"default": None, "help": "API endpoint where calls are made to"},
|
||||
"api_token": {"default": None, "help": "API Bearer token."},
|
||||
"public": {"default": False, "help": "whether the URL should be publicly available via the API"},
|
||||
"author_id": {"default": None, "help": "which email to assign as author"},
|
||||
"group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
|
||||
"allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"},
|
||||
"store_results": {"default": True, "help": "when set, will send the results to the API database."},
|
||||
"tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))},
|
||||
},
|
||||
"description": """
|
||||
Provides integration with the Auto-Archiver API for querying and storing archival data.
|
||||
|
||||
### Features
|
||||
- **API Integration**: Supports querying for existing archives and submitting results.
|
||||
- **Duplicate Prevention**: Avoids redundant archiving when `allow_rearchive` is disabled.
|
||||
- **Configurable**: Supports settings like API endpoint, authentication token, tags, and permissions.
|
||||
- **Tagging and Metadata**: Adds tags and manages metadata for archives.
|
||||
- **Optional Storage**: Archives results conditionally based on configuration.
|
||||
|
||||
### Setup
|
||||
Requires access to an Auto-Archiver API instance and a valid API token.
|
||||
""",
|
||||
}
|
||||
59
src/auto_archiver/modules/api_db/api_db.py
Normal file
59
src/auto_archiver/modules/api_db/api_db.py
Normal file
@@ -0,0 +1,59 @@
|
||||
from typing import Union
|
||||
import requests, os
|
||||
from loguru import logger
|
||||
|
||||
from auto_archiver.databases import Database
|
||||
from auto_archiver.core import Metadata
|
||||
|
||||
|
||||
class AAApiDb(Database):
|
||||
"""
|
||||
Connects to auto-archiver-api instance
|
||||
"""
|
||||
name = "auto_archiver_api_db"
|
||||
|
||||
def __init__(self, config: dict) -> None:
|
||||
# without this STEP.__init__ is not called
|
||||
super().__init__(config)
|
||||
self.allow_rearchive = bool(self.allow_rearchive)
|
||||
self.store_results = bool(self.store_results)
|
||||
self.assert_valid_string("api_endpoint")
|
||||
|
||||
|
||||
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
|
||||
""" query the database for the existence of this item.
|
||||
Helps avoid re-archiving the same URL multiple times.
|
||||
"""
|
||||
if not self.allow_rearchive: return
|
||||
|
||||
params = {"url": item.get_url(), "limit": 15}
|
||||
headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"}
|
||||
response = requests.get(os.path.join(self.api_endpoint, "tasks/search-url"), params=params, headers=headers)
|
||||
|
||||
if response.status_code == 200:
|
||||
if len(response.json()):
|
||||
logger.success(f"API returned {len(response.json())} previously archived instance(s)")
|
||||
fetched_metadata = [Metadata.from_dict(r["result"]) for r in response.json()]
|
||||
return Metadata.choose_most_complete(fetched_metadata)
|
||||
else:
|
||||
logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
|
||||
return False
|
||||
|
||||
|
||||
def done(self, item: Metadata, cached: bool=False) -> None:
|
||||
"""archival result ready - should be saved to DB"""
|
||||
if not self.store_results: return
|
||||
if cached:
|
||||
logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached")
|
||||
return
|
||||
logger.debug(f"saving archive of {item.get_url()} to the AA API.")
|
||||
|
||||
payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)}
|
||||
headers = {"Authorization": f"Bearer {self.api_token}"}
|
||||
response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, headers=headers)
|
||||
|
||||
if response.status_code == 200:
|
||||
logger.success(f"AA API: {response.json()}")
|
||||
else:
|
||||
logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
|
||||
|
||||
Reference in New Issue
Block a user