Manifests for databases

This commit is contained in:
erinhmclark
2025-01-22 18:18:13 +00:00
parent 54995ad6ab
commit 99c8c69085
22 changed files with 456 additions and 11 deletions

View File

@@ -3,8 +3,8 @@
"""
from .database import Database
from .gsheet_db import GsheetsDb
from .console_db import ConsoleDb
from .csv_db import CSVDb
from .api_db import AAApiDb
from .atlos_db import AtlosDb
from .gsheet_db.gsheet_db import GsheetsDb
from .console_db.console_db import ConsoleDb
from .csv_db.csv_db import CSVDb
from .api_db.api_db import AAApiDb
from .atlos_db.atlos_db import AtlosDb

View File

@@ -0,0 +1,33 @@
{
"name": "Auto-Archiver API Database",
"type": ["database"],
"entry_point": "api_db:AAApiDb",
"requires_setup": True,
"external_dependencies": {
"python": ["requests",
"loguru"],
},
"configs": {
"api_endpoint": {"default": None, "help": "API endpoint where calls are made to"},
"api_token": {"default": None, "help": "API Bearer token."},
"public": {"default": False, "help": "whether the URL should be publicly available via the API"},
"author_id": {"default": None, "help": "which email to assign as author"},
"group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
"allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"},
"store_results": {"default": True, "help": "when set, will send the results to the API database."},
"tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))},
},
"description": """
Provides integration with the Auto-Archiver API for querying and storing archival data.
### Features
- **API Integration**: Supports querying for existing archives and submitting results.
- **Duplicate Prevention**: Avoids redundant archiving when `allow_rearchive` is disabled.
- **Configurable**: Supports settings like API endpoint, authentication token, tags, and permissions.
- **Tagging and Metadata**: Adds tags and manages metadata for archives.
- **Optional Storage**: Archives results conditionally based on configuration.
### Setup
Requires access to an Auto-Archiver API instance and a valid API token.
""",
}

View File

@@ -0,0 +1,70 @@
from typing import Union
import requests, os
from loguru import logger
from .. import Database
from ...core import Metadata
class AAApiDb(Database):
"""
Connects to auto-archiver-api instance
"""
name = "auto_archiver_api_db"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
self.allow_rearchive = bool(self.allow_rearchive)
self.store_results = bool(self.store_results)
self.assert_valid_string("api_endpoint")
@staticmethod
def configs() -> dict:
return {
"api_endpoint": {"default": None, "help": "API endpoint where calls are made to"},
"api_token": {"default": None, "help": "API Bearer token."},
"public": {"default": False, "help": "whether the URL should be publicly available via the API"},
"author_id": {"default": None, "help": "which email to assign as author"},
"group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
"allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"},
"store_results": {"default": True, "help": "when set, will send the results to the API database."},
"tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))},
}
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
""" query the database for the existence of this item.
Helps avoid re-archiving the same URL multiple times.
"""
if not self.allow_rearchive: return
params = {"url": item.get_url(), "limit": 15}
headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"}
response = requests.get(os.path.join(self.api_endpoint, "tasks/search-url"), params=params, headers=headers)
if response.status_code == 200:
if len(response.json()):
logger.success(f"API returned {len(response.json())} previously archived instance(s)")
fetched_metadata = [Metadata.from_dict(r["result"]) for r in response.json()]
return Metadata.choose_most_complete(fetched_metadata)
else:
logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
return False
def done(self, item: Metadata, cached: bool=False) -> None:
"""archival result ready - should be saved to DB"""
if not self.store_results: return
if cached:
logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached")
return
logger.debug(f"saving archive of {item.get_url()} to the AA API.")
payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)}
headers = {"Authorization": f"Bearer {self.api_token}"}
response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, headers=headers)
if response.status_code == 200:
logger.success(f"AA API: {response.json()}")
else:
logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")

View File

@@ -0,0 +1,26 @@
{
"name": "Atlos Database",
"type": ["database"],
"entry_point": "atlos_db:AtlosDb",
"requires_setup": True,
"external_dependencies":
{"python": ["loguru",
""],
"bin": [""]},
"configs": {},
"description": """
Handles integration with the Atlos platform for managing archival results.
### Features
- Outputs archival results to the Atlos API for storage and tracking.
- Updates failure status with error details when archiving fails.
- Processes and formats metadata, including ISO formatting for datetime fields.
- Skips processing for items without an Atlos ID.
### Setup
Required configs:
- atlos_url: Base URL for the Atlos API.
- api_token: Authentication token for API access.
"""
,
}

View File

@@ -0,0 +1,79 @@
import os
from typing import Union
from loguru import logger
from csv import DictWriter
from dataclasses import asdict
import requests
from .. import Database
from ...core import Metadata
from ...utils import get_atlos_config_options
class AtlosDb(Database):
"""
Outputs results to Atlos
"""
name = "atlos_db"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
@staticmethod
def configs() -> dict:
return get_atlos_config_options()
def failed(self, item: Metadata, reason: str) -> None:
"""Update DB accordingly for failure"""
# If the item has no Atlos ID, there's nothing for us to do
if not item.metadata.get("atlos_id"):
logger.info(f"Item {item.get_url()} has no Atlos ID, skipping")
return
requests.post(
f"{self.atlos_url}/api/v2/source_material/metadata/{item.metadata['atlos_id']}/auto_archiver",
headers={"Authorization": f"Bearer {self.api_token}"},
json={"metadata": {"processed": True, "status": "error", "error": reason}},
).raise_for_status()
logger.info(
f"Stored failure for {item.get_url()} (ID {item.metadata['atlos_id']}) on Atlos: {reason}"
)
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
"""check and fetch if the given item has been archived already, each
database should handle its own caching, and configuration mechanisms"""
return False
def _process_metadata(self, item: Metadata) -> dict:
"""Process metadata for storage on Atlos. Will convert any datetime
objects to ISO format."""
return {
k: v.isoformat() if hasattr(v, "isoformat") else v
for k, v in item.metadata.items()
}
def done(self, item: Metadata, cached: bool = False) -> None:
"""archival result ready - should be saved to DB"""
if not item.metadata.get("atlos_id"):
logger.info(f"Item {item.get_url()} has no Atlos ID, skipping")
return
requests.post(
f"{self.atlos_url}/api/v2/source_material/metadata/{item.metadata['atlos_id']}/auto_archiver",
headers={"Authorization": f"Bearer {self.api_token}"},
json={
"metadata": dict(
processed=True,
status="success",
results=self._process_metadata(item),
)
},
).raise_for_status()
logger.info(
f"Stored success for {item.get_url()} (ID {item.metadata['atlos_id']}) on Atlos"
)

View File

@@ -0,0 +1,22 @@
{
"name": "Console Database",
"type": ["database"],
"requires_setup": False,
"external_dependencies": {
"python": ["loguru"],
},
"description": """
Provides a simple database implementation that outputs archival results and status updates to the console.
### Features
- Logs the status of archival tasks directly to the console, including:
- started
- failed (with error details)
- aborted
- done (with optional caching status)
- Useful for debugging or lightweight setups where no external database is required.
### Setup
No additional configuration is required.
""",
}

View File

@@ -0,0 +1,32 @@
from loguru import logger
from .. import Database
from ...core import Metadata
class ConsoleDb(Database):
"""
Outputs results to the console
"""
name = "console_db"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
@staticmethod
def configs() -> dict:
return {}
def started(self, item: Metadata) -> None:
logger.warning(f"STARTED {item}")
def failed(self, item: Metadata, reason:str) -> None:
logger.error(f"FAILED {item}: {reason}")
def aborted(self, item: Metadata) -> None:
logger.warning(f"ABORTED {item}")
def done(self, item: Metadata, cached: bool=False) -> None:
"""archival result ready - should be saved to DB"""
logger.success(f"DONE {item}")

View File

@@ -0,0 +1,22 @@
{
"name": "csv_db",
"type": ["database"],
"requires_setup": False,
"external_dependencies": {"python": ["loguru"]
},
"configs": {
"csv_file": {"default": "db.csv", "help": "CSV file name"}
},
"description": """
Handles exporting archival results to a CSV file.
### Features
- Saves archival metadata as rows in a CSV file.
- Automatically creates the CSV file with a header if it does not exist.
- Appends new metadata entries to the existing file.
### Setup
Required config:
- csv_file: Path to the CSV file where results will be stored (default: "db.csv").
""",
}

View File

@@ -0,0 +1,34 @@
import os
from loguru import logger
from csv import DictWriter
from dataclasses import asdict
from .. import Database
from ...core import Metadata
class CSVDb(Database):
"""
Outputs results to a CSV file
"""
name = "csv_db"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
self.assert_valid_string("csv_file")
@staticmethod
def configs() -> dict:
return {
"csv_file": {"default": "db.csv", "help": "CSV file name"}
}
def done(self, item: Metadata, cached: bool=False) -> None:
"""archival result ready - should be saved to DB"""
logger.success(f"DONE {item}")
is_empty = not os.path.isfile(self.csv_file) or os.path.getsize(self.csv_file) == 0
with open(self.csv_file, "a", encoding="utf-8") as outf:
writer = DictWriter(outf, fieldnames=asdict(Metadata()))
if is_empty: writer.writeheader()
writer.writerow(asdict(item))

View File

@@ -0,0 +1,21 @@
# TODO merge with feeder manifest?
{
"name": "gsheet_db",
"type": ["database"],
"requires_setup": True,
"external_dependencies": {"python": [" loguru"],
},
"description": """
Handles integration with Google Sheets for tracking archival tasks.
### Features
- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used.
- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns.
- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet.
- Skips redundant updates for empty or invalid data fields.
### Notes
- Currently works only with metadata provided by GsheetFeeder.
- Requires configuration of a linked Google Sheet and appropriate API credentials.
""",
}

View File

@@ -0,0 +1,112 @@
from typing import Union, Tuple
import datetime
from urllib.parse import quote
from loguru import logger
from .. import Database
from ...core import Metadata, Media, ArchivingContext
from ...utils import GWorksheet
class GsheetsDb(Database):
"""
NB: only works if GsheetFeeder is used.
could be updated in the future to support non-GsheetFeeder metadata
"""
name = "gsheet_db"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
@staticmethod
def configs() -> dict:
return {}
def started(self, item: Metadata) -> None:
logger.warning(f"STARTED {item}")
gw, row = self._retrieve_gsheet(item)
gw.set_cell(row, 'status', 'Archive in progress')
def failed(self, item: Metadata, reason:str) -> None:
logger.error(f"FAILED {item}")
self._safe_status_update(item, f'Archive failed {reason}')
def aborted(self, item: Metadata) -> None:
logger.warning(f"ABORTED {item}")
self._safe_status_update(item, '')
def fetch(self, item: Metadata) -> Union[Metadata, bool]:
"""check if the given item has been archived already"""
return False
def done(self, item: Metadata, cached: bool=False) -> None:
"""archival result ready - should be saved to DB"""
logger.success(f"DONE {item.get_url()}")
gw, row = self._retrieve_gsheet(item)
# self._safe_status_update(item, 'done')
cell_updates = []
row_values = gw.get_row(row)
def batch_if_valid(col, val, final_value=None):
final_value = final_value or val
try:
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '':
cell_updates.append((row, col, final_value))
except Exception as e:
logger.error(f"Unable to batch {col}={final_value} due to {e}")
status_message = item.status
if cached:
status_message = f"[cached] {status_message}"
cell_updates.append((row, 'status', status_message))
media: Media = item.get_final_media()
if hasattr(media, "urls"):
batch_if_valid('archive', "\n".join(media.urls))
batch_if_valid('date', True, datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat())
batch_if_valid('title', item.get_title())
batch_if_valid('text', item.get("content", ""))
batch_if_valid('timestamp', item.get_timestamp())
if media: batch_if_valid('hash', media.get("hash", "not-calculated"))
# merge all pdq hashes into a single string, if present
pdq_hashes = []
all_media = item.get_all_media()
for m in all_media:
if pdq := m.get("pdq_hash"):
pdq_hashes.append(pdq)
if len(pdq_hashes):
batch_if_valid('pdq_hash', ",".join(pdq_hashes))
if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
batch_if_valid('screenshot', "\n".join(screenshot.urls))
if (thumbnail := item.get_first_image("thumbnail")):
if hasattr(thumbnail, "urls"):
batch_if_valid('thumbnail', f'=IMAGE("{thumbnail.urls[0]}")')
if (browsertrix := item.get_media_by_id("browsertrix")):
batch_if_valid('wacz', "\n".join(browsertrix.urls))
batch_if_valid('replaywebpage', "\n".join([f'https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}' for wacz in browsertrix.urls]))
gw.batch_set_cell(cell_updates)
def _safe_status_update(self, item: Metadata, new_status: str) -> None:
try:
gw, row = self._retrieve_gsheet(item)
gw.set_cell(row, 'status', new_status)
except Exception as e:
logger.debug(f"Unable to update sheet: {e}")
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
# TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
if gsheet := ArchivingContext.get("gsheet"):
gw: GWorksheet = gsheet.get("worksheet")
row: int = gsheet.get("row")
elif self.sheet_id:
print(self.sheet_id)
return gw, row

View File

@@ -2,7 +2,6 @@
"name": "Instagram API Archiver",
"type": ["extractor"],
"entry_point": "instagram_api_archiver:InstagramApiArchiver",
"depends": ["core"],
"external_dependencies":
{"python": ["requests",
"loguru",

View File

@@ -2,7 +2,6 @@
"name": "Instagram Archiver",
"type": ["extractor"],
"entry_point": "instagram_archiver:InstagramArchiver",
"depends": ["core"],
"external_dependencies": {
"python": ["instaloader",
"loguru",],

View File

@@ -2,7 +2,6 @@
"name": "Instagram Telegram Bot Archiver",
"type": ["extractor"],
"entry_point": "instagram_tbot_archiver:InstagramTbotArchiver",
"depends": ["core", "utils"],
"external_dependencies": {"python": ["loguru",
"telethon",],
},

View File

@@ -3,7 +3,6 @@
"type": ["extractor"],
"entry_point": "telegram_archiver:TelegramArchiver",
"requires_setup": False,
"depends": ["core"],
"external_dependencies": {
"python": [
"requests",

View File

@@ -4,7 +4,6 @@
"type": ["extractor"],
"entry_point": "telethon_archiver:TelethonArchiver",
"requires_setup": True,
"depends": [""],
"external_dependencies": {
"python": ["telethon",
"loguru",

View File

@@ -3,7 +3,6 @@
"type": ["extractor"],
"entry_point": "twitter_api_archiver:TwitterApiArchiver",
"requires_setup": True,
"depends": ["core"],
"external_dependencies": {
"python": ["requests",
"loguru",