Manifests for databases

2026-06-11 20:58:29 +03:00 · 2025-01-22 18:18:13 +00:00
parent 54995ad6ab
commit 99c8c69085
22 changed files with 456 additions and 11 deletions
--- a/src/auto_archiver/databases/init.py
+++ b/src/auto_archiver/databases/init.py
@@ -3,8 +3,8 @@

 """
 from .database import Database
-from .gsheet_db import GsheetsDb
-from .console_db import ConsoleDb
-from .csv_db import CSVDb
-from .api_db import AAApiDb
-from .atlos_db import AtlosDb
+from .gsheet_db.gsheet_db import GsheetsDb
+from .console_db.console_db import ConsoleDb
+from .csv_db.csv_db import CSVDb
+from .api_db.api_db import AAApiDb
+from .atlos_db.atlos_db import AtlosDb
--- a/src/auto_archiver/databases/api_db/init.py
+++ b/src/auto_archiver/databases/api_db/init.py
--- a/src/auto_archiver/databases/api_db/manifest.py
+++ b/src/auto_archiver/databases/api_db/manifest.py
@@ -0,0 +1,33 @@
+{
+    "name": "Auto-Archiver API Database",
+    "type": ["database"],
+    "entry_point": "api_db:AAApiDb",
+    "requires_setup": True,
+    "external_dependencies": {
+        "python": ["requests",
+                   "loguru"],
+    },
+    "configs": {
+            "api_endpoint": {"default": None, "help": "API endpoint where calls are made to"},
+            "api_token": {"default": None, "help": "API Bearer token."},
+            "public": {"default": False, "help": "whether the URL should be publicly available via the API"},
+            "author_id": {"default": None, "help": "which email to assign as author"},
+            "group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
+            "allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"},
+            "store_results": {"default": True, "help": "when set, will send the results to the API database."},
+            "tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))},
+        },
+    "description": """
+     Provides integration with the Auto-Archiver API for querying and storing archival data.
+
+### Features
+- **API Integration**: Supports querying for existing archives and submitting results.
+- **Duplicate Prevention**: Avoids redundant archiving when `allow_rearchive` is disabled.
+- **Configurable**: Supports settings like API endpoint, authentication token, tags, and permissions.
+- **Tagging and Metadata**: Adds tags and manages metadata for archives.
+- **Optional Storage**: Archives results conditionally based on configuration.
+
+### Setup
+Requires access to an Auto-Archiver API instance and a valid API token.
+     """,
+}
--- a/src/auto_archiver/databases/api_db/api_db.py
+++ b/src/auto_archiver/databases/api_db/api_db.py
@@ -0,0 +1,70 @@
+from typing import Union
+import requests, os
+from loguru import logger
+
+from .. import Database
+from ...core import Metadata
+
+
+class AAApiDb(Database):
+    """
+        Connects to auto-archiver-api instance
+    """
+    name = "auto_archiver_api_db"
+
+    def __init__(self, config: dict) -> None:
+        # without this STEP.__init__ is not called
+        super().__init__(config)
+        self.allow_rearchive = bool(self.allow_rearchive)
+        self.store_results = bool(self.store_results)
+        self.assert_valid_string("api_endpoint")
+
+    @staticmethod
+    def configs() -> dict:
+        return {
+            "api_endpoint": {"default": None, "help": "API endpoint where calls are made to"},
+            "api_token": {"default": None, "help": "API Bearer token."},
+            "public": {"default": False, "help": "whether the URL should be publicly available via the API"},
+            "author_id": {"default": None, "help": "which email to assign as author"},
+            "group_id": {"default": None, "help": "which group of users have access to the archive in case public=false as author"},
+            "allow_rearchive": {"default": True, "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived"},
+            "store_results": {"default": True, "help": "when set, will send the results to the API database."},
+            "tags": {"default": [], "help": "what tags to add to the archived URL", "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))},
+        }
+    def fetch(self, item: Metadata) -> Union[Metadata, bool]:
+        """ query the database for the existence of this item.
+            Helps avoid re-archiving the same URL multiple times.
+        """
+        if not self.allow_rearchive: return
+        
+        params = {"url": item.get_url(), "limit": 15}
+        headers = {"Authorization": f"Bearer {self.api_token}", "accept": "application/json"}
+        response = requests.get(os.path.join(self.api_endpoint, "tasks/search-url"), params=params, headers=headers)
+
+        if response.status_code == 200:
+            if len(response.json()):
+                logger.success(f"API returned {len(response.json())} previously archived instance(s)")
+                fetched_metadata = [Metadata.from_dict(r["result"]) for r in response.json()]
+                return Metadata.choose_most_complete(fetched_metadata)
+        else:
+            logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
+        return False
+
+
+    def done(self, item: Metadata, cached: bool=False) -> None:
+        """archival result ready - should be saved to DB"""
+        if not self.store_results: return
+        if cached: 
+            logger.debug(f"skipping saving archive of {item.get_url()} to the AA API because it was cached")
+            return
+        logger.debug(f"saving archive of {item.get_url()} to the AA API.")
+
+        payload = {'result': item.to_json(), 'public': self.public, 'author_id': self.author_id, 'group_id': self.group_id, 'tags': list(self.tags)}
+        headers = {"Authorization": f"Bearer {self.api_token}"}
+        response = requests.post(os.path.join(self.api_endpoint, "submit-archive"), json=payload, headers=headers)
+
+        if response.status_code == 200:
+            logger.success(f"AA API: {response.json()}")
+        else:
+            logger.error(f"AA API FAIL ({response.status_code}): {response.json()}")
+
--- a/src/auto_archiver/databases/atlos_db/init.py
+++ b/src/auto_archiver/databases/atlos_db/init.py
--- a/src/auto_archiver/databases/atlos_db/manifest.py
+++ b/src/auto_archiver/databases/atlos_db/manifest.py
@@ -0,0 +1,26 @@
+{
+    "name": "Atlos Database",
+    "type": ["database"],
+    "entry_point": "atlos_db:AtlosDb",
+    "requires_setup": True,
+    "external_dependencies":
+        {"python": ["loguru",
+                    ""],
+         "bin": [""]},
+    "configs": {},
+    "description": """
+Handles integration with the Atlos platform for managing archival results.
+
+### Features
+- Outputs archival results to the Atlos API for storage and tracking.
+- Updates failure status with error details when archiving fails.
+- Processes and formats metadata, including ISO formatting for datetime fields.
+- Skips processing for items without an Atlos ID.
+
+### Setup
+Required configs:
+- atlos_url: Base URL for the Atlos API.
+- api_token: Authentication token for API access.
+"""
+,
+}
--- a/src/auto_archiver/databases/atlos_db/atlos_db.py
+++ b/src/auto_archiver/databases/atlos_db/atlos_db.py
@@ -0,0 +1,79 @@
+import os
+from typing import Union
+from loguru import logger
+from csv import DictWriter
+from dataclasses import asdict
+import requests
+
+from .. import Database
+from ...core import Metadata
+from ...utils import get_atlos_config_options
+
+
+class AtlosDb(Database):
+    """
+    Outputs results to Atlos
+    """
+
+    name = "atlos_db"
+
+    def __init__(self, config: dict) -> None:
+        # without this STEP.__init__ is not called
+        super().__init__(config)
+
+    @staticmethod
+    def configs() -> dict:
+        return get_atlos_config_options()
+
+    def failed(self, item: Metadata, reason: str) -> None:
+        """Update DB accordingly for failure"""
+        # If the item has no Atlos ID, there's nothing for us to do
+        if not item.metadata.get("atlos_id"):
+            logger.info(f"Item {item.get_url()} has no Atlos ID, skipping")
+            return
+
+        requests.post(
+            f"{self.atlos_url}/api/v2/source_material/metadata/{item.metadata['atlos_id']}/auto_archiver",
+            headers={"Authorization": f"Bearer {self.api_token}"},
+            json={"metadata": {"processed": True, "status": "error", "error": reason}},
+        ).raise_for_status()
+        logger.info(
+            f"Stored failure for {item.get_url()} (ID {item.metadata['atlos_id']}) on Atlos: {reason}"
+        )
+
+    def fetch(self, item: Metadata) -> Union[Metadata, bool]:
+        """check and fetch if the given item has been archived already, each
+        database should handle its own caching, and configuration mechanisms"""
+        return False
+
+    def _process_metadata(self, item: Metadata) -> dict:
+        """Process metadata for storage on Atlos. Will convert any datetime
+        objects to ISO format."""
+
+        return {
+            k: v.isoformat() if hasattr(v, "isoformat") else v
+            for k, v in item.metadata.items()
+        }
+
+    def done(self, item: Metadata, cached: bool = False) -> None:
+        """archival result ready - should be saved to DB"""
+
+        if not item.metadata.get("atlos_id"):
+            logger.info(f"Item {item.get_url()} has no Atlos ID, skipping")
+            return
+
+        requests.post(
+            f"{self.atlos_url}/api/v2/source_material/metadata/{item.metadata['atlos_id']}/auto_archiver",
+            headers={"Authorization": f"Bearer {self.api_token}"},
+            json={
+                "metadata": dict(
+                    processed=True,
+                    status="success",
+                    results=self._process_metadata(item),
+                )
+            },
+        ).raise_for_status()
+
+        logger.info(
+            f"Stored success for {item.get_url()} (ID {item.metadata['atlos_id']}) on Atlos"
+        )
--- a/src/auto_archiver/databases/console_db/init.py
+++ b/src/auto_archiver/databases/console_db/init.py
--- a/src/auto_archiver/databases/console_db/manifest.py
+++ b/src/auto_archiver/databases/console_db/manifest.py
@@ -0,0 +1,22 @@
+{
+    "name": "Console Database",
+    "type": ["database"],
+    "requires_setup": False,
+    "external_dependencies": {
+        "python": ["loguru"],
+    },
+    "description": """
+Provides a simple database implementation that outputs archival results and status updates to the console.
+
+### Features
+- Logs the status of archival tasks directly to the console, including:
+  - started
+  - failed (with error details)
+  - aborted
+  - done (with optional caching status)
+- Useful for debugging or lightweight setups where no external database is required.
+
+### Setup
+No additional configuration is required.
+""",
+}
--- a/src/auto_archiver/databases/console_db/console_db.py
+++ b/src/auto_archiver/databases/console_db/console_db.py
@@ -0,0 +1,32 @@
+from loguru import logger
+
+from .. import Database
+from ...core import Metadata
+
+
+class ConsoleDb(Database):
+    """
+        Outputs results to the console
+    """
+    name = "console_db"
+
+    def __init__(self, config: dict) -> None:
+        # without this STEP.__init__ is not called
+        super().__init__(config)
+
+    @staticmethod
+    def configs() -> dict:
+        return {}
+
+    def started(self, item: Metadata) -> None:
+        logger.warning(f"STARTED {item}")
+
+    def failed(self, item: Metadata, reason:str) -> None:
+        logger.error(f"FAILED {item}: {reason}")
+
+    def aborted(self, item: Metadata) -> None:
+        logger.warning(f"ABORTED {item}")
+
+    def done(self, item: Metadata, cached: bool=False) -> None:
+        """archival result ready - should be saved to DB"""
+        logger.success(f"DONE {item}")
--- a/src/auto_archiver/databases/csv_db/init.py
+++ b/src/auto_archiver/databases/csv_db/init.py
--- a/src/auto_archiver/databases/csv_db/manifest.py
+++ b/src/auto_archiver/databases/csv_db/manifest.py
@@ -0,0 +1,22 @@
+{
+    "name": "csv_db",
+    "type": ["database"],
+    "requires_setup": False,
+    "external_dependencies": {"python": ["loguru"]
+                              },
+    "configs": {
+            "csv_file": {"default": "db.csv", "help": "CSV file name"}
+        },
+    "description": """
+Handles exporting archival results to a CSV file.
+
+### Features
+- Saves archival metadata as rows in a CSV file.
+- Automatically creates the CSV file with a header if it does not exist.
+- Appends new metadata entries to the existing file.
+
+### Setup
+Required config:
+- csv_file: Path to the CSV file where results will be stored (default: "db.csv").
+""",
+}
--- a/src/auto_archiver/databases/csv_db/csv_db.py
+++ b/src/auto_archiver/databases/csv_db/csv_db.py
@@ -0,0 +1,34 @@
+import os
+from loguru import logger
+from csv import DictWriter
+from dataclasses import asdict
+
+from .. import Database
+from ...core import Metadata
+
+
+class CSVDb(Database):
+    """
+        Outputs results to a CSV file
+    """
+    name = "csv_db"
+
+    def __init__(self, config: dict) -> None:
+        # without this STEP.__init__ is not called
+        super().__init__(config)
+        self.assert_valid_string("csv_file")
+
+    @staticmethod
+    def configs() -> dict:
+        return {
+            "csv_file": {"default": "db.csv", "help": "CSV file name"}
+        }
+
+    def done(self, item: Metadata, cached: bool=False) -> None:
+        """archival result ready - should be saved to DB"""
+        logger.success(f"DONE {item}")
+        is_empty = not os.path.isfile(self.csv_file) or os.path.getsize(self.csv_file) == 0
+        with open(self.csv_file, "a", encoding="utf-8") as outf:
+            writer = DictWriter(outf, fieldnames=asdict(Metadata()))
+            if is_empty: writer.writeheader()
+            writer.writerow(asdict(item))
--- a/src/auto_archiver/databases/gsheet_db/init.py
+++ b/src/auto_archiver/databases/gsheet_db/init.py
--- a/src/auto_archiver/databases/gsheet_db/manifest.py
+++ b/src/auto_archiver/databases/gsheet_db/manifest.py
@@ -0,0 +1,21 @@
+# TODO merge with feeder manifest?
+{
+    "name": "gsheet_db",
+    "type": ["database"],
+    "requires_setup": True,
+    "external_dependencies": {"python": [" loguru"],
+                              },
+    "description": """
+Handles integration with Google Sheets for tracking archival tasks.
+
+### Features
+- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used.
+- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns.
+- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet.
+- Skips redundant updates for empty or invalid data fields.
+
+### Notes
+- Currently works only with metadata provided by GsheetFeeder. 
+- Requires configuration of a linked Google Sheet and appropriate API credentials.
+""",
+}
--- a/src/auto_archiver/databases/gsheet_db/gsheet_db.py
+++ b/src/auto_archiver/databases/gsheet_db/gsheet_db.py
@@ -0,0 +1,112 @@
+from typing import Union, Tuple
+import datetime
+from urllib.parse import quote
+
+from loguru import logger
+
+from .. import Database
+from ...core import Metadata, Media, ArchivingContext
+from ...utils import GWorksheet
+
+
+class GsheetsDb(Database):
+    """
+        NB: only works if GsheetFeeder is used. 
+        could be updated in the future to support non-GsheetFeeder metadata 
+    """
+    name = "gsheet_db"
+
+    def __init__(self, config: dict) -> None:
+        # without this STEP.__init__ is not called
+        super().__init__(config)
+
+    @staticmethod
+    def configs() -> dict:
+        return {}
+
+    def started(self, item: Metadata) -> None:
+        logger.warning(f"STARTED {item}")
+        gw, row = self._retrieve_gsheet(item)
+        gw.set_cell(row, 'status', 'Archive in progress')
+
+    def failed(self, item: Metadata, reason:str) -> None:
+        logger.error(f"FAILED {item}")
+        self._safe_status_update(item, f'Archive failed {reason}')
+
+    def aborted(self, item: Metadata) -> None:
+        logger.warning(f"ABORTED {item}")
+        self._safe_status_update(item, '')
+
+    def fetch(self, item: Metadata) -> Union[Metadata, bool]:
+        """check if the given item has been archived already"""
+        return False
+
+    def done(self, item: Metadata, cached: bool=False) -> None:
+        """archival result ready - should be saved to DB"""
+        logger.success(f"DONE {item.get_url()}")
+        gw, row = self._retrieve_gsheet(item)
+        # self._safe_status_update(item, 'done')
+
+        cell_updates = []
+        row_values = gw.get_row(row)
+
+        def batch_if_valid(col, val, final_value=None):
+            final_value = final_value or val
+            try:
+                if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '':
+                    cell_updates.append((row, col, final_value))
+            except Exception as e:
+                logger.error(f"Unable to batch {col}={final_value} due to {e}")
+        status_message = item.status
+        if cached:
+            status_message = f"[cached] {status_message}"
+        cell_updates.append((row, 'status', status_message))
+
+        media: Media = item.get_final_media()
+        if hasattr(media, "urls"):
+            batch_if_valid('archive', "\n".join(media.urls))
+        batch_if_valid('date', True, datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat())
+        batch_if_valid('title', item.get_title())
+        batch_if_valid('text', item.get("content", ""))
+        batch_if_valid('timestamp', item.get_timestamp())
+        if media: batch_if_valid('hash', media.get("hash", "not-calculated"))
+
+        # merge all pdq hashes into a single string, if present
+        pdq_hashes = []
+        all_media = item.get_all_media()
+        for m in all_media:
+            if pdq := m.get("pdq_hash"):
+                pdq_hashes.append(pdq)
+        if len(pdq_hashes):
+            batch_if_valid('pdq_hash', ",".join(pdq_hashes))
+
+        if (screenshot := item.get_media_by_id("screenshot")) and hasattr(screenshot, "urls"):
+            batch_if_valid('screenshot', "\n".join(screenshot.urls))
+
+        if (thumbnail := item.get_first_image("thumbnail")):
+            if hasattr(thumbnail, "urls"):
+                batch_if_valid('thumbnail', f'=IMAGE("{thumbnail.urls[0]}")')
+
+        if (browsertrix := item.get_media_by_id("browsertrix")):
+            batch_if_valid('wacz', "\n".join(browsertrix.urls))
+            batch_if_valid('replaywebpage', "\n".join([f'https://replayweb.page/?source={quote(wacz)}#view=pages&url={quote(item.get_url())}' for wacz in browsertrix.urls]))
+
+        gw.batch_set_cell(cell_updates)
+
+    def _safe_status_update(self, item: Metadata, new_status: str) -> None:
+        try:
+            gw, row = self._retrieve_gsheet(item)
+            gw.set_cell(row, 'status', new_status)
+        except Exception as e:
+            logger.debug(f"Unable to update sheet: {e}")
+
+    def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
+        # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
+        if gsheet := ArchivingContext.get("gsheet"):
+            gw: GWorksheet = gsheet.get("worksheet")
+            row: int = gsheet.get("row")
+        elif self.sheet_id:
+            print(self.sheet_id)
+
+
+        return gw, row
--- a/src/auto_archiver/modules/instagram_api_archiver/manifest.py
+++ b/src/auto_archiver/modules/instagram_api_archiver/manifest.py
@@ -2,7 +2,6 @@
    "name": "Instagram API Archiver",
    "type": ["extractor"],
    "entry_point": "instagram_api_archiver:InstagramApiArchiver",
-    "depends": ["core"],
    "external_dependencies":
        {"python": ["requests",
                    "loguru",
--- a/src/auto_archiver/modules/instagram_archiver/manifest.py
+++ b/src/auto_archiver/modules/instagram_archiver/manifest.py
@@ -2,7 +2,6 @@
    "name": "Instagram Archiver",
    "type": ["extractor"],
    "entry_point": "instagram_archiver:InstagramArchiver",
-    "depends": ["core"],
    "external_dependencies": {
        "python": ["instaloader",
                   "loguru",],
--- a/src/auto_archiver/modules/instagram_tbot_archiver/manifest.py
+++ b/src/auto_archiver/modules/instagram_tbot_archiver/manifest.py
@@ -2,7 +2,6 @@
    "name": "Instagram Telegram Bot Archiver",
    "type": ["extractor"],
    "entry_point": "instagram_tbot_archiver:InstagramTbotArchiver",
-    "depends": ["core", "utils"],
    "external_dependencies": {"python": ["loguru",
                                         "telethon",],
                              },
--- a/src/auto_archiver/modules/telegram_archiver/manifest.py
+++ b/src/auto_archiver/modules/telegram_archiver/manifest.py
@@ -3,7 +3,6 @@
    "type": ["extractor"],
    "entry_point": "telegram_archiver:TelegramArchiver",
    "requires_setup": False,
-    "depends": ["core"],
    "external_dependencies": {
        "python": [
            "requests",
--- a/src/auto_archiver/modules/telethon_archiver/manifest.py
+++ b/src/auto_archiver/modules/telethon_archiver/manifest.py
@@ -4,7 +4,6 @@
    "type": ["extractor"],
    "entry_point": "telethon_archiver:TelethonArchiver",
    "requires_setup": True,
-    "depends": [""],
    "external_dependencies": {
        "python": ["telethon",
                   "loguru",
--- a/src/auto_archiver/modules/twitter_api_archiver/manifest.py
+++ b/src/auto_archiver/modules/twitter_api_archiver/manifest.py
@@ -3,7 +3,6 @@
    "type": ["extractor"],
    "entry_point": "twitter_api_archiver:TwitterApiArchiver",
    "requires_setup": True,
-    "depends": ["core"],
    "external_dependencies": {
        "python": ["requests",
                   "loguru",