Merge pull request #71 from bellingcat/update-aa-1.0.0

Update aa 1.0.0
2026-06-14 22:48:35 +03:00 · 2025-04-03 19:48:51 +01:00
parent de6800ea54 65d63b3770
commit 3f0a2fb8de
10 changed files with 748 additions and 674 deletions
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ To properly set up the API you need to install `docker` and to have these files,
 2. a `user-groups.yaml` to manage user permissions
  1. note that all local files referenced in `user-groups.yaml` and any orchestration.yaml files should be relative to the home directory so if your service account is in `secrets/orchestration.yaml` use that path and not just `orchestration.yaml`.
  2. go through the example file and configure it according to your needs.
-3. you will need to create and reference at least one `secrets/orchestration.yaml` file, you can do so by following the instructions in the [auto-archiver](https://github.com/bellingcat/auto-archiver#installation) that automatically generates one for you. If you use the archive sheets feature you will need to create a `orchestrationsheets-sheets.yaml` file as well that should have the `gsheet_feeder` and `gsheet_db` enabled and configured, the auto-archiver has [extensive documentation](https://auto-archiver.readthedocs.io/en/latest/) on how to set this up.
+3. you will need to create and reference at least one `secrets/orchestration.yaml` file; you can do so by following the instructions in the [auto-archiver](https://github.com/bellingcat/auto-archiver#installation), which automatically generates one for you. If you use the archive sheets feature, you will also need to create an `orchestrationsheets-sheets.yaml` file with the `gsheet_feeder_db` feeder and database enabled and configured; the auto-archiver has [extensive documentation](https://auto-archiver.readthedocs.io/en/latest/) on how to set this up.

 Do not commit those files, they are .gitignored by default.
 We also advise you to keep any sensitive files in the `secrets/` folder which is pinned and gitignored.
--- a/app/shared/constants.py
+++ b/app/shared/constants.py
@@ -2,3 +2,6 @@
 STATUS_FAILURE = "FAILURE"
 STATUS_PENDING = "PENDING"
 STATUS_SUCCESS = "SUCCESS"
+
+# AA CLI CONFIGS
+SHEET_ID = "--gsheet_feeder_db.sheet_id"
--- a/app/shared/user_groups.py
+++ b/app/shared/user_groups.py
@@ -94,20 +94,31 @@ class GroupPermissions(BaseModel):

 class GroupModel(BaseModel):
    description: str
-    orchestrator: str
-    orchestrator_sheet: str
+    orchestrator: str | None = None
+    orchestrator_sheet: str | None = None
    permissions: GroupPermissions

    @classmethod
-    @field_validator("orchestrator", "orchestrator_sheet", mode="before")
+    @field_validator("orchestrator", mode="before")
    def validate_orchestrator(cls, v):
-        if not os.path.exists(v):
+        # orchestrator is only needed if the group has archive_url permission
+        if cls.permissions.archive_url and not os.path.exists(v):
+            raise ValueError(f"Orchestrator file not found with this path: {v}")
+        return v
+
+    @classmethod
+    @field_validator("orchestrator_sheet", mode="before")
+    def validate_orchestrator_sheet(cls, v):
+        # orchestrator_sheet is only needed if the group has archive_sheet permission
+        if cls.permissions.archive_sheet and not os.path.exists(v):
            raise ValueError(f"Orchestrator file not found with this path: {v}")
        return v

    @computed_field
    @property
    def service_account_email(self) -> str:
+        if self.orchestrator_sheet is None:
+            return ""
        if hasattr(self, "_service_account_email"):
            return self._service_account_email
        orch = yaml.safe_load(open(self.orchestrator_sheet))
--- a/app/tests/orchestration.test.yaml
+++ b/app/tests/orchestration.test.yaml
@@ -1,7 +1,8 @@
 steps:
-  feeder: cli_feeder
+  feeders:
+    - cli_feeder
  archivers: # order matters
-    - youtubedl_archiver
+    - generic_extractor
  enrichers:
    - hash_enricher

@@ -12,7 +13,7 @@ steps:
    - console_db

 configurations:
-  gsheet_feeder:
+  gsheet_feeder_db:
    service_account: "app/tests/fake_service_account.json"
  cli_feeder:
    urls:
--- a/app/tests/worker/test_worker_main.py
+++ b/app/tests/worker/test_worker_main.py
@@ -4,7 +4,7 @@ from unittest.mock import patch
 import pytest
 from auto_archiver.core import Media, Metadata

-from app.shared import schemas
+from app.shared import constants, schemas
 from app.shared.db import models
 from app.web.utils.misc import get_all_urls
 from app.worker.main import create_archive_task, create_sheet_task
@@ -119,7 +119,7 @@ class TestCreateSheetTask:
        res = create_sheet_task(self.sheet.model_dump_json())

        m_args.assert_called_once_with(
-            "interstellar", True, ["--gsheet_feeder.sheet_id", "123"]
+            "interstellar", True, [constants.SHEET_ID, "123"]
        )
        m_orchestrator.return_value.setup.assert_called_once()
        m_orchestrator.return_value.feed.assert_called_once()
--- a/app/web/config.py
+++ b/app/web/config.py
@@ -1,4 +1,4 @@
-VERSION = "0.9.4"
+VERSION = "0.10.0"

 API_DESCRIPTION = """
 #### API for the Auto-Archiver project, a tool to archive web pages and Google Sheets.
--- a/app/web/security.py
+++ b/app/web/security.py
@@ -84,8 +84,9 @@ def authenticate_user(access_token) -> (bool, str):
    if FIREBASE_OAUTH_ENABLED:
        try:
            return firebase_login_attempt(access_token)
-        except exceptions.FirebaseError as e:
-            logger.warning(f"Error verifying ID token: {str(e)[:80]}...")
+        except exceptions.FirebaseError:
+            # used a non-Firebase token, fallback to Google OAuth
+            pass

    # https://cloud.google.com/docs/authentication/token-types#access
    if not isinstance(access_token, str) or len(access_token) < 10:
--- a/app/worker/main.py
+++ b/app/worker/main.py
@@ -7,7 +7,7 @@ from celery.signals import task_failure
 from loguru import logger
 from sqlalchemy import exc

-from app.shared import business_logic, schemas
+from app.shared import business_logic, constants, schemas
 from app.shared.db import models, worker_crud
 from app.shared.db.database import get_db
 from app.shared.log import log_error
@@ -25,10 +25,7 @@ Redis = get_redis()
 USER_GROUPS_FILENAME = settings.USER_GROUPS_FILENAME

 setup_celery_logger(celery)
-
-# TODO: these are temporary PATCHES for new aa's functionality
-# logger.add("app/worker/worker_log.log", level="DEBUG")
-logger.remove = lambda x: print(f"logger.remove({x})")
+AA_LOGGER_ID = None


 # TODO: after release, as it requires updating past entries with sheet_id where tag
@@ -41,14 +38,19 @@ logger.remove = lambda x: print(f"logger.remove({x})")
    retry_kwargs={"max_retries": 1},
 )
 def create_archive_task(self, archive_json: str):
+    global AA_LOGGER_ID
    archive = schemas.ArchiveCreate.model_validate_json(archive_json)

    # call auto-archiver
    args = get_orchestrator_args(archive.group_id, False, [archive.url])
+    result = None
    try:
        orchestrator = ArchivingOrchestrator()
+        orchestrator.logger_id = AA_LOGGER_ID  # ensure single logger
        orchestrator.setup(args)
-        result = next(orchestrator.feed())
+        AA_LOGGER_ID = orchestrator.logger_id
+        for orch_res in orchestrator.feed():
+            result = orch_res
    except SystemExit as e:
        log_error(e, "create_archive_task: SystemExit from AA")
    except Exception as e:
@@ -68,6 +70,7 @@ def create_archive_task(self, archive_json: str):

@celery.task(name="create_sheet_task", bind=True)
 def create_sheet_task(self, sheet_json: str):
+    global AA_LOGGER_ID
    sheet = schemas.SubmitSheet.model_validate_json(sheet_json)
    queue_name = (create_sheet_task.request.delivery_info or {}).get(
        "routing_key", "unknown"
@@ -75,10 +78,12 @@ def create_sheet_task(self, sheet_json: str):
    logger.info(f"[queue={queue_name}] SHEET START {sheet=}")

    args = get_orchestrator_args(
-        sheet.group_id, True, ["--gsheet_feeder.sheet_id", sheet.sheet_id]
+        sheet.group_id, True, [constants.SHEET_ID, sheet.sheet_id]
    )
    orchestrator = ArchivingOrchestrator()
+    orchestrator.logger_id = AA_LOGGER_ID  # ensure single logger
    orchestrator.setup(args)
+    AA_LOGGER_ID = orchestrator.logger_id

    stats = {"archived": 0, "failed": 0, "errors": []}
    try:
@@ -128,8 +133,7 @@ def create_sheet_task(self, sheet_json: str):
 def get_orchestrator_args(
    group_id: str, orchestrator_for_sheet: bool, cli_args: list = None
 ) -> list:
-    if cli_args is None:
-        cli_args = []
+    cli_args.append("--logging.enabled=false")

    aa_configs = []
    with get_db() as session:
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,7 +22,6 @@ requires-python = ">=3.10,<3.13"

 dependencies = [
    "auto-archiver (>=0.13.1)",
-    "oscrypto @ git+https://github.com/wbond/oscrypto.git@d5f3437ed24257895ae1edd9e503cfb352e635a8",
    "celery (>=5.0)",
    "redis (==3.5.3)",
    "loguru (>=0.7.3,<0.8.0)",