Merge pull request #71 from bellingcat/update-aa-1.0.0

Update aa 1.0.0
This commit is contained in:
Miguel Sozinho Ramalho
2025-04-03 19:48:51 +01:00
committed by GitHub
10 changed files with 748 additions and 674 deletions

View File

@@ -12,7 +12,7 @@ To properly set up the API you need to install `docker` and to have these files,
2. a `user-groups.yaml` to manage user permissions 2. a `user-groups.yaml` to manage user permissions
1. note that all local files referenced in `user-groups.yaml` and in any `orchestration.yaml` files should be relative to the home directory, so if your service account is in `secrets/orchestration.yaml`, use that path and not just `orchestration.yaml`. 1. note that all local files referenced in `user-groups.yaml` and in any `orchestration.yaml` files should be relative to the home directory, so if your service account is in `secrets/orchestration.yaml`, use that path and not just `orchestration.yaml`.
2. go through the example file and configure it according to your needs. 2. go through the example file and configure it according to your needs.
3. you will need to create and reference at least one `secrets/orchestration.yaml` file; you can do so by following the instructions in the [auto-archiver](https://github.com/bellingcat/auto-archiver#installation) that automatically generates one for you. If you use the archive sheets feature, you will need to create an `orchestrationsheets-sheets.yaml` file as well, which should have the `gsheet_feeder` and `gsheet_db` enabled and configured; the auto-archiver has [extensive documentation](https://auto-archiver.readthedocs.io/en/latest/) on how to set this up. 3. you will need to create and reference at least one `secrets/orchestration.yaml` file; you can do so by following the instructions in the [auto-archiver](https://github.com/bellingcat/auto-archiver#installation) that automatically generates one for you. If you use the archive sheets feature, you will need to create an `orchestrationsheets-sheets.yaml` file as well, which should have the `gsheet_feeder_db` feeder and database enabled and configured; the auto-archiver has [extensive documentation](https://auto-archiver.readthedocs.io/en/latest/) on how to set this up.
Do not commit those files, they are .gitignored by default. Do not commit those files, they are .gitignored by default.
We also advise you to keep any sensitive files in the `secrets/` folder which is pinned and gitignored. We also advise you to keep any sensitive files in the `secrets/` folder which is pinned and gitignored.

View File

@@ -2,3 +2,6 @@
STATUS_FAILURE = "FAILURE" STATUS_FAILURE = "FAILURE"
STATUS_PENDING = "PENDING" STATUS_PENDING = "PENDING"
STATUS_SUCCESS = "SUCCESS" STATUS_SUCCESS = "SUCCESS"
# AA CLI CONFIGS
SHEET_ID = "--gsheet_feeder_db.sheet_id"

View File

@@ -94,20 +94,31 @@ class GroupPermissions(BaseModel):
class GroupModel(BaseModel): class GroupModel(BaseModel):
description: str description: str
orchestrator: str orchestrator: str | None = None
orchestrator_sheet: str orchestrator_sheet: str | None = None
permissions: GroupPermissions permissions: GroupPermissions
@classmethod @classmethod
@field_validator("orchestrator", "orchestrator_sheet", mode="before") @field_validator("orchestrator", mode="before")
def validate_orchestrator(cls, v): def validate_orchestrator(cls, v):
if not os.path.exists(v): # orchestrator is only needed if the group has archive_url permission
if cls.permissions.archive_url and not os.path.exists(v):
raise ValueError(f"Orchestrator file not found with this path: {v}")
return v
@classmethod
@field_validator("orchestrator_sheet", mode="before")
def validate_orchestrator_sheet(cls, v):
# orchestrator_sheet is only needed if the group has archive_sheet permission
if cls.permissions.archive_sheet and not os.path.exists(v):
raise ValueError(f"Orchestrator file not found with this path: {v}") raise ValueError(f"Orchestrator file not found with this path: {v}")
return v return v
@computed_field @computed_field
@property @property
def service_account_email(self) -> str: def service_account_email(self) -> str:
if self.orchestrator_sheet is None:
return ""
if hasattr(self, "_service_account_email"): if hasattr(self, "_service_account_email"):
return self._service_account_email return self._service_account_email
orch = yaml.safe_load(open(self.orchestrator_sheet)) orch = yaml.safe_load(open(self.orchestrator_sheet))

View File

@@ -1,7 +1,8 @@
steps: steps:
feeder: cli_feeder feeders:
- cli_feeder
archivers: # order matters archivers: # order matters
- youtubedl_archiver - generic_extractor
enrichers: enrichers:
- hash_enricher - hash_enricher
@@ -12,7 +13,7 @@ steps:
- console_db - console_db
configurations: configurations:
gsheet_feeder: gsheet_feeder_db:
service_account: "app/tests/fake_service_account.json" service_account: "app/tests/fake_service_account.json"
cli_feeder: cli_feeder:
urls: urls:

View File

@@ -4,7 +4,7 @@ from unittest.mock import patch
import pytest import pytest
from auto_archiver.core import Media, Metadata from auto_archiver.core import Media, Metadata
from app.shared import schemas from app.shared import constants, schemas
from app.shared.db import models from app.shared.db import models
from app.web.utils.misc import get_all_urls from app.web.utils.misc import get_all_urls
from app.worker.main import create_archive_task, create_sheet_task from app.worker.main import create_archive_task, create_sheet_task
@@ -119,7 +119,7 @@ class TestCreateSheetTask:
res = create_sheet_task(self.sheet.model_dump_json()) res = create_sheet_task(self.sheet.model_dump_json())
m_args.assert_called_once_with( m_args.assert_called_once_with(
"interstellar", True, ["--gsheet_feeder.sheet_id", "123"] "interstellar", True, [constants.SHEET_ID, "123"]
) )
m_orchestrator.return_value.setup.assert_called_once() m_orchestrator.return_value.setup.assert_called_once()
m_orchestrator.return_value.feed.assert_called_once() m_orchestrator.return_value.feed.assert_called_once()

View File

@@ -1,4 +1,4 @@
VERSION = "0.9.4" VERSION = "0.10.0"
API_DESCRIPTION = """ API_DESCRIPTION = """
#### API for the Auto-Archiver project, a tool to archive web pages and Google Sheets. #### API for the Auto-Archiver project, a tool to archive web pages and Google Sheets.

View File

@@ -84,8 +84,9 @@ def authenticate_user(access_token) -> (bool, str):
if FIREBASE_OAUTH_ENABLED: if FIREBASE_OAUTH_ENABLED:
try: try:
return firebase_login_attempt(access_token) return firebase_login_attempt(access_token)
except exceptions.FirebaseError as e: except exceptions.FirebaseError:
logger.warning(f"Error verifying ID token: {str(e)[:80]}...") # used a non-Firebase token, fallback to Google OAuth
pass
# https://cloud.google.com/docs/authentication/token-types#access # https://cloud.google.com/docs/authentication/token-types#access
if not isinstance(access_token, str) or len(access_token) < 10: if not isinstance(access_token, str) or len(access_token) < 10:

View File

@@ -7,7 +7,7 @@ from celery.signals import task_failure
from loguru import logger from loguru import logger
from sqlalchemy import exc from sqlalchemy import exc
from app.shared import business_logic, schemas from app.shared import business_logic, constants, schemas
from app.shared.db import models, worker_crud from app.shared.db import models, worker_crud
from app.shared.db.database import get_db from app.shared.db.database import get_db
from app.shared.log import log_error from app.shared.log import log_error
@@ -25,10 +25,7 @@ Redis = get_redis()
USER_GROUPS_FILENAME = settings.USER_GROUPS_FILENAME USER_GROUPS_FILENAME = settings.USER_GROUPS_FILENAME
setup_celery_logger(celery) setup_celery_logger(celery)
AA_LOGGER_ID = None
# TODO: these are temporary PATCHES for new aa's functionality
# logger.add("app/worker/worker_log.log", level="DEBUG")
logger.remove = lambda x: print(f"logger.remove({x})")
# TODO: after release, as it requires updating past entries with sheet_id where tag # TODO: after release, as it requires updating past entries with sheet_id where tag
@@ -41,14 +38,19 @@ logger.remove = lambda x: print(f"logger.remove({x})")
retry_kwargs={"max_retries": 1}, retry_kwargs={"max_retries": 1},
) )
def create_archive_task(self, archive_json: str): def create_archive_task(self, archive_json: str):
global AA_LOGGER_ID
archive = schemas.ArchiveCreate.model_validate_json(archive_json) archive = schemas.ArchiveCreate.model_validate_json(archive_json)
# call auto-archiver # call auto-archiver
args = get_orchestrator_args(archive.group_id, False, [archive.url]) args = get_orchestrator_args(archive.group_id, False, [archive.url])
result = None
try: try:
orchestrator = ArchivingOrchestrator() orchestrator = ArchivingOrchestrator()
orchestrator.logger_id = AA_LOGGER_ID # ensure single logger
orchestrator.setup(args) orchestrator.setup(args)
result = next(orchestrator.feed()) AA_LOGGER_ID = orchestrator.logger_id
for orch_res in orchestrator.feed():
result = orch_res
except SystemExit as e: except SystemExit as e:
log_error(e, "create_archive_task: SystemExit from AA") log_error(e, "create_archive_task: SystemExit from AA")
except Exception as e: except Exception as e:
@@ -68,6 +70,7 @@ def create_archive_task(self, archive_json: str):
@celery.task(name="create_sheet_task", bind=True) @celery.task(name="create_sheet_task", bind=True)
def create_sheet_task(self, sheet_json: str): def create_sheet_task(self, sheet_json: str):
global AA_LOGGER_ID
sheet = schemas.SubmitSheet.model_validate_json(sheet_json) sheet = schemas.SubmitSheet.model_validate_json(sheet_json)
queue_name = (create_sheet_task.request.delivery_info or {}).get( queue_name = (create_sheet_task.request.delivery_info or {}).get(
"routing_key", "unknown" "routing_key", "unknown"
@@ -75,10 +78,12 @@ def create_sheet_task(self, sheet_json: str):
logger.info(f"[queue={queue_name}] SHEET START {sheet=}") logger.info(f"[queue={queue_name}] SHEET START {sheet=}")
args = get_orchestrator_args( args = get_orchestrator_args(
sheet.group_id, True, ["--gsheet_feeder.sheet_id", sheet.sheet_id] sheet.group_id, True, [constants.SHEET_ID, sheet.sheet_id]
) )
orchestrator = ArchivingOrchestrator() orchestrator = ArchivingOrchestrator()
orchestrator.logger_id = AA_LOGGER_ID # ensure single logger
orchestrator.setup(args) orchestrator.setup(args)
AA_LOGGER_ID = orchestrator.logger_id
stats = {"archived": 0, "failed": 0, "errors": []} stats = {"archived": 0, "failed": 0, "errors": []}
try: try:
@@ -128,8 +133,7 @@ def create_sheet_task(self, sheet_json: str):
def get_orchestrator_args( def get_orchestrator_args(
group_id: str, orchestrator_for_sheet: bool, cli_args: list = None group_id: str, orchestrator_for_sheet: bool, cli_args: list = None
) -> list: ) -> list:
if cli_args is None: cli_args.append("--logging.enabled=false")
cli_args = []
aa_configs = [] aa_configs = []
with get_db() as session: with get_db() as session:

1357
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -22,7 +22,6 @@ requires-python = ">=3.10,<3.13"
dependencies = [ dependencies = [
"auto-archiver (>=0.13.1)", "auto-archiver (>=0.13.1)",
"oscrypto @ git+https://github.com/wbond/oscrypto.git@d5f3437ed24257895ae1edd9e503cfb352e635a8",
"celery (>=5.0)", "celery (>=5.0)",
"redis (==3.5.3)", "redis (==3.5.3)",
"loguru (>=0.7.3,<0.8.0)", "loguru (>=0.7.3,<0.8.0)",