Merge pull request #71 from bellingcat/update-aa-1.0.0

Update aa 1.0.0
This commit is contained in:
Miguel Sozinho Ramalho
2025-04-03 19:48:51 +01:00
committed by GitHub
10 changed files with 748 additions and 674 deletions

View File

@@ -12,7 +12,7 @@ To properly set up the API you need to install `docker` and to have these files,
2. a `user-groups.yaml` to manage user permissions
1. note that all local files referenced in `user-groups.yaml` and in any `orchestration.yaml` files should use paths relative to the home directory; for example, if your service account is in `secrets/orchestration.yaml`, use that path and not just `orchestration.yaml`.
2. go through the example file and configure it according to your needs.
3. you will need to create and reference at least one `secrets/orchestration.yaml` file, you can do so by following the instructions in the [auto-archiver](https://github.com/bellingcat/auto-archiver#installation) that automatically generates one for you. If you use the archive sheets feature you will need to create a `orchestrationsheets-sheets.yaml` file as well that should have the `gsheet_feeder` and `gsheet_db` enabled and configured, the auto-archiver has [extensive documentation](https://auto-archiver.readthedocs.io/en/latest/) on how to set this up.
3. you will need to create and reference at least one `secrets/orchestration.yaml` file; you can do so by following the installation instructions of the [auto-archiver](https://github.com/bellingcat/auto-archiver#installation), which automatically generates one for you. If you use the archive sheets feature, you will also need to create an `orchestrationsheets-sheets.yaml` file that has the `gsheet_feeder_db` feeder and database enabled and configured; the auto-archiver has [extensive documentation](https://auto-archiver.readthedocs.io/en/latest/) on how to set this up.
Do not commit those files; they are .gitignored by default.
We also advise you to keep any sensitive files in the `secrets/` folder which is pinned and gitignored.

View File

@@ -2,3 +2,6 @@
STATUS_FAILURE = "FAILURE"
STATUS_PENDING = "PENDING"
STATUS_SUCCESS = "SUCCESS"
# AA CLI CONFIGS
SHEET_ID = "--gsheet_feeder_db.sheet_id"

View File

@@ -94,20 +94,31 @@ class GroupPermissions(BaseModel):
class GroupModel(BaseModel):
description: str
orchestrator: str
orchestrator_sheet: str
orchestrator: str | None = None
orchestrator_sheet: str | None = None
permissions: GroupPermissions
@classmethod
@field_validator("orchestrator", "orchestrator_sheet", mode="before")
@field_validator("orchestrator", mode="before")
def validate_orchestrator(cls, v):
if not os.path.exists(v):
# orchestrator is only needed if the group has archive_url permission
if cls.permissions.archive_url and not os.path.exists(v):
raise ValueError(f"Orchestrator file not found with this path: {v}")
return v
@classmethod
@field_validator("orchestrator_sheet", mode="before")
def validate_orchestrator_sheet(cls, v):
# Validator for the `orchestrator_sheet` field (registered with mode="before").
# Raises ValueError when `cls.permissions.archive_sheet` is truthy and
# `os.path.exists(v)` is false; otherwise returns `v` unchanged.
# NOTE(review): `@classmethod` is applied on top of `@field_validator` here;
# the pydantic documentation shows the opposite order -- confirm this
# validator is actually registered and executed.
# NOTE(review): `cls` is the model class, so `cls.permissions` is looked up on
# the class rather than on the group being validated -- confirm it resolves to
# this group's permissions (ValidationInfo or a model-level validator may be needed).
# NOTE(review): the field is declared `str | None`; presumably `v` can be None
# here, in which case `os.path.exists(None)` raises TypeError -- verify.
# orchestrator_sheet is only needed if the group has archive_sheet permission
if cls.permissions.archive_sheet and not os.path.exists(v):
raise ValueError(f"Orchestrator file not found with this path: {v}")
return v
@computed_field
@property
def service_account_email(self) -> str:
if self.orchestrator_sheet is None:
return ""
if hasattr(self, "_service_account_email"):
return self._service_account_email
orch = yaml.safe_load(open(self.orchestrator_sheet))

View File

@@ -1,7 +1,8 @@
steps:
feeder: cli_feeder
feeders:
- cli_feeder
archivers: # order matters
- youtubedl_archiver
- generic_extractor
enrichers:
- hash_enricher
@@ -12,7 +13,7 @@ steps:
- console_db
configurations:
gsheet_feeder:
gsheet_feeder_db:
service_account: "app/tests/fake_service_account.json"
cli_feeder:
urls:

View File

@@ -4,7 +4,7 @@ from unittest.mock import patch
import pytest
from auto_archiver.core import Media, Metadata
from app.shared import schemas
from app.shared import constants, schemas
from app.shared.db import models
from app.web.utils.misc import get_all_urls
from app.worker.main import create_archive_task, create_sheet_task
@@ -119,7 +119,7 @@ class TestCreateSheetTask:
res = create_sheet_task(self.sheet.model_dump_json())
m_args.assert_called_once_with(
"interstellar", True, ["--gsheet_feeder.sheet_id", "123"]
"interstellar", True, [constants.SHEET_ID, "123"]
)
m_orchestrator.return_value.setup.assert_called_once()
m_orchestrator.return_value.feed.assert_called_once()

View File

@@ -1,4 +1,4 @@
VERSION = "0.9.4"
VERSION = "0.10.0"
API_DESCRIPTION = """
#### API for the Auto-Archiver project, a tool to archive web pages and Google Sheets.

View File

@@ -84,8 +84,9 @@ def authenticate_user(access_token) -> (bool, str):
if FIREBASE_OAUTH_ENABLED:
try:
return firebase_login_attempt(access_token)
except exceptions.FirebaseError as e:
logger.warning(f"Error verifying ID token: {str(e)[:80]}...")
except exceptions.FirebaseError:
# used a non-Firebase token, fallback to Google OAuth
pass
# https://cloud.google.com/docs/authentication/token-types#access
if not isinstance(access_token, str) or len(access_token) < 10:

View File

@@ -7,7 +7,7 @@ from celery.signals import task_failure
from loguru import logger
from sqlalchemy import exc
from app.shared import business_logic, schemas
from app.shared import business_logic, constants, schemas
from app.shared.db import models, worker_crud
from app.shared.db.database import get_db
from app.shared.log import log_error
@@ -25,10 +25,7 @@ Redis = get_redis()
USER_GROUPS_FILENAME = settings.USER_GROUPS_FILENAME
setup_celery_logger(celery)
# TODO: these are temporary PATCHES for new aa's functionality
# logger.add("app/worker/worker_log.log", level="DEBUG")
logger.remove = lambda x: print(f"logger.remove({x})")
AA_LOGGER_ID = None
# TODO: after release, as it requires updating past entries with sheet_id where tag
@@ -41,14 +38,19 @@ logger.remove = lambda x: print(f"logger.remove({x})")
retry_kwargs={"max_retries": 1},
)
def create_archive_task(self, archive_json: str):
global AA_LOGGER_ID
archive = schemas.ArchiveCreate.model_validate_json(archive_json)
# call auto-archiver
args = get_orchestrator_args(archive.group_id, False, [archive.url])
result = None
try:
orchestrator = ArchivingOrchestrator()
orchestrator.logger_id = AA_LOGGER_ID # ensure single logger
orchestrator.setup(args)
result = next(orchestrator.feed())
AA_LOGGER_ID = orchestrator.logger_id
for orch_res in orchestrator.feed():
result = orch_res
except SystemExit as e:
log_error(e, "create_archive_task: SystemExit from AA")
except Exception as e:
@@ -68,6 +70,7 @@ def create_archive_task(self, archive_json: str):
@celery.task(name="create_sheet_task", bind=True)
def create_sheet_task(self, sheet_json: str):
global AA_LOGGER_ID
sheet = schemas.SubmitSheet.model_validate_json(sheet_json)
queue_name = (create_sheet_task.request.delivery_info or {}).get(
"routing_key", "unknown"
@@ -75,10 +78,12 @@ def create_sheet_task(self, sheet_json: str):
logger.info(f"[queue={queue_name}] SHEET START {sheet=}")
args = get_orchestrator_args(
sheet.group_id, True, ["--gsheet_feeder.sheet_id", sheet.sheet_id]
sheet.group_id, True, [constants.SHEET_ID, sheet.sheet_id]
)
orchestrator = ArchivingOrchestrator()
orchestrator.logger_id = AA_LOGGER_ID # ensure single logger
orchestrator.setup(args)
AA_LOGGER_ID = orchestrator.logger_id
stats = {"archived": 0, "failed": 0, "errors": []}
try:
@@ -128,8 +133,7 @@ def create_sheet_task(self, sheet_json: str):
def get_orchestrator_args(
group_id: str, orchestrator_for_sheet: bool, cli_args: list = None
) -> list:
if cli_args is None:
cli_args = []
cli_args.append("--logging.enabled=false")
aa_configs = []
with get_db() as session:

1357
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -22,7 +22,6 @@ requires-python = ">=3.10,<3.13"
dependencies = [
"auto-archiver (>=0.13.1)",
"oscrypto @ git+https://github.com/wbond/oscrypto.git@d5f3437ed24257895ae1edd9e503cfb352e635a8",
"celery (>=5.0)",
"redis (==3.5.3)",
"loguru (>=0.7.3,<0.8.0)",