mirror of
https://github.com/bellingcat/auto-archiver-api.git
synced 2026-06-14 22:48:35 +03:00
@@ -12,7 +12,7 @@ To properly set up the API you need to install `docker` and to have these files,
|
||||
2. a `user-groups.yaml` to manage user permissions
|
||||
1. note that all local files referenced in `user-groups.yaml` and in any `orchestration.yaml` files should be given as paths relative to the home directory; so, if your service account is in `secrets/orchestration.yaml`, use that full path and not just `orchestration.yaml`.
|
||||
2. go through the example file and configure it according to your needs.
|
||||
3. you will need to create and reference at least one `secrets/orchestration.yaml` file, you can do so by following the instructions in the [auto-archiver](https://github.com/bellingcat/auto-archiver#installation) that automatically generates one for you. If you use the archive sheets feature you will need to create a `orchestrationsheets-sheets.yaml` file as well that should have the `gsheet_feeder` and `gsheet_db` enabled and configured, the auto-archiver has [extensive documentation](https://auto-archiver.readthedocs.io/en/latest/) on how to set this up.
|
||||
3. you will need to create and reference at least one `secrets/orchestration.yaml` file; you can do so by following the instructions in the [auto-archiver](https://github.com/bellingcat/auto-archiver#installation) repository, as the auto-archiver automatically generates one for you. If you use the archive sheets feature, you will also need to create an `orchestrationsheets-sheets.yaml` file that has the `gsheet_feeder_db` feeder and database enabled and configured; the auto-archiver has [extensive documentation](https://auto-archiver.readthedocs.io/en/latest/) on how to set this up.
|
||||
|
||||
Do not commit those files; they are .gitignored by default.
|
||||
We also advise you to keep any sensitive files in the `secrets/` folder which is pinned and gitignored.
|
||||
|
||||
@@ -2,3 +2,6 @@
|
||||
STATUS_FAILURE = "FAILURE"
|
||||
STATUS_PENDING = "PENDING"
|
||||
STATUS_SUCCESS = "SUCCESS"
|
||||
|
||||
# AA CLI CONFIGS
|
||||
SHEET_ID = "--gsheet_feeder_db.sheet_id"
|
||||
|
||||
@@ -94,20 +94,31 @@ class GroupPermissions(BaseModel):
|
||||
|
||||
class GroupModel(BaseModel):
|
||||
description: str
|
||||
orchestrator: str
|
||||
orchestrator_sheet: str
|
||||
orchestrator: str | None = None
|
||||
orchestrator_sheet: str | None = None
|
||||
permissions: GroupPermissions
|
||||
|
||||
@classmethod
|
||||
@field_validator("orchestrator", "orchestrator_sheet", mode="before")
|
||||
@field_validator("orchestrator", mode="before")
|
||||
def validate_orchestrator(cls, v):
|
||||
if not os.path.exists(v):
|
||||
# orchestrator is only needed if the group has archive_url permission
|
||||
if cls.permissions.archive_url and not os.path.exists(v):
|
||||
raise ValueError(f"Orchestrator file not found with this path: {v}")
|
||||
return v
|
||||
|
||||
@classmethod
|
||||
@field_validator("orchestrator_sheet", mode="before")
|
||||
def validate_orchestrator_sheet(cls, v):
|
||||
# orchestrator_sheet is only needed if the group has archive_sheet permission
|
||||
if cls.permissions.archive_sheet and not os.path.exists(v):
|
||||
raise ValueError(f"Orchestrator file not found with this path: {v}")
|
||||
return v
|
||||
|
||||
@computed_field
|
||||
@property
|
||||
def service_account_email(self) -> str:
|
||||
if self.orchestrator_sheet is None:
|
||||
return ""
|
||||
if hasattr(self, "_service_account_email"):
|
||||
return self._service_account_email
|
||||
orch = yaml.safe_load(open(self.orchestrator_sheet))
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
steps:
|
||||
feeder: cli_feeder
|
||||
feeders:
|
||||
- cli_feeder
|
||||
archivers: # order matters
|
||||
- youtubedl_archiver
|
||||
- generic_extractor
|
||||
enrichers:
|
||||
- hash_enricher
|
||||
|
||||
@@ -12,7 +13,7 @@ steps:
|
||||
- console_db
|
||||
|
||||
configurations:
|
||||
gsheet_feeder:
|
||||
gsheet_feeder_db:
|
||||
service_account: "app/tests/fake_service_account.json"
|
||||
cli_feeder:
|
||||
urls:
|
||||
|
||||
@@ -4,7 +4,7 @@ from unittest.mock import patch
|
||||
import pytest
|
||||
from auto_archiver.core import Media, Metadata
|
||||
|
||||
from app.shared import schemas
|
||||
from app.shared import constants, schemas
|
||||
from app.shared.db import models
|
||||
from app.web.utils.misc import get_all_urls
|
||||
from app.worker.main import create_archive_task, create_sheet_task
|
||||
@@ -119,7 +119,7 @@ class TestCreateSheetTask:
|
||||
res = create_sheet_task(self.sheet.model_dump_json())
|
||||
|
||||
m_args.assert_called_once_with(
|
||||
"interstellar", True, ["--gsheet_feeder.sheet_id", "123"]
|
||||
"interstellar", True, [constants.SHEET_ID, "123"]
|
||||
)
|
||||
m_orchestrator.return_value.setup.assert_called_once()
|
||||
m_orchestrator.return_value.feed.assert_called_once()
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
VERSION = "0.9.4"
|
||||
VERSION = "0.10.0"
|
||||
|
||||
API_DESCRIPTION = """
|
||||
#### API for the Auto-Archiver project, a tool to archive web pages and Google Sheets.
|
||||
|
||||
@@ -84,8 +84,9 @@ def authenticate_user(access_token) -> (bool, str):
|
||||
if FIREBASE_OAUTH_ENABLED:
|
||||
try:
|
||||
return firebase_login_attempt(access_token)
|
||||
except exceptions.FirebaseError as e:
|
||||
logger.warning(f"Error verifying ID token: {str(e)[:80]}...")
|
||||
except exceptions.FirebaseError:
|
||||
# used a non-Firebase token, fallback to Google OAuth
|
||||
pass
|
||||
|
||||
# https://cloud.google.com/docs/authentication/token-types#access
|
||||
if not isinstance(access_token, str) or len(access_token) < 10:
|
||||
|
||||
@@ -7,7 +7,7 @@ from celery.signals import task_failure
|
||||
from loguru import logger
|
||||
from sqlalchemy import exc
|
||||
|
||||
from app.shared import business_logic, schemas
|
||||
from app.shared import business_logic, constants, schemas
|
||||
from app.shared.db import models, worker_crud
|
||||
from app.shared.db.database import get_db
|
||||
from app.shared.log import log_error
|
||||
@@ -25,10 +25,7 @@ Redis = get_redis()
|
||||
USER_GROUPS_FILENAME = settings.USER_GROUPS_FILENAME
|
||||
|
||||
setup_celery_logger(celery)
|
||||
|
||||
# TODO: these are temporary PATCHES for new aa's functionality
|
||||
# logger.add("app/worker/worker_log.log", level="DEBUG")
|
||||
logger.remove = lambda x: print(f"logger.remove({x})")
|
||||
AA_LOGGER_ID = None
|
||||
|
||||
|
||||
# TODO: after release, as it requires updating past entries with sheet_id where tag
|
||||
@@ -41,14 +38,19 @@ logger.remove = lambda x: print(f"logger.remove({x})")
|
||||
retry_kwargs={"max_retries": 1},
|
||||
)
|
||||
def create_archive_task(self, archive_json: str):
|
||||
global AA_LOGGER_ID
|
||||
archive = schemas.ArchiveCreate.model_validate_json(archive_json)
|
||||
|
||||
# call auto-archiver
|
||||
args = get_orchestrator_args(archive.group_id, False, [archive.url])
|
||||
result = None
|
||||
try:
|
||||
orchestrator = ArchivingOrchestrator()
|
||||
orchestrator.logger_id = AA_LOGGER_ID # ensure single logger
|
||||
orchestrator.setup(args)
|
||||
result = next(orchestrator.feed())
|
||||
AA_LOGGER_ID = orchestrator.logger_id
|
||||
for orch_res in orchestrator.feed():
|
||||
result = orch_res
|
||||
except SystemExit as e:
|
||||
log_error(e, "create_archive_task: SystemExit from AA")
|
||||
except Exception as e:
|
||||
@@ -68,6 +70,7 @@ def create_archive_task(self, archive_json: str):
|
||||
|
||||
@celery.task(name="create_sheet_task", bind=True)
|
||||
def create_sheet_task(self, sheet_json: str):
|
||||
global AA_LOGGER_ID
|
||||
sheet = schemas.SubmitSheet.model_validate_json(sheet_json)
|
||||
queue_name = (create_sheet_task.request.delivery_info or {}).get(
|
||||
"routing_key", "unknown"
|
||||
@@ -75,10 +78,12 @@ def create_sheet_task(self, sheet_json: str):
|
||||
logger.info(f"[queue={queue_name}] SHEET START {sheet=}")
|
||||
|
||||
args = get_orchestrator_args(
|
||||
sheet.group_id, True, ["--gsheet_feeder.sheet_id", sheet.sheet_id]
|
||||
sheet.group_id, True, [constants.SHEET_ID, sheet.sheet_id]
|
||||
)
|
||||
orchestrator = ArchivingOrchestrator()
|
||||
orchestrator.logger_id = AA_LOGGER_ID # ensure single logger
|
||||
orchestrator.setup(args)
|
||||
AA_LOGGER_ID = orchestrator.logger_id
|
||||
|
||||
stats = {"archived": 0, "failed": 0, "errors": []}
|
||||
try:
|
||||
@@ -128,8 +133,7 @@ def create_sheet_task(self, sheet_json: str):
|
||||
def get_orchestrator_args(
|
||||
group_id: str, orchestrator_for_sheet: bool, cli_args: list = None
|
||||
) -> list:
|
||||
if cli_args is None:
|
||||
cli_args = []
|
||||
cli_args.append("--logging.enabled=false")
|
||||
|
||||
aa_configs = []
|
||||
with get_db() as session:
|
||||
|
||||
1357
poetry.lock
generated
1357
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -22,7 +22,6 @@ requires-python = ">=3.10,<3.13"
|
||||
|
||||
dependencies = [
|
||||
"auto-archiver (>=0.13.1)",
|
||||
"oscrypto @ git+https://github.com/wbond/oscrypto.git@d5f3437ed24257895ae1edd9e503cfb352e635a8",
|
||||
"celery (>=5.0)",
|
||||
"redis (==3.5.3)",
|
||||
"loguru (>=0.7.3,<0.8.0)",
|
||||
|
||||
Reference in New Issue
Block a user