mirror of
https://github.com/bellingcat/auto-archiver-api.git
synced 2026-06-30 14:18:42 +03:00
@@ -12,7 +12,7 @@ To properly set up the API you need to install `docker` and to have these files,
|
|||||||
2. a `user-groups.yaml` to manage user permissions
|
2. a `user-groups.yaml` to manage user permissions
|
||||||
1. note that the paths of all local files referenced in `user-groups.yaml` and in any orchestration.yaml files should be relative to the home directory, so if your service account is in `secrets/orchestration.yaml`, use that path and not just `orchestration.yaml`.
|
1. note that the paths of all local files referenced in `user-groups.yaml` and in any orchestration.yaml files should be relative to the home directory, so if your service account is in `secrets/orchestration.yaml`, use that path and not just `orchestration.yaml`.
|
||||||
2. go through the example file and configure it according to your needs.
|
2. go through the example file and configure it according to your needs.
|
||||||
3. you will need to create and reference at least one `secrets/orchestration.yaml` file; you can do so by following the instructions in the [auto-archiver](https://github.com/bellingcat/auto-archiver#installation), which automatically generates one for you. If you use the archive sheets feature, you will also need to create an `orchestrationsheets-sheets.yaml` file that has the `gsheet_feeder` and `gsheet_db` enabled and configured; the auto-archiver has [extensive documentation](https://auto-archiver.readthedocs.io/en/latest/) on how to set this up.
|
3. you will need to create and reference at least one `secrets/orchestration.yaml` file; you can do so by following the instructions in the [auto-archiver](https://github.com/bellingcat/auto-archiver#installation), which automatically generates one for you. If you use the archive sheets feature, you will also need to create an `orchestrationsheets-sheets.yaml` file that has the `gsheet_feeder_db` feeder and database enabled and configured; the auto-archiver has [extensive documentation](https://auto-archiver.readthedocs.io/en/latest/) on how to set this up.
|
||||||
|
|
||||||
Do not commit those files; they are .gitignored by default.
|
Do not commit those files; they are .gitignored by default.
|
||||||
We also advise you to keep any sensitive files in the `secrets/` folder, which is pinned and gitignored.
|
We also advise you to keep any sensitive files in the `secrets/` folder, which is pinned and gitignored.
|
||||||
|
|||||||
@@ -2,3 +2,6 @@
|
|||||||
STATUS_FAILURE = "FAILURE"
|
STATUS_FAILURE = "FAILURE"
|
||||||
STATUS_PENDING = "PENDING"
|
STATUS_PENDING = "PENDING"
|
||||||
STATUS_SUCCESS = "SUCCESS"
|
STATUS_SUCCESS = "SUCCESS"
|
||||||
|
|
||||||
|
# AA CLI CONFIGS
|
||||||
|
SHEET_ID = "--gsheet_feeder_db.sheet_id"
|
||||||
|
|||||||
@@ -94,20 +94,31 @@ class GroupPermissions(BaseModel):
|
|||||||
|
|
||||||
class GroupModel(BaseModel):
|
class GroupModel(BaseModel):
|
||||||
description: str
|
description: str
|
||||||
orchestrator: str
|
orchestrator: str | None = None
|
||||||
orchestrator_sheet: str
|
orchestrator_sheet: str | None = None
|
||||||
permissions: GroupPermissions
|
permissions: GroupPermissions
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@field_validator("orchestrator", "orchestrator_sheet", mode="before")
|
@field_validator("orchestrator", mode="before")
|
||||||
def validate_orchestrator(cls, v):
|
def validate_orchestrator(cls, v):
|
||||||
if not os.path.exists(v):
|
# orchestrator is only needed if the group has archive_url permission
|
||||||
|
if cls.permissions.archive_url and not os.path.exists(v):
|
||||||
|
raise ValueError(f"Orchestrator file not found with this path: {v}")
|
||||||
|
return v
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
@field_validator("orchestrator_sheet", mode="before")
|
||||||
|
def validate_orchestrator_sheet(cls, v):
|
||||||
|
# orchestrator_sheet is only needed if the group has archive_sheet permission
|
||||||
|
if cls.permissions.archive_sheet and not os.path.exists(v):
|
||||||
raise ValueError(f"Orchestrator file not found with this path: {v}")
|
raise ValueError(f"Orchestrator file not found with this path: {v}")
|
||||||
return v
|
return v
|
||||||
|
|
||||||
@computed_field
|
@computed_field
|
||||||
@property
|
@property
|
||||||
def service_account_email(self) -> str:
|
def service_account_email(self) -> str:
|
||||||
|
if self.orchestrator_sheet is None:
|
||||||
|
return ""
|
||||||
if hasattr(self, "_service_account_email"):
|
if hasattr(self, "_service_account_email"):
|
||||||
return self._service_account_email
|
return self._service_account_email
|
||||||
orch = yaml.safe_load(open(self.orchestrator_sheet))
|
orch = yaml.safe_load(open(self.orchestrator_sheet))
|
||||||
|
|||||||
@@ -1,7 +1,8 @@
|
|||||||
steps:
|
steps:
|
||||||
feeder: cli_feeder
|
feeders:
|
||||||
|
- cli_feeder
|
||||||
archivers: # order matters
|
archivers: # order matters
|
||||||
- youtubedl_archiver
|
- generic_extractor
|
||||||
enrichers:
|
enrichers:
|
||||||
- hash_enricher
|
- hash_enricher
|
||||||
|
|
||||||
@@ -12,7 +13,7 @@ steps:
|
|||||||
- console_db
|
- console_db
|
||||||
|
|
||||||
configurations:
|
configurations:
|
||||||
gsheet_feeder:
|
gsheet_feeder_db:
|
||||||
service_account: "app/tests/fake_service_account.json"
|
service_account: "app/tests/fake_service_account.json"
|
||||||
cli_feeder:
|
cli_feeder:
|
||||||
urls:
|
urls:
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ from unittest.mock import patch
|
|||||||
import pytest
|
import pytest
|
||||||
from auto_archiver.core import Media, Metadata
|
from auto_archiver.core import Media, Metadata
|
||||||
|
|
||||||
from app.shared import schemas
|
from app.shared import constants, schemas
|
||||||
from app.shared.db import models
|
from app.shared.db import models
|
||||||
from app.web.utils.misc import get_all_urls
|
from app.web.utils.misc import get_all_urls
|
||||||
from app.worker.main import create_archive_task, create_sheet_task
|
from app.worker.main import create_archive_task, create_sheet_task
|
||||||
@@ -119,7 +119,7 @@ class TestCreateSheetTask:
|
|||||||
res = create_sheet_task(self.sheet.model_dump_json())
|
res = create_sheet_task(self.sheet.model_dump_json())
|
||||||
|
|
||||||
m_args.assert_called_once_with(
|
m_args.assert_called_once_with(
|
||||||
"interstellar", True, ["--gsheet_feeder.sheet_id", "123"]
|
"interstellar", True, [constants.SHEET_ID, "123"]
|
||||||
)
|
)
|
||||||
m_orchestrator.return_value.setup.assert_called_once()
|
m_orchestrator.return_value.setup.assert_called_once()
|
||||||
m_orchestrator.return_value.feed.assert_called_once()
|
m_orchestrator.return_value.feed.assert_called_once()
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
VERSION = "0.9.4"
|
VERSION = "0.10.0"
|
||||||
|
|
||||||
API_DESCRIPTION = """
|
API_DESCRIPTION = """
|
||||||
#### API for the Auto-Archiver project, a tool to archive web pages and Google Sheets.
|
#### API for the Auto-Archiver project, a tool to archive web pages and Google Sheets.
|
||||||
|
|||||||
@@ -84,8 +84,9 @@ def authenticate_user(access_token) -> (bool, str):
|
|||||||
if FIREBASE_OAUTH_ENABLED:
|
if FIREBASE_OAUTH_ENABLED:
|
||||||
try:
|
try:
|
||||||
return firebase_login_attempt(access_token)
|
return firebase_login_attempt(access_token)
|
||||||
except exceptions.FirebaseError as e:
|
except exceptions.FirebaseError:
|
||||||
logger.warning(f"Error verifying ID token: {str(e)[:80]}...")
|
# used a non-Firebase token, fallback to Google OAuth
|
||||||
|
pass
|
||||||
|
|
||||||
# https://cloud.google.com/docs/authentication/token-types#access
|
# https://cloud.google.com/docs/authentication/token-types#access
|
||||||
if not isinstance(access_token, str) or len(access_token) < 10:
|
if not isinstance(access_token, str) or len(access_token) < 10:
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ from celery.signals import task_failure
|
|||||||
from loguru import logger
|
from loguru import logger
|
||||||
from sqlalchemy import exc
|
from sqlalchemy import exc
|
||||||
|
|
||||||
from app.shared import business_logic, schemas
|
from app.shared import business_logic, constants, schemas
|
||||||
from app.shared.db import models, worker_crud
|
from app.shared.db import models, worker_crud
|
||||||
from app.shared.db.database import get_db
|
from app.shared.db.database import get_db
|
||||||
from app.shared.log import log_error
|
from app.shared.log import log_error
|
||||||
@@ -25,10 +25,7 @@ Redis = get_redis()
|
|||||||
USER_GROUPS_FILENAME = settings.USER_GROUPS_FILENAME
|
USER_GROUPS_FILENAME = settings.USER_GROUPS_FILENAME
|
||||||
|
|
||||||
setup_celery_logger(celery)
|
setup_celery_logger(celery)
|
||||||
|
AA_LOGGER_ID = None
|
||||||
# TODO: these are temporary PATCHES for new aa's functionality
|
|
||||||
# logger.add("app/worker/worker_log.log", level="DEBUG")
|
|
||||||
logger.remove = lambda x: print(f"logger.remove({x})")
|
|
||||||
|
|
||||||
|
|
||||||
# TODO: after release, as it requires updating past entries with sheet_id where tag
|
# TODO: after release, as it requires updating past entries with sheet_id where tag
|
||||||
@@ -41,14 +38,19 @@ logger.remove = lambda x: print(f"logger.remove({x})")
|
|||||||
retry_kwargs={"max_retries": 1},
|
retry_kwargs={"max_retries": 1},
|
||||||
)
|
)
|
||||||
def create_archive_task(self, archive_json: str):
|
def create_archive_task(self, archive_json: str):
|
||||||
|
global AA_LOGGER_ID
|
||||||
archive = schemas.ArchiveCreate.model_validate_json(archive_json)
|
archive = schemas.ArchiveCreate.model_validate_json(archive_json)
|
||||||
|
|
||||||
# call auto-archiver
|
# call auto-archiver
|
||||||
args = get_orchestrator_args(archive.group_id, False, [archive.url])
|
args = get_orchestrator_args(archive.group_id, False, [archive.url])
|
||||||
|
result = None
|
||||||
try:
|
try:
|
||||||
orchestrator = ArchivingOrchestrator()
|
orchestrator = ArchivingOrchestrator()
|
||||||
|
orchestrator.logger_id = AA_LOGGER_ID # ensure single logger
|
||||||
orchestrator.setup(args)
|
orchestrator.setup(args)
|
||||||
result = next(orchestrator.feed())
|
AA_LOGGER_ID = orchestrator.logger_id
|
||||||
|
for orch_res in orchestrator.feed():
|
||||||
|
result = orch_res
|
||||||
except SystemExit as e:
|
except SystemExit as e:
|
||||||
log_error(e, "create_archive_task: SystemExit from AA")
|
log_error(e, "create_archive_task: SystemExit from AA")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -68,6 +70,7 @@ def create_archive_task(self, archive_json: str):
|
|||||||
|
|
||||||
@celery.task(name="create_sheet_task", bind=True)
|
@celery.task(name="create_sheet_task", bind=True)
|
||||||
def create_sheet_task(self, sheet_json: str):
|
def create_sheet_task(self, sheet_json: str):
|
||||||
|
global AA_LOGGER_ID
|
||||||
sheet = schemas.SubmitSheet.model_validate_json(sheet_json)
|
sheet = schemas.SubmitSheet.model_validate_json(sheet_json)
|
||||||
queue_name = (create_sheet_task.request.delivery_info or {}).get(
|
queue_name = (create_sheet_task.request.delivery_info or {}).get(
|
||||||
"routing_key", "unknown"
|
"routing_key", "unknown"
|
||||||
@@ -75,10 +78,12 @@ def create_sheet_task(self, sheet_json: str):
|
|||||||
logger.info(f"[queue={queue_name}] SHEET START {sheet=}")
|
logger.info(f"[queue={queue_name}] SHEET START {sheet=}")
|
||||||
|
|
||||||
args = get_orchestrator_args(
|
args = get_orchestrator_args(
|
||||||
sheet.group_id, True, ["--gsheet_feeder.sheet_id", sheet.sheet_id]
|
sheet.group_id, True, [constants.SHEET_ID, sheet.sheet_id]
|
||||||
)
|
)
|
||||||
orchestrator = ArchivingOrchestrator()
|
orchestrator = ArchivingOrchestrator()
|
||||||
|
orchestrator.logger_id = AA_LOGGER_ID # ensure single logger
|
||||||
orchestrator.setup(args)
|
orchestrator.setup(args)
|
||||||
|
AA_LOGGER_ID = orchestrator.logger_id
|
||||||
|
|
||||||
stats = {"archived": 0, "failed": 0, "errors": []}
|
stats = {"archived": 0, "failed": 0, "errors": []}
|
||||||
try:
|
try:
|
||||||
@@ -128,8 +133,7 @@ def create_sheet_task(self, sheet_json: str):
|
|||||||
def get_orchestrator_args(
|
def get_orchestrator_args(
|
||||||
group_id: str, orchestrator_for_sheet: bool, cli_args: list = None
|
group_id: str, orchestrator_for_sheet: bool, cli_args: list = None
|
||||||
) -> list:
|
) -> list:
|
||||||
if cli_args is None:
|
cli_args.append("--logging.enabled=false")
|
||||||
cli_args = []
|
|
||||||
|
|
||||||
aa_configs = []
|
aa_configs = []
|
||||||
with get_db() as session:
|
with get_db() as session:
|
||||||
|
|||||||
1357
poetry.lock
generated
1357
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -22,7 +22,6 @@ requires-python = ">=3.10,<3.13"
|
|||||||
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"auto-archiver (>=0.13.1)",
|
"auto-archiver (>=0.13.1)",
|
||||||
"oscrypto @ git+https://github.com/wbond/oscrypto.git@d5f3437ed24257895ae1edd9e503cfb352e635a8",
|
|
||||||
"celery (>=5.0)",
|
"celery (>=5.0)",
|
||||||
"redis (==3.5.3)",
|
"redis (==3.5.3)",
|
||||||
"loguru (>=0.7.3,<0.8.0)",
|
"loguru (>=0.7.3,<0.8.0)",
|
||||||
|
|||||||
Reference in New Issue
Block a user