mirror of
https://github.com/bellingcat/auto-archiver-api.git
synced 2026-06-11 13:08:34 +03:00
updates auto-archiver (AA); adds missing dependencies
This commit is contained in:
@@ -2,8 +2,7 @@
|
||||
|
||||
from typing import List
|
||||
from loguru import logger
|
||||
from auto_archiver import Metadata
|
||||
from auto_archiver.core import Media
|
||||
from auto_archiver.core import Media, Metadata
|
||||
|
||||
from app.shared.db import models
|
||||
|
||||
|
||||
@@ -7,8 +7,7 @@ import pytest
|
||||
|
||||
from app.shared.db import models
|
||||
from app.shared import schemas
|
||||
from auto_archiver import Metadata
|
||||
from auto_archiver.core import Media
|
||||
from auto_archiver.core import Media, Metadata
|
||||
|
||||
|
||||
|
||||
@@ -16,22 +15,24 @@ class Test_create_archive_task():
|
||||
URL = "https://example-live.com"
|
||||
archive = schemas.ArchiveCreate(url=URL, tags=["tag-celery"], public=True, author_id="rick@example.com", group_id="interstellar")
|
||||
|
||||
@patch("app.worker.main.ArchivingOrchestrator")
|
||||
@patch("app.worker.main.get_all_urls", return_value=[])
|
||||
@patch("app.worker.main.insert_result_into_db")
|
||||
@patch("app.worker.main.get_store_until", return_value=datetime.now())
|
||||
@patch("app.worker.main.load_orchestrator")
|
||||
@patch("app.worker.main.get_orchestrator_args", return_value=["arg1", "arg2"])
|
||||
@patch("celery.app.task.Task.request")
|
||||
def test_success(self, m_req, m_load, m_store, m_insert, db_session):
|
||||
def test_success(self, m_req, m_args, m_store, m_insert, m_urls, m_orchestrator, db_session):
|
||||
from app.worker.main import create_archive_task
|
||||
|
||||
m_req.id = "this-just-in"
|
||||
mock_orchestrator = self.mock_orchestrator_choice(m_load)
|
||||
m_orchestrator.run.return_value = Metadata().set_url(self.URL).success()
|
||||
|
||||
task = create_archive_task(self.archive.model_dump_json())
|
||||
|
||||
m_load.assert_called_once_with("interstellar")
|
||||
m_args.assert_called_once()
|
||||
m_store.assert_called_once_with("interstellar")
|
||||
m_insert.assert_called_once()
|
||||
mock_orchestrator.feed_item.assert_called_once()
|
||||
m_orchestrator.run.assert_called_once()
|
||||
|
||||
assert task["status"] == "success"
|
||||
assert task["metadata"]["url"] == self.URL
|
||||
@@ -43,10 +44,10 @@ class Test_create_archive_task():
|
||||
create_archive_task(self.archive.model_dump_json())
|
||||
|
||||
@patch("app.worker.main.insert_result_into_db", side_effect=Exception)
|
||||
@patch("app.worker.main.load_orchestrator")
|
||||
def test_raise_db_error(self, m_load, m_insert):
|
||||
@patch("app.worker.main.get_orchestrator_args")
|
||||
def test_raise_db_error(self, m_args, m_insert):
|
||||
from app.worker.main import create_archive_task
|
||||
mock_orchestrator = self.mock_orchestrator_choice(m_load)
|
||||
mock_orchestrator = self.mock_orchestrator_choice(m_args)
|
||||
|
||||
with pytest.raises(Exception):
|
||||
create_archive_task(self.archive.model_dump_json())
|
||||
@@ -54,10 +55,10 @@ class Test_create_archive_task():
|
||||
|
||||
|
||||
@patch("app.worker.main.insert_result_into_db", return_value=None)
|
||||
@patch("app.worker.main.load_orchestrator")
|
||||
def test_raise_empty_result(self, m_load, m_insert):
|
||||
@patch("app.worker.main.get_orchestrator_args")
|
||||
def test_raise_empty_result(self, m_args, m_insert):
|
||||
from app.worker.main import create_archive_task
|
||||
mock_orchestrator = self.mock_orchestrator_choice(m_load)
|
||||
mock_orchestrator = self.mock_orchestrator_choice(m_args)
|
||||
|
||||
with pytest.raises(Exception) as e:
|
||||
create_archive_task(self.archive.model_dump_json())
|
||||
@@ -76,8 +77,8 @@ class Test_create_sheet_task():
|
||||
|
||||
@patch("app.worker.main.models.generate_uuid", return_value="constant-uuid")
|
||||
@patch("app.worker.main.get_store_until", return_value=datetime.now())
|
||||
@patch("app.worker.main.load_orchestrator")
|
||||
def test_success(self, m_load, m_store, m_uuid, db_session):
|
||||
@patch("app.worker.main.get_orchestrator_args")
|
||||
def test_success(self, m_args, m_store, m_uuid, db_session):
|
||||
from app.worker.main import create_sheet_task
|
||||
|
||||
assert db_session.query(models.Archive).filter(models.Archive.url == self.URL).count() == 0
|
||||
@@ -86,11 +87,11 @@ class Test_create_sheet_task():
|
||||
mock_metadata.add_media(Media("fn1.txt", urls=["outcome1.com"]))
|
||||
m_orch = MagicMock()
|
||||
m_orch.feed.return_value = iter([False, mock_metadata, mock_metadata])
|
||||
m_load.return_value = m_orch
|
||||
m_args.return_value = m_orch
|
||||
|
||||
res = create_sheet_task(self.sheet.model_dump_json())
|
||||
|
||||
m_load.assert_called_once_with("interstellar", True, {'configurations': {'gsheet_feeder': {'sheet_id': '123'}}})
|
||||
m_args.assert_called_once_with("interstellar", True, {'configurations': {'gsheet_feeder': {'sheet_id': '123'}}})
|
||||
m_orch.feed.assert_called_once()
|
||||
m_store.assert_called_with("interstellar")
|
||||
m_store.call_count == 2
|
||||
@@ -116,7 +117,6 @@ class Test_create_sheet_task():
|
||||
|
||||
def test_get_all_urls(db_session):
|
||||
from app.worker.main import get_all_urls
|
||||
from auto_archiver import Metadata
|
||||
|
||||
meta = Metadata().set_url("https://example.com")
|
||||
m1 = meta.add_media(Media("fn1.txt", urls=["outcome1.com"]))
|
||||
|
||||
@@ -3,7 +3,7 @@ from fastapi import APIRouter, Depends, HTTPException
|
||||
from fastapi.responses import JSONResponse
|
||||
from loguru import logger
|
||||
import sqlalchemy
|
||||
from auto_archiver import Metadata
|
||||
from auto_archiver.core import Metadata
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.shared.aa_utils import get_all_urls
|
||||
|
||||
@@ -4,8 +4,8 @@ import traceback, datetime
|
||||
from celery.signals import task_failure
|
||||
from loguru import logger
|
||||
from sqlalchemy import exc
|
||||
|
||||
from auto_archiver import Config, ArchivingOrchestrator, Metadata
|
||||
import auto_archiver
|
||||
from auto_archiver.core.orchestrator import ArchivingOrchestrator
|
||||
|
||||
from app.shared.db import models
|
||||
from app.shared.db.database import get_db
|
||||
@@ -16,7 +16,6 @@ from app.shared.log import log_error
|
||||
from app.shared.aa_utils import get_all_urls
|
||||
from app.shared.db import worker_crud
|
||||
|
||||
|
||||
settings = get_settings()
|
||||
|
||||
celery = get_celery("worker")
|
||||
@@ -24,19 +23,30 @@ Redis = get_redis()
|
||||
|
||||
USER_GROUPS_FILENAME = settings.USER_GROUPS_FILENAME
|
||||
|
||||
# PATCHES for new aa's functionality
|
||||
# logger.add("app/worker/worker_log.log", level="DEBUG")
|
||||
logger.remove = lambda x: print(f"logger.remove({x})")
|
||||
|
||||
# TODO: after release, as it requires updating past entries with sheet_id where tag is used, drop tags
|
||||
@celery.task(name="create_archive_task", bind=True, autoretry_for=(Exception,), retry_backoff=True, retry_kwargs={'max_retries': 0})
|
||||
def create_archive_task(self, archive_json: str):
|
||||
archive = schemas.ArchiveCreate.model_validate_json(archive_json)
|
||||
|
||||
# call auto-archiver
|
||||
orchestrator = load_orchestrator(archive.group_id)
|
||||
result = orchestrator.feed_item(Metadata().set_url(archive.url))
|
||||
args = get_orchestrator_args(archive.group_id, False, [archive.url])
|
||||
# args = get_orchestrator_args(archive.group_id, False, [archive.url, "--extractors", "generic_extractor"])
|
||||
logger.error(args)
|
||||
try:
|
||||
result = next(ArchivingOrchestrator().run(args), None)
|
||||
except SystemExit as e:
|
||||
log_error(e, f"create_archive_task: SystemExit from AA")
|
||||
except Exception as e:
|
||||
log_error(e, f"create_archive_task")
|
||||
raise e
|
||||
assert result, f"UNABLE TO archive: {archive.url}"
|
||||
|
||||
# prepare and insert in DB
|
||||
store_until = get_store_until(archive.group_id)
|
||||
archive.store_until = store_until
|
||||
archive.store_until = get_store_until(archive.group_id)
|
||||
archive.id = self.request.id
|
||||
archive.urls = get_all_urls(result)
|
||||
archive.result = json.loads(result.to_json())
|
||||
@@ -51,32 +61,36 @@ def create_sheet_task(self, sheet_json: str):
|
||||
queue_name = (create_sheet_task.request.delivery_info or {}).get('routing_key', 'unknown')
|
||||
logger.info(f"[queue={queue_name}] SHEET START {sheet=}")
|
||||
|
||||
orchestrator = load_orchestrator(sheet.group_id, True, {"configurations": {"gsheet_feeder": {"sheet_id": sheet.sheet_id}}})
|
||||
args = get_orchestrator_args(sheet.group_id, True, ["--gsheet_feeder.sheet_id", sheet.sheet_id])
|
||||
|
||||
stats = {"archived": 0, "failed": 0, "errors": []}
|
||||
for result in orchestrator.feed():
|
||||
try:
|
||||
assert result, f"UNABLE TO archive: {result.get_url()}"
|
||||
archive = schemas.ArchiveCreate(
|
||||
author_id=sheet.author_id,
|
||||
url=result.get_url(),
|
||||
group_id=sheet.group_id,
|
||||
tags=sheet.tags,
|
||||
id=models.generate_uuid(),
|
||||
result=json.loads(result.to_json()),
|
||||
sheet_id=sheet.sheet_id,
|
||||
urls=get_all_urls(result),
|
||||
store_until = get_store_until(sheet.group_id)
|
||||
)
|
||||
insert_result_into_db(archive)
|
||||
stats["archived"] += 1
|
||||
except exc.IntegrityError as e:
|
||||
logger.warning(f"cached result detected: {e}")
|
||||
except Exception as e:
|
||||
log_error(e, extra=f"{self.name}: {sheet_json}")
|
||||
redis_publish_exception(e, self.name, traceback.format_exc())
|
||||
stats["failed"] += 1
|
||||
stats["errors"].append(str(e))
|
||||
try:
|
||||
for result in ArchivingOrchestrator().run(args):
|
||||
try:
|
||||
assert result, f"ERROR archiving URL for sheet {sheet.sheet_id}"
|
||||
archive = schemas.ArchiveCreate(
|
||||
author_id=sheet.author_id,
|
||||
url=result.get_url(),
|
||||
group_id=sheet.group_id,
|
||||
tags=sheet.tags,
|
||||
id=models.generate_uuid(),
|
||||
result=json.loads(result.to_json()),
|
||||
sheet_id=sheet.sheet_id,
|
||||
urls=get_all_urls(result),
|
||||
store_until=get_store_until(sheet.group_id)
|
||||
)
|
||||
insert_result_into_db(archive)
|
||||
stats["archived"] += 1
|
||||
except exc.IntegrityError as e:
|
||||
logger.warning(f"cached result detected: {e}")
|
||||
except Exception as e:
|
||||
log_error(e, extra=f"{self.name}: {sheet_json}")
|
||||
redis_publish_exception(e, self.name, traceback.format_exc())
|
||||
stats["failed"] += 1
|
||||
stats["errors"].append(str(e))
|
||||
|
||||
except SystemExit as e:
|
||||
log_error(e, f"create_sheet_task: SystemExit from AA")
|
||||
|
||||
if stats["archived"] > 0:
|
||||
with get_db() as session:
|
||||
@@ -87,7 +101,8 @@ def create_sheet_task(self, sheet_json: str):
|
||||
return schemas.CelerySheetTask(success=True, sheet_id=sheet.sheet_id, time=datetime.datetime.now().isoformat(), stats=stats).model_dump()
|
||||
|
||||
|
||||
def load_orchestrator(group_id: str, orchestrator_for_sheet: bool = False, overwrite_configs: dict = {}) -> ArchivingOrchestrator:
|
||||
def get_orchestrator_args(group_id: str, orchestrator_for_sheet: bool, cli_args: list = []) -> list:
|
||||
aa_configs = []
|
||||
with get_db() as session:
|
||||
group = worker_crud.get_group(session, group_id)
|
||||
if orchestrator_for_sheet:
|
||||
@@ -95,11 +110,9 @@ def load_orchestrator(group_id: str, orchestrator_for_sheet: bool = False, overw
|
||||
else:
|
||||
orchestrator_fn = worker_crud.get_group(session, group_id).orchestrator
|
||||
assert orchestrator_fn, f"no orchestrator found for {group_id}"
|
||||
|
||||
|
||||
config = Config()
|
||||
config.parse(use_cli=False, yaml_config_filename=orchestrator_fn, overwrite_configs=overwrite_configs)
|
||||
return ArchivingOrchestrator(config)
|
||||
aa_configs.extend(["--config", orchestrator_fn])
|
||||
aa_configs.extend(cli_args)
|
||||
return aa_configs
|
||||
|
||||
|
||||
def insert_result_into_db(archive: schemas.ArchiveCreate) -> str:
|
||||
@@ -108,10 +121,12 @@ def insert_result_into_db(archive: schemas.ArchiveCreate) -> str:
|
||||
logger.debug(f"[ARCHIVE STORED] {db_task.author_id} {db_task.url}")
|
||||
return db_task.id
|
||||
|
||||
|
||||
def get_store_until(group_id: str) -> datetime.datetime:
    """Return the store-until value computed for ``group_id``.

    Opens a database session through ``get_db()`` and hands it, together
    with ``group_id``, to ``business_logic.get_store_archive_until``; that
    call's result is returned unchanged.
    """
    with get_db() as db_session:
        # Return from inside the ``with`` so the session is still open while
        # the lookup runs and is released as the function exits.
        until = business_logic.get_store_archive_until(db_session, group_id)
        return until
|
||||
|
||||
|
||||
def redis_publish_exception(exception, task_name, traceback: str = ""):
|
||||
REDIS_EXCEPTIONS_CHANNEL = settings.REDIS_EXCEPTIONS_CHANNEL
|
||||
try:
|
||||
|
||||
1384
poetry.lock
generated
1384
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -17,11 +17,11 @@ classifiers = [
|
||||
"Programming Language :: Python :: 3"
|
||||
]
|
||||
|
||||
requires-python = ">=3.10,<4.0"
|
||||
requires-python = ">=3.10,<3.13"
|
||||
|
||||
|
||||
dependencies = [
|
||||
"auto-archiver (>=0.12.0,<0.13.0)",
|
||||
"auto-archiver (>=0.13.1)",
|
||||
"oscrypto @ git+https://github.com/wbond/oscrypto.git@d5f3437ed24257895ae1edd9e503cfb352e635a8",
|
||||
"celery (>=5.0)",
|
||||
"redis (==3.5.3)",
|
||||
@@ -29,10 +29,11 @@ dependencies = [
|
||||
"pydantic-settings (>=2.7.1,<3.0.0)",
|
||||
"sqlalchemy (>=2.0.38,<3.0.0)",
|
||||
"requests (>=2.25.1)",
|
||||
"pyopenssl (==23.3.0)",
|
||||
"pyopenssl (>=23.3.0)",
|
||||
]
|
||||
[tool.poetry.group.worker.dependencies]
|
||||
watchdog = ">=6.0.0,<7.0.0"
|
||||
setuptools = "^75.8.0"
|
||||
|
||||
[tool.poetry.group.web.dependencies]
|
||||
fastapi = ">=0.115.8,<0.116.0"
|
||||
@@ -43,6 +44,7 @@ fastapi-utils = ">=0.8.0,<0.9.0"
|
||||
prometheus-fastapi-instrumentator = ">=7.0.2,<8.0.0"
|
||||
fastapi-mail = ">=1.4.2,<2.0.0"
|
||||
uvicorn = ">=0.13.4"
|
||||
pyyaml = "^6.0.2"
|
||||
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
|
||||
Reference in New Issue
Block a user