mirror of
https://github.com/bellingcat/auto-archiver-api.git
synced 2026-06-12 13:38:33 +03:00
major refactor of structure for worker V web: docker/app/secrets/envs/...
This commit is contained in:
3
app/web/__init__.py
Normal file
3
app/web/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from app.web.main import app_factory
|
||||
|
||||
app = app_factory
|
||||
0
app/web/endpoints/__init__.py
Normal file
0
app/web/endpoints/__init__.py
Normal file
59
app/web/endpoints/default.py
Normal file
59
app/web/endpoints/default.py
Normal file
@@ -0,0 +1,59 @@
|
||||
|
||||
from typing import Dict
|
||||
from fastapi import APIRouter, Depends, Request, HTTPException
|
||||
from fastapi.responses import FileResponse, JSONResponse
|
||||
|
||||
from app.shared.config import VERSION, BREAKING_CHANGES
|
||||
from app.shared.log import log_error
|
||||
from app.shared.db import crud
|
||||
from app.shared.schemas import ActiveUser, UsageResponse
|
||||
from app.shared.db.user_state import UserState
|
||||
from app.web.security import get_user_auth, bearer_security, get_user_state
|
||||
from app.shared.user_groups import GroupInfo
|
||||
|
||||
default_router = APIRouter()
|
||||
|
||||
|
||||
@default_router.get("/")
|
||||
async def home(request: Request):
|
||||
# TODO: maybe split into 2 routes: one non authenticated and one authenticated for the groups info only, necessary only for the extension
|
||||
status = {"version": VERSION, "breakingChanges": BREAKING_CHANGES}
|
||||
try:
|
||||
email = await get_user_auth(await bearer_security(request))
|
||||
status["groups"] = crud.get_user_groups(email)
|
||||
except HTTPException: pass # not authenticated is fine
|
||||
except Exception as e: log_error(e)
|
||||
return JSONResponse(status)
|
||||
|
||||
|
||||
@default_router.get("/health")
|
||||
async def health():
|
||||
return JSONResponse({"status": "ok"})
|
||||
|
||||
|
||||
@default_router.get("/user/active", summary="Check if the user is active and can use the tool.")
|
||||
async def active(
|
||||
user: UserState = Depends(get_user_state),
|
||||
) -> ActiveUser:
|
||||
return {"active": user.active}
|
||||
|
||||
|
||||
@default_router.get("/user/permissions", summary="Get the user's global 'all' permissions and the permissions for each group they belong to.")
|
||||
def get_user_permissions(
|
||||
user: UserState = Depends(get_user_state),
|
||||
) -> Dict[str, GroupInfo]:
|
||||
return user.permissions
|
||||
|
||||
@default_router.get("/user/usage", summary="Get the user's monthly URLs/MBs usage along with the total active sheets, breakdown by group.")
|
||||
def get_user_usage(
|
||||
user: UserState = Depends(get_user_state),
|
||||
) -> UsageResponse:
|
||||
if not user.active:
|
||||
raise HTTPException(status_code=403, detail="User is not active.")
|
||||
return user.usage()
|
||||
|
||||
|
||||
|
||||
@default_router.get('/favicon.ico', include_in_schema=False)
|
||||
async def favicon() -> FileResponse:
|
||||
return FileResponse("web/static/favicon.ico")
|
||||
51
app/web/endpoints/interoperability.py
Normal file
51
app/web/endpoints/interoperability.py
Normal file
@@ -0,0 +1,51 @@
|
||||
import json
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from fastapi.responses import JSONResponse
|
||||
from loguru import logger
|
||||
import sqlalchemy
|
||||
from auto_archiver import Metadata
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.shared.aa_utils import get_all_urls
|
||||
from app.shared.config import ALLOW_ANY_EMAIL
|
||||
from app.shared import business_logic, schemas
|
||||
from app.shared.db import crud
|
||||
from app.shared.db.database import get_db_dependency
|
||||
from app.web.security import token_api_key_auth
|
||||
from app.shared.db import models
|
||||
from app.shared.log import log_error
|
||||
|
||||
|
||||
interoperability_router = APIRouter(prefix="/interop", tags=["Interoperability endpoints."])
|
||||
|
||||
|
||||
# ----- endpoint to submit data archived elsewhere
|
||||
@interoperability_router.post("/submit-archive", status_code=201, summary="Submit a manual archive entry, for data that was archived elsewhere.")
|
||||
def submit_manual_archive(
|
||||
manual: schemas.SubmitManualArchive,
|
||||
auth=Depends(token_api_key_auth),
|
||||
db: Session = Depends(get_db_dependency)
|
||||
):
|
||||
result: Metadata = Metadata.from_json(manual.result)
|
||||
manual.author_id = manual.author_id or ALLOW_ANY_EMAIL
|
||||
manual.tags.add("manual")
|
||||
|
||||
try:
|
||||
archive = schemas.ArchiveCreate(
|
||||
author_id=manual.author_id,
|
||||
url=result.get_url(),
|
||||
public=manual.public,
|
||||
group_id=manual.group_id,
|
||||
tags=manual.tags,
|
||||
id=models.generate_uuid(),
|
||||
result=json.loads(result.to_json()),
|
||||
urls=get_all_urls(result),
|
||||
store_until=business_logic.get_store_archive_until(db, manual.group_id),
|
||||
)
|
||||
|
||||
db_archive = crud.store_archived_url(db, archive)
|
||||
logger.debug(f"[MANUAL ARCHIVE STORED] {db_archive.author_id} {db_archive.url}")
|
||||
return JSONResponse({"id": db_archive.id}, status_code=201)
|
||||
except sqlalchemy.exc.IntegrityError as e:
|
||||
log_error(e)
|
||||
raise HTTPException(status_code=422, detail=f"Cannot insert into DB due to integrity error, likely duplicate urls.")
|
||||
80
app/web/endpoints/sheet.py
Normal file
80
app/web/endpoints/sheet.py
Normal file
@@ -0,0 +1,80 @@
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
from sqlalchemy import exc
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.shared.db.user_state import UserState
|
||||
from app.shared import schemas
|
||||
from app.shared.task_messaging import get_celery
|
||||
from app.web.security import get_user_state
|
||||
from app.shared.db import crud
|
||||
from app.shared.db.database import get_db_dependency
|
||||
|
||||
sheet_router = APIRouter(prefix="/sheet", tags=["Google Spreadsheet operations"])
|
||||
|
||||
celery = get_celery()
|
||||
|
||||
@sheet_router.post("/create", status_code=201, summary="Store a new Google Sheet for regular archiving.")
|
||||
def create_sheet(
|
||||
sheet: schemas.SheetAdd,
|
||||
user: UserState = Depends(get_user_state),
|
||||
db: Session = Depends(get_db_dependency),
|
||||
) -> schemas.SheetResponse:
|
||||
|
||||
if not user.in_group(sheet.group_id):
|
||||
raise HTTPException(status_code=403, detail="User does not have access to this group.")
|
||||
|
||||
if not user.has_quota_monthly_sheets(sheet.group_id):
|
||||
raise HTTPException(status_code=429, detail="User has reached their sheet quota for this group.")
|
||||
|
||||
if not user.is_sheet_frequency_allowed(sheet.group_id, sheet.frequency):
|
||||
raise HTTPException(status_code=422, detail="Invalid frequency selected for this group.")
|
||||
|
||||
try:
|
||||
return crud.create_sheet(db, sheet.id, sheet.name, user.email, sheet.group_id, sheet.frequency)
|
||||
except exc.IntegrityError as e:
|
||||
raise HTTPException(status_code=400, detail="Sheet with this ID is already being archived.") from e
|
||||
|
||||
|
||||
@sheet_router.get("/mine", status_code=200, summary="Get the authenticated user's Google Sheets.")
|
||||
def get_user_sheets(
|
||||
user: UserState = Depends(get_user_state),
|
||||
db: Session = Depends(get_db_dependency)
|
||||
) -> list[schemas.SheetResponse]:
|
||||
return crud.get_user_sheets(db, user.email)
|
||||
|
||||
|
||||
@sheet_router.delete("/{id}", summary="Delete a Google Sheet by ID.")
|
||||
def delete_sheet(
|
||||
id: str,
|
||||
user: UserState = Depends(get_user_state),
|
||||
db: Session = Depends(get_db_dependency),
|
||||
) -> schemas.TaskDelete:
|
||||
return JSONResponse({
|
||||
"id": id,
|
||||
"deleted": crud.delete_sheet(db, id, user.email)
|
||||
})
|
||||
|
||||
|
||||
@sheet_router.post("/{id}/archive", status_code=201, summary="Trigger an archiving task for a GSheet you own.", response_description="task_id for the archiving task.")
|
||||
def archive_user_sheet(
|
||||
id: str,
|
||||
user: UserState = Depends(get_user_state),
|
||||
db: Session = Depends(get_db_dependency),
|
||||
) -> schemas.Task:
|
||||
|
||||
sheet = crud.get_user_sheet(db, user.email, sheet_id=id)
|
||||
if not sheet:
|
||||
raise HTTPException(status_code=403, detail="No access to this sheet.")
|
||||
|
||||
if not user.in_group(sheet.group_id):
|
||||
raise HTTPException(status_code=403, detail="User does not have access to this group.")
|
||||
|
||||
if not user.can_manually_trigger(sheet.group_id):
|
||||
raise HTTPException(status_code=429, detail="User cannot manually trigger sheet archiving in this group.")
|
||||
|
||||
task = celery.signature("create_sheet_task", args=[schemas.SubmitSheet(sheet_id=id, author_id=user.email, group_id=sheet.group_id).model_dump_json()]).delay()
|
||||
|
||||
return JSONResponse({"id": task.id}, status_code=201)
|
||||
40
app/web/endpoints/task.py
Normal file
40
app/web/endpoints/task.py
Normal file
@@ -0,0 +1,40 @@
|
||||
from celery.result import AsyncResult
|
||||
from fastapi import APIRouter, Depends
|
||||
from fastapi.encoders import jsonable_encoder
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
from app.shared.task_messaging import get_celery
|
||||
from app.web.security import get_token_or_user_auth
|
||||
from app.shared import schemas
|
||||
from app.shared.log import log_error
|
||||
from app.web.utils.misc import custom_jsonable_encoder
|
||||
|
||||
|
||||
task_router = APIRouter(prefix="/task", tags=["Async task operations"])
|
||||
|
||||
celery = get_celery()
|
||||
|
||||
@task_router.get("/{task_id}", summary="Check the status of an async task by its id, works for URLs and Sheet tasks.")
|
||||
def get_status(task_id, email=Depends(get_token_or_user_auth)) -> schemas.TaskResult:
|
||||
task = AsyncResult(task_id, app=celery)
|
||||
try:
|
||||
if task.status == "FAILURE":
|
||||
# *FAILURE* The task raised an exception, or has exceeded the retry limit.
|
||||
# The :attr:`result` attribute then contains the exception raised by the task.
|
||||
# https://docs.celeryq.dev/en/stable/_modules/celery/result.html#AsyncResult
|
||||
raise task.result
|
||||
|
||||
response = {
|
||||
"id": task_id,
|
||||
"status": task.status,
|
||||
"result": task.result
|
||||
}
|
||||
return JSONResponse(jsonable_encoder(response, exclude_unset=True, custom_encoder={bytes: custom_jsonable_encoder}))
|
||||
|
||||
except Exception as e:
|
||||
log_error(e)
|
||||
return JSONResponse({
|
||||
"id": task_id,
|
||||
"status": "FAILURE",
|
||||
"result": {"error": str(e)}
|
||||
})
|
||||
77
app/web/endpoints/url.py
Normal file
77
app/web/endpoints/url.py
Normal file
@@ -0,0 +1,77 @@
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from fastapi.responses import JSONResponse
|
||||
from datetime import datetime
|
||||
from loguru import logger
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.shared.config import ALLOW_ANY_EMAIL
|
||||
from app.shared import schemas
|
||||
from app.shared.task_messaging import get_celery
|
||||
from app.web.security import get_token_or_user_auth, get_user_state
|
||||
from app.shared.db import crud
|
||||
from app.shared.db.user_state import UserState
|
||||
from app.shared.db.database import get_db_dependency
|
||||
|
||||
from urllib.parse import urlparse
|
||||
|
||||
url_router = APIRouter(prefix="/url", tags=["Single URL operations"])
|
||||
|
||||
celery = get_celery()
|
||||
|
||||
@url_router.post("/archive", status_code=201, summary="Submit a single URL archive request, starts an archiving task.", response_description="task_id for the archiving task, will match the archive id.")
|
||||
def archive_url(
|
||||
archive: schemas.ArchiveTrigger,
|
||||
email=Depends(get_token_or_user_auth),
|
||||
db: Session = Depends(get_db_dependency)
|
||||
) -> schemas.Task:
|
||||
archive.author_id = email
|
||||
logger.info(f"new {archive.public=} task for {email=} and {archive.group_id=}: {archive.url}")
|
||||
|
||||
parsed_url = urlparse(archive.url)
|
||||
if not all([parsed_url.scheme, parsed_url.netloc]):
|
||||
raise HTTPException(status_code=400, detail="Invalid URL received.")
|
||||
|
||||
if email != ALLOW_ANY_EMAIL:
|
||||
user = UserState(db, email)
|
||||
if archive.group_id and not user.in_group(archive.group_id):
|
||||
raise HTTPException(status_code=403, detail="User does not have access to this group.")
|
||||
if not user.has_quota_max_monthly_urls(archive.group_id):
|
||||
raise HTTPException(status_code=429, detail="User has reached their monthly URL quota.")
|
||||
if not user.has_quota_max_monthly_mbs(archive.group_id):
|
||||
raise HTTPException(status_code=429, detail="User has reached their monthly MB quota.")
|
||||
|
||||
archive_create = schemas.ArchiveCreate(**archive.model_dump())
|
||||
|
||||
task = celery.signature("create_archive_task", args=[archive_create.model_dump_json()]).delay()
|
||||
task_response = schemas.Task(id=task.id)
|
||||
return JSONResponse(task_response.model_dump(), status_code=201)
|
||||
|
||||
|
||||
@url_router.get("/search", summary="Search for archive entries by URL.")
|
||||
def search_by_url(
|
||||
url: str, skip: int = 0, limit: int = 25,
|
||||
archived_after: datetime = None, archived_before: datetime = None,
|
||||
db: Session = Depends(get_db_dependency),
|
||||
email: str = Depends(get_token_or_user_auth)
|
||||
) -> list[schemas.ArchiveResult]:
|
||||
|
||||
if email != ALLOW_ANY_EMAIL:
|
||||
user = UserState(db, email)
|
||||
if not user.read and not user.read_public:
|
||||
raise HTTPException(status_code=403, detail="User does not have read access.")
|
||||
|
||||
return crud.search_archives_by_url(db, url.strip(), email, skip=skip, limit=limit, archived_after=archived_after, archived_before=archived_before)
|
||||
|
||||
|
||||
@url_router.delete("/{id}", summary="Delete a single URL archive by id.")
|
||||
def delete_task(
|
||||
id:str,
|
||||
user: UserState = Depends(get_user_state),
|
||||
db: Session = Depends(get_db_dependency)
|
||||
) -> schemas.TaskDelete:
|
||||
logger.info(f"deleting url archive task {id} request by {user.email}")
|
||||
return JSONResponse({
|
||||
"id": id,
|
||||
"deleted": crud.soft_delete_task(db, id, user.email)
|
||||
})
|
||||
179
app/web/events.py
Normal file
179
app/web/events.py
Normal file
@@ -0,0 +1,179 @@
|
||||
import asyncio
|
||||
from collections import defaultdict
|
||||
import datetime
|
||||
import logging
|
||||
import alembic.config
|
||||
from fastapi import FastAPI
|
||||
from contextlib import asynccontextmanager
|
||||
from fastapi_utils.tasks import repeat_every
|
||||
from loguru import logger
|
||||
from fastapi_mail import FastMail, MessageSchema, MessageType
|
||||
|
||||
from app.shared.db import crud, models
|
||||
from app.shared.db.database import get_db, get_db_async, make_engine, wal_checkpoint
|
||||
from app.shared import schemas
|
||||
from app.shared.settings import get_settings
|
||||
from app.shared.task_messaging import get_celery
|
||||
from app.web.utils.metrics import measure_regular_metrics, redis_subscribe_worker_exceptions
|
||||
|
||||
celery = get_celery()
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
# see https://fastapi.tiangolo.com/advanced/events/#lifespan
|
||||
|
||||
# STARTUP
|
||||
logger.debug("HERE 00")
|
||||
engine = make_engine(get_settings().DATABASE_PATH)
|
||||
models.Base.metadata.create_all(bind=engine)
|
||||
logger.debug("HERE 01")
|
||||
alembic.config.main(prog="alembic", argv=['--raiseerr', 'upgrade', 'head'])
|
||||
logger.debug("HERE 02")
|
||||
logging.getLogger("uvicorn.access").disabled = True # loguru
|
||||
asyncio.create_task(redis_subscribe_worker_exceptions(get_settings().REDIS_EXCEPTIONS_CHANNEL))
|
||||
asyncio.create_task(repeat_measure_regular_metrics())
|
||||
with get_db() as db:
|
||||
crud.upsert_user_groups(db)
|
||||
|
||||
# setup archive cronjobs
|
||||
if get_settings().CRON_ARCHIVE_SHEETS:
|
||||
asyncio.create_task(archive_hourly_sheets_cronjob())
|
||||
asyncio.create_task(archive_daily_sheets_cronjob())
|
||||
else:
|
||||
logger.warning("[CRON] Sheet archive cronjobs are disabled.")
|
||||
|
||||
if get_settings().CRON_DELETE_STALE_SHEETS:
|
||||
asyncio.create_task(delete_stale_sheets())
|
||||
else:
|
||||
logger.warning("[CRON] Delete stale sheets cronjob is disabled.")
|
||||
|
||||
if get_settings().CRON_DELETE_SCHEDULED_ARCHIVES:
|
||||
asyncio.create_task(notify_about_expired_archives())
|
||||
else:
|
||||
logger.warning("[CRON] Delete scheduled archives cronjob is disabled.")
|
||||
|
||||
wal_checkpoint()
|
||||
|
||||
yield # separates startup from shutdown instructions
|
||||
|
||||
# SHUTDOWN
|
||||
logger.info("shutting down")
|
||||
|
||||
|
||||
# CRON JOBS
|
||||
@repeat_every(seconds=get_settings().REPEAT_COUNT_METRICS_SECONDS, on_exception=logger.error)
|
||||
async def repeat_measure_regular_metrics():
|
||||
await measure_regular_metrics(get_settings().DATABASE_PATH, get_settings().REPEAT_COUNT_METRICS_SECONDS)
|
||||
|
||||
|
||||
@repeat_every(seconds=60, wait_first=120, on_exception=logger.error)
|
||||
async def archive_hourly_sheets_cronjob():
|
||||
await archive_sheets_cronjob("hourly", 60, datetime.datetime.now().minute)
|
||||
|
||||
|
||||
@repeat_every(seconds=3600, wait_first=120, on_exception=logger.error)
|
||||
async def archive_daily_sheets_cronjob():
|
||||
await archive_sheets_cronjob("daily", 24, datetime.datetime.now().hour)
|
||||
|
||||
|
||||
async def archive_sheets_cronjob(frequency: str, interval: int, current_time_unit: int):
|
||||
triggered_jobs = []
|
||||
|
||||
async with get_db_async() as db:
|
||||
sheets = await crud.get_sheets_by_id_hash(db, frequency, interval, current_time_unit)
|
||||
for s in sheets:
|
||||
task = celery.signature("create_sheet_task", args=[schemas.SubmitSheet(sheet_id=s.id, author_id=s.author_id, group_id=s.group_id).model_dump_json()]).apply_async()
|
||||
|
||||
triggered_jobs.append({"sheet_id": s.id, "task_id": task.id})
|
||||
logger.info(f"[CRON {frequency.upper()}:{current_time_unit}] Triggered {len(triggered_jobs)} sheet tasks: {triggered_jobs}")
|
||||
|
||||
|
||||
# TODO: on exception should logerror but also prometheus counter
|
||||
DELETE_WINDOW = get_settings().DELETE_SCHEDULED_ARCHIVES_NOTIFY_DAYS * 24 * 60 * 60
|
||||
|
||||
|
||||
@repeat_every(seconds=DELETE_WINDOW, wait_first=180, on_exception=logger.error)
|
||||
async def notify_about_expired_archives():
|
||||
notify_from = datetime.datetime.now() + datetime.timedelta(days=get_settings().DELETE_SCHEDULED_ARCHIVES_NOTIFY_DAYS)
|
||||
async with get_db_async() as db:
|
||||
scheduled_deletions = await crud.find_by_store_until(db, notify_from)
|
||||
|
||||
user_archives = defaultdict(list)
|
||||
for archive in scheduled_deletions:
|
||||
user_archives[archive.author_id].append(archive)
|
||||
|
||||
if user_archives:
|
||||
fastmail = FastMail(get_settings().MAIL_CONFIG)
|
||||
# notify users
|
||||
for email in user_archives:
|
||||
list_of_archives = "\n".join([f'{a.url},{a.id}<br/>' for a in user_archives[email]])
|
||||
# TODO: how can users download them in bulk?
|
||||
message = MessageSchema(
|
||||
subject="Auto Archiver: Archives Scheduled for Deletion",
|
||||
recipients=[email],
|
||||
body=f"""
|
||||
<html>
|
||||
<body>
|
||||
<p>Hi {email},</p>
|
||||
<p>Some of your archives will be deleted in the next {get_settings().DELETE_SCHEDULED_ARCHIVES_NOTIFY_DAYS} days, as they are reaching their expiration date according to our retention policy for their groups.</p>
|
||||
<p>If you want to preserve any, make sure to download them now.</p>
|
||||
<p>Here is a CSV list of URLs:</p>
|
||||
<code>
|
||||
url,archive_id<br/>
|
||||
{list_of_archives}
|
||||
</code>
|
||||
<p>Best,<br>The Auto Archiver team</p>
|
||||
</body>
|
||||
</html>
|
||||
""",
|
||||
subtype=MessageType.html
|
||||
)
|
||||
await fastmail.send_message(message)
|
||||
logger.info(f"[CRON] Email sent to {email} about {len(user_archives[email])} scheduled archives deletion.")
|
||||
|
||||
# now schedule the deletion event
|
||||
asyncio.create_task(delete_expired_archives())
|
||||
|
||||
|
||||
@repeat_every(max_repetitions=1, wait_first=DELETE_WINDOW - (60 * 60), seconds=0, on_exception=logger.error)
|
||||
async def delete_expired_archives():
|
||||
async with get_db_async() as db:
|
||||
count_deleted = await crud.soft_delete_expired_archives(db)
|
||||
if count_deleted:
|
||||
logger.info(f"[CRON] Deleted {count_deleted} archives.")
|
||||
|
||||
|
||||
@repeat_every(seconds=86400, wait_first=150, on_exception=logger.error)
|
||||
async def delete_stale_sheets():
|
||||
STALE_DAYS = get_settings().DELETE_STALE_SHEETS_DAYS
|
||||
logger.info(f"[CRON] Deleting stale sheets older than {STALE_DAYS} days.")
|
||||
async with get_db_async() as db:
|
||||
user_sheets = await crud.delete_stale_sheets(db, STALE_DAYS)
|
||||
|
||||
if not user_sheets: return
|
||||
|
||||
fastmail = FastMail(get_settings().MAIL_CONFIG)
|
||||
# notify users
|
||||
for email in user_sheets:
|
||||
list_of_sheets = "\n".join([f'<li><a href="https://docs.google.com/spreadsheets/d/{s.id}">{s.name}</a></li>' for s in user_sheets[email]])
|
||||
message = MessageSchema(
|
||||
subject="Auto Archiver: Stale Sheets Removed",
|
||||
recipients=[email],
|
||||
body=f"""
|
||||
<html>
|
||||
<body>
|
||||
<p>Hi {email},</p>
|
||||
<p>Your stale sheets have been removed from our system as no new URL was archived in the past {STALE_DAYS} days:</p>
|
||||
<ul>
|
||||
{list_of_sheets}
|
||||
</ul>
|
||||
<p>You can always re-add them at https://auto-archiver.bellingcat.com/.</p>
|
||||
<p>Best,<br>The Auto Archiver team</p>
|
||||
</body>
|
||||
</html>
|
||||
""",
|
||||
subtype=MessageType.html
|
||||
)
|
||||
await fastmail.send_message(message)
|
||||
logger.info(f"[CRON] Email sent to {email} about stale sheets deletion.")
|
||||
174
app/web/main.py
Normal file
174
app/web/main.py
Normal file
@@ -0,0 +1,174 @@
|
||||
import os
|
||||
from celery.result import AsyncResult
|
||||
from fastapi import FastAPI, Depends, HTTPException
|
||||
from fastapi.encoders import jsonable_encoder
|
||||
from fastapi.responses import JSONResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from prometheus_fastapi_instrumentator import Instrumentator
|
||||
from datetime import datetime
|
||||
from sqlalchemy.orm import Session
|
||||
from loguru import logger
|
||||
|
||||
from app.shared.log import log_error
|
||||
from app.web.middleware import logging_middleware
|
||||
from app.shared import schemas
|
||||
from app.shared.task_messaging import get_celery
|
||||
|
||||
from app.shared.db import crud
|
||||
from app.web.security import get_user_auth, token_api_key_auth, get_token_or_user_auth
|
||||
from app.shared.config import VERSION, API_DESCRIPTION
|
||||
from app.shared.db.database import get_db_dependency
|
||||
from app.web.events import lifespan
|
||||
from app.shared.settings import get_settings
|
||||
|
||||
|
||||
from app.web.endpoints.default import default_router
|
||||
from app.web.endpoints.url import url_router
|
||||
from app.web.endpoints.sheet import sheet_router
|
||||
from app.web.endpoints.task import task_router
|
||||
from app.web.endpoints.interoperability import interoperability_router
|
||||
|
||||
celery = get_celery()
|
||||
|
||||
def app_factory(settings = get_settings()):
|
||||
app = FastAPI(
|
||||
title="Auto-Archiver API",
|
||||
description=API_DESCRIPTION,
|
||||
version=VERSION,
|
||||
contact={"name": "GitHub", "url": "https://github.com/bellingcat/auto-archiver-api"},
|
||||
lifespan=lifespan
|
||||
)
|
||||
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=settings.ALLOWED_ORIGINS,
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
app.middleware("http")(logging_middleware)
|
||||
|
||||
app.include_router(default_router)
|
||||
app.include_router(url_router)
|
||||
app.include_router(sheet_router)
|
||||
app.include_router(task_router)
|
||||
app.include_router(interoperability_router)
|
||||
|
||||
# prometheus exposed in /metrics with authentication
|
||||
Instrumentator(should_group_status_codes=False, excluded_handlers=["/metrics", "/health", "/openapi.json", "/favicon.ico"]).instrument(app).expose(app, dependencies=[Depends(token_api_key_auth)])
|
||||
|
||||
# TODO: recheck this for security, currently only needed for when local_storage is used
|
||||
local_dir = settings.SERVE_LOCAL_ARCHIVE
|
||||
if not os.path.isdir(local_dir) and os.path.isdir(local_dir.replace("/app", ".")):
|
||||
local_dir = local_dir.replace("/app", ".")
|
||||
if len(settings.SERVE_LOCAL_ARCHIVE) > 1 and os.path.isdir(local_dir):
|
||||
logger.warning(f"MOUNTing local archive {settings.SERVE_LOCAL_ARCHIVE}")
|
||||
app.mount(settings.SERVE_LOCAL_ARCHIVE, StaticFiles(directory=local_dir), name=settings.SERVE_LOCAL_ARCHIVE)
|
||||
|
||||
|
||||
|
||||
# -----Submit URL and manipulate tasks. Bearer protected below
|
||||
|
||||
|
||||
@app.get("/tasks/search-url", response_model=list[schemas.Archive], deprecated=True) # DEPRECATED
|
||||
def search_by_url(url: str, skip: int = 0, limit: int = 100, archived_after: datetime = None, archived_before: datetime = None, db: Session = Depends(get_db_dependency), email=Depends(get_token_or_user_auth)):
|
||||
return crud.search_archives_by_url(db, url.strip(), email, skip=skip, limit=limit, archived_after=archived_after, archived_before=archived_before)
|
||||
|
||||
|
||||
@app.get("/tasks/sync", response_model=list[schemas.Archive], deprecated=True) # DEPRECATED
|
||||
def search(skip: int = 0, limit: int = 100, db: Session = Depends(get_db_dependency), email=Depends(get_user_auth)):
|
||||
return crud.search_archives_by_email(db, email, skip=skip, limit=limit)
|
||||
|
||||
|
||||
@app.post("/tasks", status_code=201, deprecated=True) # DEPRECATED
|
||||
def archive_tasks(archive: schemas.ArchiveCreate, email=Depends(get_token_or_user_auth)):
|
||||
archive.author_id = email
|
||||
url = archive.url
|
||||
logger.info(f"new {archive.public=} task for {email=} and {archive.group_id=}: {url}")
|
||||
if type(url) != str or len(url) <= 5:
|
||||
raise HTTPException(status_code=422, detail=f"Invalid URL received: {url}")
|
||||
logger.info("creating task")
|
||||
|
||||
task = celery.signature("create_archive_task", args=[archive.model_dump_json()]).delay()
|
||||
return JSONResponse({"id": task.id})
|
||||
|
||||
|
||||
@app.get("/archive/{task_id}", deprecated=True) # DEPRECATED
|
||||
def lookup(task_id, db: Session = Depends(get_db_dependency), email=Depends(get_token_or_user_auth)):
|
||||
return crud.get_archive(db, task_id, email)
|
||||
|
||||
|
||||
@app.get("/tasks/{task_id}", deprecated=True) # DEPRECATED
|
||||
def get_status(task_id, email=Depends(get_token_or_user_auth)):
|
||||
logger.info(f"status check for user {email} task {task_id}")
|
||||
task = AsyncResult(task_id, app=celery)
|
||||
try:
|
||||
if task.status == "FAILURE":
|
||||
# *FAILURE* The task raised an exception, or has exceeded the retry limit.
|
||||
# The :attr:`result` attribute then contains the exception raised by the task.
|
||||
# https://docs.celeryq.dev/en/stable/_modules/celery/result.html#AsyncResult
|
||||
raise task.result
|
||||
|
||||
response = {
|
||||
"id": task_id,
|
||||
"status": task.status,
|
||||
"result": task.result
|
||||
}
|
||||
return JSONResponse(jsonable_encoder(response, exclude_unset=True))
|
||||
|
||||
except Exception as e:
|
||||
log_error(e)
|
||||
return JSONResponse({
|
||||
"id": task_id,
|
||||
"status": "FAILURE",
|
||||
"result": {"error": str(e)}
|
||||
})
|
||||
|
||||
|
||||
@app.delete("/tasks/{task_id}", deprecated=True) # DEPRECATED
|
||||
def delete_task(task_id, db: Session = Depends(get_db_dependency), email=Depends(get_user_auth)):
|
||||
logger.info(f"deleting task {task_id} request by {email}")
|
||||
return JSONResponse({
|
||||
"id": task_id,
|
||||
"deleted": crud.soft_delete_task(db, task_id, email)
|
||||
})
|
||||
|
||||
# ----- Google Sheets Logic
|
||||
|
||||
|
||||
@app.post("/sheet", status_code=201, deprecated=True) # DEPRECATED
|
||||
def archive_sheet(sheet: schemas.SubmitSheet, email=Depends(get_user_auth), db: Session = Depends(get_db_dependency)):
|
||||
logger.info(f"SHEET TASK for {sheet=}")
|
||||
sheet.author_id = email
|
||||
if not crud.is_user_in_group(db, email, sheet.group_id):
|
||||
raise HTTPException(status_code=403, detail="User does not have access to this group.")
|
||||
task = celery.signature("create_sheet_task", args=[sheet.model_dump_json()]).delay()
|
||||
return JSONResponse({"id": task.id})
|
||||
|
||||
|
||||
@app.post("/sheet_service", status_code=201, deprecated=True) # DEPRECATED
|
||||
def archive_sheet_service(sheet: schemas.SubmitSheet, auth=Depends(token_api_key_auth)):
|
||||
logger.info(f"SHEET TASK for {sheet=}")
|
||||
sheet.author_id = sheet.author_id or "api-endpoint"
|
||||
|
||||
task = celery.signature("create_sheet_task", args=[sheet.model_dump_json()]).delay()
|
||||
return JSONResponse({"id": task.id})
|
||||
|
||||
# ----- endpoint to submit data archived elsewhere
|
||||
|
||||
|
||||
@app.post("/submit-archive", status_code=201, deprecated=True) # DEPRECATED
|
||||
def submit_manual_archive(manual: schemas.SubmitManual, auth=Depends(token_api_key_auth)):
|
||||
raise HTTPException(status_code=410, detail="This endpoint is deprecated. Use /interop/submit-archive instead.")
|
||||
# result = Metadata.from_json(manual.result)
|
||||
# logger.info(f"MANUAL SUBMIT {result.get_url()} {manual.author_id}")
|
||||
# manual.tags.add("manual")
|
||||
# try:
|
||||
# # archive_id = insert_result_into_db(result, manual.tags, manual.public, manual.group_id, manual.author_id, models.generate_uuid())
|
||||
# except sqlalchemy.exc.IntegrityError as e:
|
||||
# log_error(e)
|
||||
# raise HTTPException(status_code=422, detail=f"Cannot insert into DB due to integrity error")
|
||||
# return JSONResponse({"id": archive_id})
|
||||
|
||||
return app
|
||||
17
app/web/middleware.py
Normal file
17
app/web/middleware.py
Normal file
@@ -0,0 +1,17 @@
|
||||
|
||||
from loguru import logger
|
||||
from fastapi import Request
|
||||
from app.shared.log import log_error
|
||||
|
||||
|
||||
async def logging_middleware(request: Request, call_next):
|
||||
try:
|
||||
response = await call_next(request)
|
||||
logger.info(f"{request.client.host}:{request.client.port} {request.method} {request.url._url} - HTTP {response.status_code}")
|
||||
return response
|
||||
except Exception as e:
|
||||
from web.utils.metrics import EXCEPTION_COUNTER
|
||||
EXCEPTION_COUNTER.labels(type=e.__class__.__name__).inc()
|
||||
logger.info(f"{request.client.host}:{request.client.port} {request.method} {request.url._url} - {e.__class__.__name__} {e}")
|
||||
log_error(e)
|
||||
raise e
|
||||
83
app/web/security.py
Normal file
83
app/web/security.py
Normal file
@@ -0,0 +1,83 @@
|
||||
from loguru import logger
|
||||
import requests, secrets
|
||||
from fastapi import HTTPException, status, Depends
|
||||
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
|
||||
|
||||
from app.shared.config import ALLOW_ANY_EMAIL
|
||||
from app.shared.settings import get_settings
|
||||
from app.shared.db.database import get_db
|
||||
from app.shared.db.user_state import UserState
|
||||
|
||||
settings = get_settings()
|
||||
bearer_security = HTTPBearer()
|
||||
|
||||
|
||||
def secure_compare(token, api_key):
|
||||
return secrets.compare_digest(token.encode("utf8"), api_key.encode("utf8"))
|
||||
|
||||
|
||||
# Factory method to create an authentication dependency for a specific key
|
||||
def api_key_auth(api_key):
|
||||
assert len(api_key) >= 20, "Invalid API key, must be at least 20 chars"
|
||||
|
||||
async def auth(bearer: HTTPAuthorizationCredentials = Depends(bearer_security), auto_error=True):
|
||||
is_correct = secure_compare(bearer.credentials, api_key)
|
||||
if is_correct: return True
|
||||
|
||||
if auto_error:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_401_UNAUTHORIZED,
|
||||
detail="Wrong auth credentials",
|
||||
)
|
||||
return False
|
||||
|
||||
return auth
|
||||
|
||||
|
||||
# --------------------- Token Auth for AA itself to query the API, AA setup tool and Prometheus
|
||||
token_api_key_auth = api_key_auth(settings.API_BEARER_TOKEN)
|
||||
|
||||
|
||||
async def get_token_or_user_auth(credentials: HTTPAuthorizationCredentials = Depends(bearer_security)):
|
||||
# tries to use the static API_KEY and defaults to google JWT auth
|
||||
if await token_api_key_auth(credentials, auto_error=False): return ALLOW_ANY_EMAIL
|
||||
return await get_user_auth(credentials)
|
||||
|
||||
|
||||
async def get_user_auth(credentials: HTTPAuthorizationCredentials = Depends(bearer_security)):
|
||||
# validates the Bearer token in the case that it requires it
|
||||
valid_user, info = authenticate_user(credentials.credentials)
|
||||
if valid_user:
|
||||
return info.lower()
|
||||
logger.debug(f"TOKEN FAILURE: {valid_user=} {info=}")
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_401_UNAUTHORIZED,
|
||||
detail=info,
|
||||
headers={"WWW-Authenticate": "Bearer"},
|
||||
)
|
||||
|
||||
|
||||
def authenticate_user(access_token):
|
||||
# https://cloud.google.com/docs/authentication/token-types#access
|
||||
if type(access_token) != str or len(access_token) < 10: return False, "invalid access_token"
|
||||
r = requests.get("https://oauth2.googleapis.com/tokeninfo", {"access_token": access_token})
|
||||
if r.status_code != 200: return False, "invalid token"
|
||||
try:
|
||||
j = r.json()
|
||||
if j.get("azp") not in settings.CHROME_APP_IDS and j.get("aud") not in settings.CHROME_APP_IDS:
|
||||
return False, f"token does not belong to valid APP_ID"
|
||||
if j.get("email") in settings.BLOCKED_EMAILS:
|
||||
return False, f"email '{j.get('email')}' not allowed"
|
||||
if j.get("email_verified") != "true":
|
||||
return False, f"email '{j.get('email')}' not verified"
|
||||
if int(j.get("expires_in", -1)) <= 0:
|
||||
return False, "Token expired"
|
||||
return True, j.get('email').lower()
|
||||
except Exception as e:
|
||||
logger.warning(f"AUTH EXCEPTION occurred: {e}")
|
||||
return False, "exception occurred"
|
||||
|
||||
|
||||
def get_user_state(email=Depends(get_user_auth)):
|
||||
with get_db() as db:
|
||||
return UserState(db, email)
|
||||
0
app/web/static/.gitkeep
Normal file
0
app/web/static/.gitkeep
Normal file
BIN
app/web/static/favicon.ico
Normal file
BIN
app/web/static/favicon.ico
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 93 KiB |
0
app/web/utils/__init__.py
Normal file
0
app/web/utils/__init__.py
Normal file
69
app/web/utils/metrics.py
Normal file
69
app/web/utils/metrics.py
Normal file
@@ -0,0 +1,69 @@
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
from prometheus_client import Counter, Gauge
|
||||
|
||||
from app.shared.db import crud
|
||||
from app.shared.db.database import get_db
|
||||
from app.shared.log import log_error
|
||||
from app.shared.task_messaging import get_redis
|
||||
|
||||
|
||||
# Custom metrics
|
||||
EXCEPTION_COUNTER = Counter(
|
||||
"exceptions",
|
||||
"Number of times a certain exception has occurred.",
|
||||
labelnames=["type"]
|
||||
)
|
||||
WORKER_EXCEPTION = Counter(
|
||||
"worker_exceptions_total",
|
||||
"Number of times a certain exception has occurred on the worker.",
|
||||
labelnames=["type", "exception", "task", "traceback"]
|
||||
)
|
||||
DISK_UTILIZATION = Gauge(
|
||||
"disk_utilization",
|
||||
"Disk utilization in GB",
|
||||
labelnames=["type"]
|
||||
)
|
||||
DATABASE_METRICS = Gauge(
|
||||
"database_metrics",
|
||||
"Database metric readings at a certain point in time",
|
||||
labelnames=["query"]
|
||||
)
|
||||
DATABASE_METRICS_COUNTER = Counter(
|
||||
"database_metrics_counter",
|
||||
"Database metrics that increase over time",
|
||||
labelnames=["query", "user"]
|
||||
)
|
||||
|
||||
|
||||
async def redis_subscribe_worker_exceptions(REDIS_EXCEPTIONS_CHANNEL: str):
|
||||
# Subscribe to Redis channel and increment the counter for each exception with info on the exception and task
|
||||
Redis = get_redis()
|
||||
PubSubExceptions = Redis.pubsub()
|
||||
PubSubExceptions.subscribe(REDIS_EXCEPTIONS_CHANNEL)
|
||||
while True:
|
||||
message = PubSubExceptions.get_message()
|
||||
if message and message["type"] == "message":
|
||||
data = json.loads(message["data"].decode("utf-8"))
|
||||
WORKER_EXCEPTION.labels(type=data["type"], exception=data["exception"], task=data["task"], traceback=data["traceback"]).inc()
|
||||
await asyncio.sleep(1)
|
||||
|
||||
|
||||
async def measure_regular_metrics(sqlite_db_url: str, repeat_in_seconds: int):
|
||||
_total, used, free = shutil.disk_usage("/")
|
||||
DISK_UTILIZATION.labels(type="used").set(used / (2**30))
|
||||
DISK_UTILIZATION.labels(type="free").set(free / (2**30))
|
||||
try:
|
||||
fs = os.stat(sqlite_db_url.replace("sqlite:///", ""))
|
||||
DISK_UTILIZATION.labels(type="database").set(fs.st_size / (2**30))
|
||||
except Exception as e: log_error(e)
|
||||
|
||||
with get_db() as db:
|
||||
DATABASE_METRICS.labels(query="count_archives").set(crud.count_archives(db))
|
||||
DATABASE_METRICS.labels(query="count_archive_urls").set(crud.count_archive_urls(db))
|
||||
DATABASE_METRICS.labels(query="count_users").set(crud.count_users(db))
|
||||
|
||||
for user in crud.count_by_user_since(db, repeat_in_seconds):
|
||||
DATABASE_METRICS_COUNTER.labels(query="count_by_user", user=user.author_id).inc(user.total)
|
||||
7
app/web/utils/misc.py
Normal file
7
app/web/utils/misc.py
Normal file
@@ -0,0 +1,7 @@
|
||||
import base64
|
||||
from fastapi.encoders import jsonable_encoder
|
||||
|
||||
def custom_jsonable_encoder(obj):
|
||||
if isinstance(obj, bytes):
|
||||
return base64.b64encode(obj).decode('utf-8')
|
||||
return jsonable_encoder(obj)
|
||||
Reference in New Issue
Block a user