major refactor of structure for worker V web: docker/app/secrets/envs/...

This commit is contained in:
msramalho
2025-02-10 00:41:50 +00:00
parent a1b730bad4
commit f8c45e2d92
74 changed files with 567 additions and 525 deletions

3
app/web/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
from app.web.main import app_factory
# Expose the application factory under the name `app`.
# NOTE(review): this binds the factory function itself, not a FastAPI
# instance — presumably the ASGI server is started in factory mode; confirm.
app = app_factory

View File

View File

@@ -0,0 +1,59 @@
from typing import Dict
from fastapi import APIRouter, Depends, Request, HTTPException
from fastapi.responses import FileResponse, JSONResponse
from app.shared.config import VERSION, BREAKING_CHANGES
from app.shared.log import log_error
from app.shared.db import crud
from app.shared.schemas import ActiveUser, UsageResponse
from app.shared.db.user_state import UserState
from app.web.security import get_user_auth, bearer_security, get_user_state
from app.shared.user_groups import GroupInfo
# Router without a prefix; the endpoints below are registered on it.
default_router = APIRouter()
@default_router.get("/")
async def home(request: Request):
    """Return version information, plus the caller's groups when authenticated.

    Authentication is optional: an HTTPException raised while resolving the
    bearer token is ignored, any other error is logged, and in both cases the
    response simply lacks the "groups" entry.
    """
    # TODO: maybe split into 2 routes: one non authenticated and one authenticated for the groups info only, necessary only for the extension
    payload = {"version": VERSION, "breakingChanges": BREAKING_CHANGES}
    try:
        credentials = await bearer_security(request)
        user_email = await get_user_auth(credentials)
        payload["groups"] = crud.get_user_groups(user_email)
    except HTTPException:
        # not authenticated is fine
        pass
    except Exception as err:
        log_error(err)
    return JSONResponse(payload)
@default_router.get("/health")
async def health():
    """Answer with a constant OK payload."""
    body = {"status": "ok"}
    return JSONResponse(body)
@default_router.get("/user/active", summary="Check if the user is active and can use the tool.")
async def active(
    user: UserState = Depends(get_user_state),
) -> ActiveUser:
    """Report the `active` flag of the authenticated user's state."""
    is_active = user.active
    return {"active": is_active}
@default_router.get("/user/permissions", summary="Get the user's global 'all' permissions and the permissions for each group they belong to.")
def get_user_permissions(
    user: UserState = Depends(get_user_state),
) -> Dict[str, GroupInfo]:
    """Return the `permissions` attribute of the authenticated user's state."""
    permissions = user.permissions
    return permissions
@default_router.get("/user/usage", summary="Get the user's monthly URLs/MBs usage along with the total active sheets, breakdown by group.")
def get_user_usage(
    user: UserState = Depends(get_user_state),
) -> UsageResponse:
    """Return the usage figures of an active user.

    Raises:
        HTTPException: 403 when the user is not active.
    """
    if user.active:
        return user.usage()
    raise HTTPException(status_code=403, detail="User is not active.")
@default_router.get('/favicon.ico', include_in_schema=False)
async def favicon() -> FileResponse:
    """Serve the favicon shipped in the web package's `static` directory.

    The file is located relative to this module rather than the process
    working directory, so the endpoint keeps working wherever the server is
    started from.
    """
    from pathlib import Path

    # This module sits in <web package>/endpoints/, the icon in <web package>/static/.
    static_dir = Path(__file__).resolve().parents[1] / "static"
    return FileResponse(static_dir / "favicon.ico")

View File

@@ -0,0 +1,51 @@
import json
from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import JSONResponse
from loguru import logger
import sqlalchemy
from auto_archiver import Metadata
from sqlalchemy.orm import Session
from app.shared.aa_utils import get_all_urls
from app.shared.config import ALLOW_ANY_EMAIL
from app.shared import business_logic, schemas
from app.shared.db import crud
from app.shared.db.database import get_db_dependency
from app.web.security import token_api_key_auth
from app.shared.db import models
from app.shared.log import log_error
# Router for the endpoints served under the /interop prefix.
interoperability_router = APIRouter(prefix="/interop", tags=["Interoperability endpoints."])
# ----- endpoint to submit data archived elsewhere
@interoperability_router.post("/submit-archive", status_code=201, summary="Submit a manual archive entry, for data that was archived elsewhere.")
def submit_manual_archive(
    manual: schemas.SubmitManualArchive,
    auth=Depends(token_api_key_auth),
    db: Session = Depends(get_db_dependency)
):
    """Store an archive result that was produced outside of this service.

    The submitted result is parsed into a `Metadata` object, tagged "manual",
    attributed to `ALLOW_ANY_EMAIL` when no author is given, and stored.

    Returns:
        A 201 JSON response carrying the id of the stored archive.

    Raises:
        HTTPException: 422 when the insert fails with an integrity error.
    """
    result: Metadata = Metadata.from_json(manual.result)
    manual.author_id = manual.author_id or ALLOW_ANY_EMAIL
    manual.tags.add("manual")

    try:
        archive = schemas.ArchiveCreate(
            author_id=manual.author_id,
            url=result.get_url(),
            public=manual.public,
            group_id=manual.group_id,
            tags=manual.tags,
            id=models.generate_uuid(),
            result=json.loads(result.to_json()),
            urls=get_all_urls(result),
            store_until=business_logic.get_store_archive_until(db, manual.group_id),
        )
        db_archive = crud.store_archived_url(db, archive)
        logger.debug(f"[MANUAL ARCHIVE STORED] {db_archive.author_id} {db_archive.url}")
        return JSONResponse({"id": db_archive.id}, status_code=201)
    except sqlalchemy.exc.IntegrityError as e:
        log_error(e)
        # Chain the original error so the root cause stays visible in tracebacks.
        raise HTTPException(status_code=422, detail="Cannot insert into DB due to integrity error, likely duplicate urls.") from e

View File

@@ -0,0 +1,80 @@
from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import JSONResponse
from sqlalchemy import exc
from sqlalchemy.orm import Session
from app.shared.db.user_state import UserState
from app.shared import schemas
from app.shared.task_messaging import get_celery
from app.web.security import get_user_state
from app.shared.db import crud
from app.shared.db.database import get_db_dependency
# Router for the endpoints served under the /sheet prefix.
sheet_router = APIRouter(prefix="/sheet", tags=["Google Spreadsheet operations"])
# Celery handle used below to dispatch sheet archiving tasks.
celery = get_celery()
@sheet_router.post("/create", status_code=201, summary="Store a new Google Sheet for regular archiving.")
def create_sheet(
    sheet: schemas.SheetAdd,
    user: UserState = Depends(get_user_state),
    db: Session = Depends(get_db_dependency),
) -> schemas.SheetResponse:
    """Register a sheet for the authenticated user after checking their limits.

    Raises:
        HTTPException: 403 when the user is not in the group, 429 when the
            sheet quota is exhausted, 422 when the frequency is not allowed,
            400 when storing the sheet fails with an integrity error.
    """
    group_id = sheet.group_id
    frequency = sheet.frequency

    if not user.in_group(group_id):
        raise HTTPException(status_code=403, detail="User does not have access to this group.")
    if not user.has_quota_monthly_sheets(group_id):
        raise HTTPException(status_code=429, detail="User has reached their sheet quota for this group.")
    if not user.is_sheet_frequency_allowed(group_id, frequency):
        raise HTTPException(status_code=422, detail="Invalid frequency selected for this group.")

    try:
        created = crud.create_sheet(db, sheet.id, sheet.name, user.email, group_id, frequency)
    except exc.IntegrityError as e:
        raise HTTPException(status_code=400, detail="Sheet with this ID is already being archived.") from e
    return created
@sheet_router.get("/mine", status_code=200, summary="Get the authenticated user's Google Sheets.")
def get_user_sheets(
    user: UserState = Depends(get_user_state),
    db: Session = Depends(get_db_dependency)
) -> list[schemas.SheetResponse]:
    """Return the sheets stored for the authenticated user's email."""
    owner_email = user.email
    return crud.get_user_sheets(db, owner_email)
@sheet_router.delete("/{id}", summary="Delete a Google Sheet by ID.")
def delete_sheet(
    id: str,
    user: UserState = Depends(get_user_state),
    db: Session = Depends(get_db_dependency),
) -> schemas.TaskDelete:
    """Delete the given sheet for the authenticated user and report the outcome."""
    was_deleted = crud.delete_sheet(db, id, user.email)
    outcome = {"id": id, "deleted": was_deleted}
    return JSONResponse(outcome)
@sheet_router.post("/{id}/archive", status_code=201, summary="Trigger an archiving task for a GSheet you own.", response_description="task_id for the archiving task.")
def archive_user_sheet(
    id: str,
    user: UserState = Depends(get_user_state),
    db: Session = Depends(get_db_dependency),
) -> schemas.Task:
    """Dispatch a `create_sheet_task` for one of the authenticated user's sheets.

    Raises:
        HTTPException: 403 when the sheet is not found for this user or the
            user is not in the sheet's group, 429 when manual triggering is
            not permitted in that group.
    """
    sheet = crud.get_user_sheet(db, user.email, sheet_id=id)
    if not sheet:
        raise HTTPException(status_code=403, detail="No access to this sheet.")

    group_id = sheet.group_id
    if not user.in_group(group_id):
        raise HTTPException(status_code=403, detail="User does not have access to this group.")
    if not user.can_manually_trigger(group_id):
        raise HTTPException(status_code=429, detail="User cannot manually trigger sheet archiving in this group.")

    submission = schemas.SubmitSheet(sheet_id=id, author_id=user.email, group_id=group_id)
    task = celery.signature("create_sheet_task", args=[submission.model_dump_json()]).delay()
    return JSONResponse({"id": task.id}, status_code=201)

40
app/web/endpoints/task.py Normal file
View File

@@ -0,0 +1,40 @@
from celery.result import AsyncResult
from fastapi import APIRouter, Depends
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse
from app.shared.task_messaging import get_celery
from app.web.security import get_token_or_user_auth
from app.shared import schemas
from app.shared.log import log_error
from app.web.utils.misc import custom_jsonable_encoder
# Router for the endpoints served under the /task prefix.
task_router = APIRouter(prefix="/task", tags=["Async task operations"])
# Celery handle used below to look up task results.
celery = get_celery()
@task_router.get("/{task_id}", summary="Check the status of an async task by its id, works for URLs and Sheet tasks.")
def get_status(task_id, email=Depends(get_token_or_user_auth)) -> schemas.TaskResult:
    """Return id, status and result of a Celery task.

    A failed task, or any error while reading or encoding its result, is
    logged and reported in the payload with status "FAILURE" instead of being
    raised.
    """
    task = AsyncResult(task_id, app=celery)
    try:
        if task.status == "FAILURE":
            # *FAILURE* The task raised an exception, or has exceeded the retry limit.
            # The :attr:`result` attribute then contains the exception raised by the task.
            # https://docs.celeryq.dev/en/stable/_modules/celery/result.html#AsyncResult
            raise task.result
        response = {"id": task_id, "status": task.status, "result": task.result}
        encoded = jsonable_encoder(
            response,
            exclude_unset=True,
            custom_encoder={bytes: custom_jsonable_encoder},
        )
        return JSONResponse(encoded)
    except Exception as e:
        log_error(e)
        failure = {"id": task_id, "status": "FAILURE", "result": {"error": str(e)}}
        return JSONResponse(failure)

77
app/web/endpoints/url.py Normal file
View File

@@ -0,0 +1,77 @@
from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import JSONResponse
from datetime import datetime
from loguru import logger
from sqlalchemy.orm import Session
from app.shared.config import ALLOW_ANY_EMAIL
from app.shared import schemas
from app.shared.task_messaging import get_celery
from app.web.security import get_token_or_user_auth, get_user_state
from app.shared.db import crud
from app.shared.db.user_state import UserState
from app.shared.db.database import get_db_dependency
from urllib.parse import urlparse
# Router for the endpoints served under the /url prefix.
url_router = APIRouter(prefix="/url", tags=["Single URL operations"])
# Celery handle used below to dispatch URL archiving tasks.
celery = get_celery()
@url_router.post("/archive", status_code=201, summary="Submit a single URL archive request, starts an archiving task.", response_description="task_id for the archiving task, will match the archive id.")
def archive_url(
    archive: schemas.ArchiveTrigger,
    email=Depends(get_token_or_user_auth),
    db: Session = Depends(get_db_dependency)
) -> schemas.Task:
    """Validate an archive request and dispatch a `create_archive_task`.

    Group membership and monthly quotas are only checked when the caller is
    not authenticated as `ALLOW_ANY_EMAIL`.

    Raises:
        HTTPException: 400 for a URL without scheme or host, 403 when the
            user is not in the requested group, 429 when a monthly URL or MB
            quota is exhausted.
    """
    archive.author_id = email
    logger.info(f"new {archive.public=} task for {email=} and {archive.group_id=}: {archive.url}")

    parsed_url = urlparse(archive.url)
    if not (parsed_url.scheme and parsed_url.netloc):
        raise HTTPException(status_code=400, detail="Invalid URL received.")

    if email != ALLOW_ANY_EMAIL:
        user = UserState(db, email)
        group_id = archive.group_id
        if group_id and not user.in_group(group_id):
            raise HTTPException(status_code=403, detail="User does not have access to this group.")
        if not user.has_quota_max_monthly_urls(group_id):
            raise HTTPException(status_code=429, detail="User has reached their monthly URL quota.")
        if not user.has_quota_max_monthly_mbs(group_id):
            raise HTTPException(status_code=429, detail="User has reached their monthly MB quota.")

    archive_create = schemas.ArchiveCreate(**archive.model_dump())
    serialized = archive_create.model_dump_json()
    task = celery.signature("create_archive_task", args=[serialized]).delay()
    return JSONResponse(schemas.Task(id=task.id).model_dump(), status_code=201)
@url_router.get("/search", summary="Search for archive entries by URL.")
def search_by_url(
    url: str, skip: int = 0, limit: int = 25,
    archived_after: datetime = None, archived_before: datetime = None,
    db: Session = Depends(get_db_dependency),
    email: str = Depends(get_token_or_user_auth)
) -> list[schemas.ArchiveResult]:
    """Search archives by URL, with paging and an optional date window.

    Raises:
        HTTPException: 403 when a regular user has neither `read` nor
            `read_public` set.
    """
    if email != ALLOW_ANY_EMAIL:
        user = UserState(db, email)
        if not (user.read or user.read_public):
            raise HTTPException(status_code=403, detail="User does not have read access.")

    return crud.search_archives_by_url(
        db,
        url.strip(),
        email,
        skip=skip,
        limit=limit,
        archived_after=archived_after,
        archived_before=archived_before,
    )
@url_router.delete("/{id}", summary="Delete a single URL archive by id.")
def delete_task(
    id:str,
    user: UserState = Depends(get_user_state),
    db: Session = Depends(get_db_dependency)
) -> schemas.TaskDelete:
    """Soft-delete an archive on behalf of the authenticated user and report the outcome."""
    logger.info(f"deleting url archive task {id} request by {user.email}")
    was_deleted = crud.soft_delete_task(db, id, user.email)
    outcome = {"id": id, "deleted": was_deleted}
    return JSONResponse(outcome)

179
app/web/events.py Normal file
View File

@@ -0,0 +1,179 @@
import asyncio
from collections import defaultdict
import datetime
import logging
import alembic.config
from fastapi import FastAPI
from contextlib import asynccontextmanager
from fastapi_utils.tasks import repeat_every
from loguru import logger
from fastapi_mail import FastMail, MessageSchema, MessageType
from app.shared.db import crud, models
from app.shared.db.database import get_db, get_db_async, make_engine, wal_checkpoint
from app.shared import schemas
from app.shared.settings import get_settings
from app.shared.task_messaging import get_celery
from app.web.utils.metrics import measure_regular_metrics, redis_subscribe_worker_exceptions
# Celery handle used by the cron jobs below to dispatch sheet tasks.
celery = get_celery()
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run the startup sequence, hand control to the app, then log shutdown.

    Startup creates the model tables, runs the alembic upgrade to head,
    disables the uvicorn access logger, starts the background jobs enabled in
    the settings, upserts the user groups and calls `wal_checkpoint()`.
    """
    # see https://fastapi.tiangolo.com/advanced/events/#lifespan

    # The event loop only keeps weak references to tasks, so a task whose
    # result is discarded may be garbage-collected before it finishes. This
    # list lives in the suspended generator frame until shutdown and keeps
    # every background job alive.
    background_tasks = []

    def spawn(coroutine):
        background_tasks.append(asyncio.create_task(coroutine))

    # STARTUP
    logger.debug("HERE 00")
    engine = make_engine(get_settings().DATABASE_PATH)
    models.Base.metadata.create_all(bind=engine)
    logger.debug("HERE 01")
    alembic.config.main(prog="alembic", argv=['--raiseerr', 'upgrade', 'head'])
    logger.debug("HERE 02")
    logging.getLogger("uvicorn.access").disabled = True  # loguru

    spawn(redis_subscribe_worker_exceptions(get_settings().REDIS_EXCEPTIONS_CHANNEL))
    spawn(repeat_measure_regular_metrics())

    with get_db() as db:
        crud.upsert_user_groups(db)

    # setup archive cronjobs
    if get_settings().CRON_ARCHIVE_SHEETS:
        spawn(archive_hourly_sheets_cronjob())
        spawn(archive_daily_sheets_cronjob())
    else:
        logger.warning("[CRON] Sheet archive cronjobs are disabled.")

    if get_settings().CRON_DELETE_STALE_SHEETS:
        spawn(delete_stale_sheets())
    else:
        logger.warning("[CRON] Delete stale sheets cronjob is disabled.")

    if get_settings().CRON_DELETE_SCHEDULED_ARCHIVES:
        spawn(notify_about_expired_archives())
    else:
        logger.warning("[CRON] Delete scheduled archives cronjob is disabled.")

    wal_checkpoint()

    yield  # separates startup from shutdown instructions

    # SHUTDOWN
    logger.info("shutting down")
# CRON JOBS
@repeat_every(seconds=get_settings().REPEAT_COUNT_METRICS_SECONDS, on_exception=logger.error)
async def repeat_measure_regular_metrics():
    """Periodic job that refreshes the regular metrics."""
    database_path = get_settings().DATABASE_PATH
    period_seconds = get_settings().REPEAT_COUNT_METRICS_SECONDS
    await measure_regular_metrics(database_path, period_seconds)
@repeat_every(seconds=60, wait_first=120, on_exception=logger.error)
async def archive_hourly_sheets_cronjob():
    """Every minute, run the "hourly" sheet job for the current minute."""
    current_minute = datetime.datetime.now().minute
    await archive_sheets_cronjob("hourly", 60, current_minute)
@repeat_every(seconds=3600, wait_first=120, on_exception=logger.error)
async def archive_daily_sheets_cronjob():
    """Every hour, run the "daily" sheet job for the current hour."""
    current_hour = datetime.datetime.now().hour
    await archive_sheets_cronjob("daily", 24, current_hour)
async def archive_sheets_cronjob(frequency: str, interval: int, current_time_unit: int):
    """Dispatch a `create_sheet_task` for every sheet selected for this tick.

    Args:
        frequency: Sheet frequency to select, e.g. "hourly" or "daily".
        interval: Number of slots in the cycle (60 or 24 for the callers in this module).
        current_time_unit: The slot being processed (current minute or hour).
    """
    triggered_jobs = []
    async with get_db_async() as db:
        sheets = await crud.get_sheets_by_id_hash(db, frequency, interval, current_time_unit)
        for sheet in sheets:
            submission = schemas.SubmitSheet(sheet_id=sheet.id, author_id=sheet.author_id, group_id=sheet.group_id)
            signature = celery.signature("create_sheet_task", args=[submission.model_dump_json()])
            task = signature.apply_async()
            triggered_jobs.append({"sheet_id": sheet.id, "task_id": task.id})
    logger.info(f"[CRON {frequency.upper()}:{current_time_unit}] Triggered {len(triggered_jobs)} sheet tasks: {triggered_jobs}")
    # TODO: on exception should logerror but also prometheus counter
# DELETE_SCHEDULED_ARCHIVES_NOTIFY_DAYS converted from days to seconds; used as
# the repeat interval of the notification job and to delay the deletion job.
DELETE_WINDOW = get_settings().DELETE_SCHEDULED_ARCHIVES_NOTIFY_DAYS * 24 * 60 * 60
# Cron job: looks up archives via crud.find_by_store_until using a cut-off of
# now + DELETE_SCHEDULED_ARCHIVES_NOTIFY_DAYS, groups them by author, e-mails each
# author a list of their affected archives, and starts the delayed deletion job.
# Repeats every DELETE_WINDOW seconds after an initial 180 second wait.
@repeat_every(seconds=DELETE_WINDOW, wait_first=180, on_exception=logger.error)
async def notify_about_expired_archives():
# Cut-off passed to the lookup: the current time plus the notice period.
notify_from = datetime.datetime.now() + datetime.timedelta(days=get_settings().DELETE_SCHEDULED_ARCHIVES_NOTIFY_DAYS)
async with get_db_async() as db:
# presumably returns archives whose store-until date falls before the cut-off — confirm in crud
scheduled_deletions = await crud.find_by_store_until(db, notify_from)
# Group the archives by author so each user receives a single e-mail.
user_archives = defaultdict(list)
for archive in scheduled_deletions:
user_archives[archive.author_id].append(archive)
if user_archives:
fastmail = FastMail(get_settings().MAIL_CONFIG)
# notify users
for email in user_archives:
# NOTE(review): a.url is interpolated into the HTML body without escaping.
list_of_archives = "\n".join([f'{a.url},{a.id}<br/>' for a in user_archives[email]])
# TODO: how can users download them in bulk?
message = MessageSchema(
subject="Auto Archiver: Archives Scheduled for Deletion",
recipients=[email],
body=f"""
<html>
<body>
<p>Hi {email},</p>
<p>Some of your archives will be deleted in the next {get_settings().DELETE_SCHEDULED_ARCHIVES_NOTIFY_DAYS} days, as they are reaching their expiration date according to our retention policy for their groups.</p>
<p>If you want to preserve any, make sure to download them now.</p>
<p>Here is a CSV list of URLs:</p>
<code>
url,archive_id<br/>
{list_of_archives}
</code>
<p>Best,<br>The Auto Archiver team</p>
</body>
</html>
""",
subtype=MessageType.html
)
await fastmail.send_message(message)
logger.info(f"[CRON] Email sent to {email} about {len(user_archives[email])} scheduled archives deletion.")
# now schedule the deletion event
# NOTE(review): the task returned by create_task is not stored; asyncio only keeps a
# weak reference to tasks, so consider holding a reference until it completes.
asyncio.create_task(delete_expired_archives())
@repeat_every(max_repetitions=1, wait_first=DELETE_WINDOW - (60 * 60), seconds=0, on_exception=logger.error)
async def delete_expired_archives():
    """One-shot delayed job that soft-deletes expired archives and logs the count."""
    async with get_db_async() as db:
        count_deleted = await crud.soft_delete_expired_archives(db)
    if not count_deleted:
        return
    logger.info(f"[CRON] Deleted {count_deleted} archives.")
@repeat_every(seconds=86400, wait_first=150, on_exception=logger.error)
async def delete_stale_sheets():
    """Daily job: remove stale sheets and e-mail each owner the removed ones.

    `crud.delete_stale_sheets` is iterated as a mapping from owner e-mail to
    the sheets removed for that owner; nothing is sent when it is empty.
    """
    import html

    STALE_DAYS = get_settings().DELETE_STALE_SHEETS_DAYS
    logger.info(f"[CRON] Deleting stale sheets older than {STALE_DAYS} days.")
    async with get_db_async() as db:
        user_sheets = await crud.delete_stale_sheets(db, STALE_DAYS)
    if not user_sheets:
        return

    fastmail = FastMail(get_settings().MAIL_CONFIG)
    # notify users
    for email in user_sheets:
        # Sheet ids and names are user-provided, so escape them before they
        # are embedded in the HTML body.
        list_of_sheets = "\n".join(
            f'<li><a href="https://docs.google.com/spreadsheets/d/{html.escape(str(s.id))}">{html.escape(str(s.name))}</a></li>'
            for s in user_sheets[email]
        )
        message = MessageSchema(
            subject="Auto Archiver: Stale Sheets Removed",
            recipients=[email],
            body=f"""
<html>
<body>
<p>Hi {email},</p>
<p>Your stale sheets have been removed from our system as no new URL was archived in the past {STALE_DAYS} days:</p>
<ul>
{list_of_sheets}
</ul>
<p>You can always re-add them at https://auto-archiver.bellingcat.com/.</p>
<p>Best,<br>The Auto Archiver team</p>
</body>
</html>
""",
            subtype=MessageType.html
        )
        await fastmail.send_message(message)
        logger.info(f"[CRON] Email sent to {email} about stale sheets deletion.")

174
app/web/main.py Normal file
View File

@@ -0,0 +1,174 @@
import os
from celery.result import AsyncResult
from fastapi import FastAPI, Depends, HTTPException
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
from prometheus_fastapi_instrumentator import Instrumentator
from datetime import datetime
from sqlalchemy.orm import Session
from loguru import logger
from app.shared.log import log_error
from app.web.middleware import logging_middleware
from app.shared import schemas
from app.shared.task_messaging import get_celery
from app.shared.db import crud
from app.web.security import get_user_auth, token_api_key_auth, get_token_or_user_auth
from app.shared.config import VERSION, API_DESCRIPTION
from app.shared.db.database import get_db_dependency
from app.web.events import lifespan
from app.shared.settings import get_settings
from app.web.endpoints.default import default_router
from app.web.endpoints.url import url_router
from app.web.endpoints.sheet import sheet_router
from app.web.endpoints.task import task_router
from app.web.endpoints.interoperability import interoperability_router
# Celery handle used by the deprecated task and sheet routes inside app_factory.
celery = get_celery()
# Build and return the FastAPI application: CORS and logging middleware, the
# endpoint routers, the authenticated Prometheus /metrics endpoint, an optional
# static mount for locally stored archives, and the deprecated legacy routes.
# Note: the default for `settings` is evaluated once, when this function is defined.
def app_factory(settings = get_settings()):
app = FastAPI(
title="Auto-Archiver API",
description=API_DESCRIPTION,
version=VERSION,
contact={"name": "GitHub", "url": "https://github.com/bellingcat/auto-archiver-api"},
lifespan=lifespan
)
app.add_middleware(
CORSMiddleware,
allow_origins=settings.ALLOWED_ORIGINS,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
app.middleware("http")(logging_middleware)
app.include_router(default_router)
app.include_router(url_router)
app.include_router(sheet_router)
app.include_router(task_router)
app.include_router(interoperability_router)
# prometheus exposed in /metrics with authentication
Instrumentator(should_group_status_codes=False, excluded_handlers=["/metrics", "/health", "/openapi.json", "/favicon.ico"]).instrument(app).expose(app, dependencies=[Depends(token_api_key_auth)])
# TODO: recheck this for security, currently only needed for when local_storage is used
local_dir = settings.SERVE_LOCAL_ARCHIVE
# Fall back to the same path with "/app" replaced by "." when only that directory exists.
if not os.path.isdir(local_dir) and os.path.isdir(local_dir.replace("/app", ".")):
local_dir = local_dir.replace("/app", ".")
if len(settings.SERVE_LOCAL_ARCHIVE) > 1 and os.path.isdir(local_dir):
logger.warning(f"MOUNTing local archive {settings.SERVE_LOCAL_ARCHIVE}")
app.mount(settings.SERVE_LOCAL_ARCHIVE, StaticFiles(directory=local_dir), name=settings.SERVE_LOCAL_ARCHIVE)
# -----Submit URL and manipulate tasks. Bearer protected below
# The fixed /tasks/... paths are registered before the parameterised /tasks/{task_id} routes.
@app.get("/tasks/search-url", response_model=list[schemas.Archive], deprecated=True) # DEPRECATED
def search_by_url(url: str, skip: int = 0, limit: int = 100, archived_after: datetime = None, archived_before: datetime = None, db: Session = Depends(get_db_dependency), email=Depends(get_token_or_user_auth)):
return crud.search_archives_by_url(db, url.strip(), email, skip=skip, limit=limit, archived_after=archived_after, archived_before=archived_before)
@app.get("/tasks/sync", response_model=list[schemas.Archive], deprecated=True) # DEPRECATED
def search(skip: int = 0, limit: int = 100, db: Session = Depends(get_db_dependency), email=Depends(get_user_auth)):
return crud.search_archives_by_email(db, email, skip=skip, limit=limit)
@app.post("/tasks", status_code=201, deprecated=True) # DEPRECATED
def archive_tasks(archive: schemas.ArchiveCreate, email=Depends(get_token_or_user_auth)):
archive.author_id = email
url = archive.url
logger.info(f"new {archive.public=} task for {email=} and {archive.group_id=}: {url}")
# Minimal sanity check only: the URL must be a string longer than 5 characters.
if type(url) != str or len(url) <= 5:
raise HTTPException(status_code=422, detail=f"Invalid URL received: {url}")
logger.info("creating task")
task = celery.signature("create_archive_task", args=[archive.model_dump_json()]).delay()
return JSONResponse({"id": task.id})
@app.get("/archive/{task_id}", deprecated=True) # DEPRECATED
def lookup(task_id, db: Session = Depends(get_db_dependency), email=Depends(get_token_or_user_auth)):
return crud.get_archive(db, task_id, email)
@app.get("/tasks/{task_id}", deprecated=True) # DEPRECATED
def get_status(task_id, email=Depends(get_token_or_user_auth)):
logger.info(f"status check for user {email} task {task_id}")
task = AsyncResult(task_id, app=celery)
try:
if task.status == "FAILURE":
# *FAILURE* The task raised an exception, or has exceeded the retry limit.
# The :attr:`result` attribute then contains the exception raised by the task.
# https://docs.celeryq.dev/en/stable/_modules/celery/result.html#AsyncResult
raise task.result
response = {
"id": task_id,
"status": task.status,
"result": task.result
}
return JSONResponse(jsonable_encoder(response, exclude_unset=True))
except Exception as e:
# Failures are logged and reported in the payload rather than raised.
log_error(e)
return JSONResponse({
"id": task_id,
"status": "FAILURE",
"result": {"error": str(e)}
})
@app.delete("/tasks/{task_id}", deprecated=True) # DEPRECATED
def delete_task(task_id, db: Session = Depends(get_db_dependency), email=Depends(get_user_auth)):
logger.info(f"deleting task {task_id} request by {email}")
return JSONResponse({
"id": task_id,
"deleted": crud.soft_delete_task(db, task_id, email)
})
# ----- Google Sheets Logic
@app.post("/sheet", status_code=201, deprecated=True) # DEPRECATED
def archive_sheet(sheet: schemas.SubmitSheet, email=Depends(get_user_auth), db: Session = Depends(get_db_dependency)):
logger.info(f"SHEET TASK for {sheet=}")
sheet.author_id = email
if not crud.is_user_in_group(db, email, sheet.group_id):
raise HTTPException(status_code=403, detail="User does not have access to this group.")
task = celery.signature("create_sheet_task", args=[sheet.model_dump_json()]).delay()
return JSONResponse({"id": task.id})
@app.post("/sheet_service", status_code=201, deprecated=True) # DEPRECATED
def archive_sheet_service(sheet: schemas.SubmitSheet, auth=Depends(token_api_key_auth)):
logger.info(f"SHEET TASK for {sheet=}")
sheet.author_id = sheet.author_id or "api-endpoint"
task = celery.signature("create_sheet_task", args=[sheet.model_dump_json()]).delay()
return JSONResponse({"id": task.id})
# ----- endpoint to submit data archived elsewhere
@app.post("/submit-archive", status_code=201, deprecated=True) # DEPRECATED
def submit_manual_archive(manual: schemas.SubmitManual, auth=Depends(token_api_key_auth)):
# Always answers 410 and points callers to /interop/submit-archive.
raise HTTPException(status_code=410, detail="This endpoint is deprecated. Use /interop/submit-archive instead.")
# result = Metadata.from_json(manual.result)
# logger.info(f"MANUAL SUBMIT {result.get_url()} {manual.author_id}")
# manual.tags.add("manual")
# try:
# # archive_id = insert_result_into_db(result, manual.tags, manual.public, manual.group_id, manual.author_id, models.generate_uuid())
# except sqlalchemy.exc.IntegrityError as e:
# log_error(e)
# raise HTTPException(status_code=422, detail=f"Cannot insert into DB due to integrity error")
# return JSONResponse({"id": archive_id})
return app

17
app/web/middleware.py Normal file
View File

@@ -0,0 +1,17 @@
from loguru import logger
from fastapi import Request
from app.shared.log import log_error
async def logging_middleware(request: Request, call_next):
    """Log every request with its outcome and count unhandled exceptions.

    Args:
        request: The incoming request.
        call_next: Callable that forwards the request down the stack.

    Returns:
        The response produced by `call_next`.

    Raises:
        Exception: whatever `call_next` raised, after it has been counted
            and logged.
    """
    try:
        response = await call_next(request)
        logger.info(f"{request.client.host}:{request.client.port} {request.method} {request.url._url} - HTTP {response.status_code}")
        return response
    except Exception as e:
        # Imported here, as in the original code, but through the same
        # `app.web` package path used by the rest of the code base.
        from app.web.utils.metrics import EXCEPTION_COUNTER
        EXCEPTION_COUNTER.labels(type=e.__class__.__name__).inc()
        logger.info(f"{request.client.host}:{request.client.port} {request.method} {request.url._url} - {e.__class__.__name__} {e}")
        log_error(e)
        raise

83
app/web/security.py Normal file
View File

@@ -0,0 +1,83 @@
from loguru import logger
import requests, secrets
from fastapi import HTTPException, status, Depends
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from app.shared.config import ALLOW_ANY_EMAIL
from app.shared.settings import get_settings
from app.shared.db.database import get_db
from app.shared.db.user_state import UserState
# Settings read once at import time; used below for the API token, app ids and blocked emails.
settings = get_settings()
# Bearer scheme used to extract the Authorization header credentials.
bearer_security = HTTPBearer()
def secure_compare(token, api_key):
    """Return True when both strings are equal, using `secrets.compare_digest`.

    Both arguments are encoded as UTF-8 before the comparison.
    """
    token_bytes = token.encode("utf8")
    key_bytes = api_key.encode("utf8")
    return secrets.compare_digest(token_bytes, key_bytes)
# Factory method to create an authentication dependency for a specific key
def api_key_auth(api_key):
    """Create an authentication dependency bound to one static API key.

    Args:
        api_key: The expected bearer token, at least 20 characters long.

    Returns:
        An async callable `auth(bearer, auto_error=True)` that returns True
        when the bearer credentials match the key. On a mismatch it raises a
        401 HTTPException, or returns False when called with
        `auto_error=False`.

    Raises:
        AssertionError: when the key is shorter than 20 characters.
    """
    # Explicit raise instead of `assert`, so the check still runs under
    # `python -O`; the exception type is kept for existing callers.
    if len(api_key) < 20:
        raise AssertionError("Invalid API key, must be at least 20 chars")

    def _always_raise():
        return True

    # `auto_error` is resolved through a constant dependency: as a plain
    # defaulted parameter FastAPI would expose it as a query parameter and
    # let the client disable the 401. Direct Python callers can still pass
    # `auto_error=False`.
    async def auth(
        bearer: HTTPAuthorizationCredentials = Depends(bearer_security),
        auto_error=Depends(_always_raise),
    ):
        is_correct = secure_compare(bearer.credentials, api_key)
        if is_correct:
            return True
        if auto_error:
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="Wrong auth credentials",
            )
        return False

    return auth
# --------------------- Token Auth for AA itself to query the API, AA setup tool and Prometheus
# Authentication dependency bound to the configured API_BEARER_TOKEN.
token_api_key_auth = api_key_auth(settings.API_BEARER_TOKEN)
async def get_token_or_user_auth(credentials: HTTPAuthorizationCredentials = Depends(bearer_security)):
    """Authenticate with the static API token or, failing that, as a user.

    Returns `ALLOW_ANY_EMAIL` for the static token, otherwise whatever
    `get_user_auth` returns for the same credentials.
    """
    # tries to use the static API_KEY and defaults to google JWT auth
    matches_static_token = await token_api_key_auth(credentials, auto_error=False)
    if not matches_static_token:
        return await get_user_auth(credentials)
    return ALLOW_ANY_EMAIL
async def get_user_auth(credentials: HTTPAuthorizationCredentials = Depends(bearer_security)):
    """Return the lower-cased email of the user owning the bearer token.

    Raises:
        HTTPException: 401 carrying the failure reason when
            `authenticate_user` rejects the token.
    """
    # validates the Bearer token in the case that it requires it
    valid_user, info = authenticate_user(credentials.credentials)
    if not valid_user:
        logger.debug(f"TOKEN FAILURE: {valid_user=} {info=}")
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail=info,
            headers={"WWW-Authenticate": "Bearer"},
        )
    return info.lower()
def authenticate_user(access_token, timeout=10):
    """Validate a Google access token against the tokeninfo endpoint.

    Args:
        access_token: The token to validate; anything that is not a string
            of at least 10 characters is rejected without a network call.
        timeout: Seconds to wait for the tokeninfo endpoint, so a stalled
            connection cannot block the caller indefinitely.

    Returns:
        `(True, email)` with the lower-cased email when the token is valid,
        otherwise `(False, reason)`.
    """
    # https://cloud.google.com/docs/authentication/token-types#access
    if not isinstance(access_token, str) or len(access_token) < 10:
        return False, "invalid access_token"
    try:
        r = requests.get(
            "https://oauth2.googleapis.com/tokeninfo",
            {"access_token": access_token},
            timeout=timeout,
        )
    except requests.RequestException as e:
        # Network failures and timeouts are reported like any other
        # validation error instead of propagating to the caller.
        logger.warning(f"AUTH EXCEPTION occurred: {e}")
        return False, "exception occurred"
    if r.status_code != 200:
        return False, "invalid token"
    try:
        j = r.json()
        if j.get("azp") not in settings.CHROME_APP_IDS and j.get("aud") not in settings.CHROME_APP_IDS:
            return False, "token does not belong to valid APP_ID"
        if j.get("email") in settings.BLOCKED_EMAILS:
            return False, f"email '{j.get('email')}' not allowed"
        if j.get("email_verified") != "true":
            return False, f"email '{j.get('email')}' not verified"
        if int(j.get("expires_in", -1)) <= 0:
            return False, "Token expired"
        return True, j.get('email').lower()
    except Exception as e:
        logger.warning(f"AUTH EXCEPTION occurred: {e}")
        return False, "exception occurred"
def get_user_state(email=Depends(get_user_auth)):
    """Build the `UserState` of the authenticated user.

    The state is created inside a `get_db()` context and returned from
    within it.
    """
    with get_db() as session:
        state = UserState(session, email)
        return state

0
app/web/static/.gitkeep Normal file
View File

BIN
app/web/static/favicon.ico Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 93 KiB

View File

69
app/web/utils/metrics.py Normal file
View File

@@ -0,0 +1,69 @@
import asyncio
import json
import os
import shutil
from prometheus_client import Counter, Gauge
from app.shared.db import crud
from app.shared.db.database import get_db
from app.shared.log import log_error
from app.shared.task_messaging import get_redis
# Custom metrics
# Counts exceptions, labelled by exception class name.
EXCEPTION_COUNTER = Counter(
"exceptions",
"Number of times a certain exception has occurred.",
labelnames=["type"]
)
# Counts exceptions reported by workers.
# NOTE(review): "exception" and "traceback" are free-text labels, which may
# create many distinct time series — confirm this is intended.
WORKER_EXCEPTION = Counter(
"worker_exceptions_total",
"Number of times a certain exception has occurred on the worker.",
labelnames=["type", "exception", "task", "traceback"]
)
# Disk figures in GB; the "type" label distinguishes the individual readings.
DISK_UTILIZATION = Gauge(
"disk_utilization",
"Disk utilization in GB",
labelnames=["type"]
)
# Point-in-time database readings, one series per query name.
DATABASE_METRICS = Gauge(
"database_metrics",
"Database metric readings at a certain point in time",
labelnames=["query"]
)
# Monotonically increasing database metrics, per query name and user.
DATABASE_METRICS_COUNTER = Counter(
"database_metrics_counter",
"Database metrics that increase over time",
labelnames=["query", "user"]
)
async def redis_subscribe_worker_exceptions(REDIS_EXCEPTIONS_CHANNEL: str):
    """Poll a Redis channel forever and count the worker exceptions it carries.

    Each "message" payload is decoded as UTF-8 JSON and its "type",
    "exception", "task" and "traceback" fields label one increment of
    `WORKER_EXCEPTION`. The channel is polled once per second.
    """
    # Subscribe to Redis channel and increment the counter for each exception with info on the exception and task
    pubsub = get_redis().pubsub()
    pubsub.subscribe(REDIS_EXCEPTIONS_CHANNEL)
    while True:
        message = pubsub.get_message()
        if message and message["type"] == "message":
            payload = json.loads(message["data"].decode("utf-8"))
            counter = WORKER_EXCEPTION.labels(
                type=payload["type"],
                exception=payload["exception"],
                task=payload["task"],
                traceback=payload["traceback"],
            )
            counter.inc()
        await asyncio.sleep(1)
async def measure_regular_metrics(sqlite_db_url: str, repeat_in_seconds: int):
    """Refresh the disk and database metrics.

    Args:
        sqlite_db_url: Database URL; the "sqlite:///" marker is removed to
            obtain the file path whose size is reported.
        repeat_in_seconds: Passed to `crud.count_by_user_since` to select
            the per-user counts to add.
    """
    gib = 2**30

    disk = shutil.disk_usage("/")
    DISK_UTILIZATION.labels(type="used").set(disk.used / gib)
    DISK_UTILIZATION.labels(type="free").set(disk.free / gib)

    try:
        database_file = sqlite_db_url.replace("sqlite:///", "")
        database_size = os.stat(database_file).st_size
        DISK_UTILIZATION.labels(type="database").set(database_size / gib)
    except Exception as e:
        log_error(e)

    with get_db() as db:
        DATABASE_METRICS.labels(query="count_archives").set(crud.count_archives(db))
        DATABASE_METRICS.labels(query="count_archive_urls").set(crud.count_archive_urls(db))
        DATABASE_METRICS.labels(query="count_users").set(crud.count_users(db))

        for row in crud.count_by_user_since(db, repeat_in_seconds):
            DATABASE_METRICS_COUNTER.labels(query="count_by_user", user=row.author_id).inc(row.total)

7
app/web/utils/misc.py Normal file
View File

@@ -0,0 +1,7 @@
import base64
from fastapi.encoders import jsonable_encoder
def custom_jsonable_encoder(obj):
    """Encode bytes as a base64 string; defer everything else to `jsonable_encoder`."""
    if not isinstance(obj, bytes):
        return jsonable_encoder(obj)
    encoded = base64.b64encode(obj)
    return encoded.decode('utf-8')