feat: retry lost jobs on startup

This commit is contained in:
Felix Spöttel
2023-01-28 13:41:38 +01:00
parent a7ce71ed33
commit e995b1f2ff
6 changed files with 92 additions and 40 deletions

View File

@@ -1,8 +1,10 @@
from asyncio.log import logger
from typing import Dict, List, Optional
from uuid import UUID
from fastapi import APIRouter, Depends, FastAPI, HTTPException, Path
from pydantic import AnyHttpUrl, BaseModel
from sqlalchemy import or_
from sqlalchemy.orm import Session
import app.shared.db.dtos as dtos
@@ -17,6 +19,15 @@ celery = get_celery_binding()
api_router = APIRouter(prefix="/api/v1", dependencies=[Depends(authenticate_api_key)])
def queue_task(job: models.Job) -> None:
# queue an async transcription task.
# we use a signature here to allow full separation of
# worker processes and dependencies.
transcribe = celery.signature("app.worker.main.transcribe")
# TODO: catch delivery errors.
transcribe.delay(job.id)
@api_router.get("/")
def api_root() -> Dict:
return {}
@@ -99,3 +110,23 @@ def delete_transcript(
app.include_router(api_router)
# TODO:
# we could use `acks_late` to handle this scenario within celery itself.
# the reason this does not work well in our case is that `visibility_timeout`
# needs to be very high since whisper workers can be long running.
# doing this application-side bears the risk of poison pilling the worker though,
# implement a workaround with an acceptable trade-off. (=> retry only once?)
@app.on_event("startup")
def on_startup() -> None:
session = get_session().__next__()
jobs = (
session.query(models.Job)
.filter(or_(models.Job.status == dtos.JobStatus.processing, models.Job.status == dtos.JobStatus.create))
.order_by(models.Job.created_at)
).all()
logger.info(f"Re-queueing {len(jobs)} jobs.")
for job in jobs:
queue_task(job)