adjusts limits for deployment performance

This commit is contained in:
msramalho
2026-03-02 15:04:38 +00:00
parent 01fa271415
commit 184d285d0e
5 changed files with 72 additions and 3 deletions

View File

@@ -21,6 +21,8 @@ def make_engine(database_url: str):
pool_size=15, # Increase pool size pool_size=15, # Increase pool size
max_overflow=20, # Allow more temporary connections max_overflow=20, # Allow more temporary connections
pool_recycle=1800, # Recycle connections every 30 minutes pool_recycle=1800, # Recycle connections every 30 minutes
pool_pre_ping=True, # Detect and replace stale connections
pool_timeout=30, # Timeout waiting for a connection from pool
) )
@event.listens_for(engine, "connect") @event.listens_for(engine, "connect")

View File

@@ -12,10 +12,13 @@ def get_celery(name: str = "") -> Celery:
name, name,
broker_url=get_settings().celery_broker_url, broker_url=get_settings().celery_broker_url,
result_backend=get_settings().celery_broker_url, result_backend=get_settings().celery_broker_url,
broker_connection_retry_on_startup=False, broker_connection_retry_on_startup=True,
broker_transport_options={ broker_transport_options={
"queue_order_strategy": "priority", "queue_order_strategy": "priority",
"visibility_timeout": 43200, # 12 hours - must be > longest task time_limit
}, },
result_expires=86400, # expire task results after 24 hours to prevent Redis memory buildup
worker_cancel_long_running_tasks_on_connection_loss=True,
) )

View File

@@ -16,6 +16,14 @@ from app.shared.utils.misc import get_all_urls
from app.worker.worker_log import logger, setup_celery_logger from app.worker.worker_log import logger, setup_celery_logger
# Per-task time budgets, all expressed in seconds.
#   soft limit: SoftTimeLimitExceeded is raised inside the task, giving it a chance to clean up
#   hard limit: the task is killed outright if the soft limit did not stop it
SINGLE_URL_SOFT_TIME_LIMIT = 60 * 30  # 30 min
SINGLE_URL_HARD_TIME_LIMIT = 60 * 35  # 35 min
SHEET_SOFT_TIME_LIMIT = 60 * 60 * 6  # 6 h
SHEET_HARD_TIME_LIMIT = 60 * 60 * 6.5  # 6.5 h; evaluates to a float (23400.0) because of the 6.5 factor
settings = get_settings() settings = get_settings()
celery = get_celery("worker") celery = get_celery("worker")
@@ -35,6 +43,10 @@ AA_LOGGER_ID = None
autoretry_for=(Exception,), autoretry_for=(Exception,),
retry_backoff=True, retry_backoff=True,
retry_kwargs={"max_retries": 1}, retry_kwargs={"max_retries": 1},
soft_time_limit=SINGLE_URL_SOFT_TIME_LIMIT,
time_limit=SINGLE_URL_HARD_TIME_LIMIT,
acks_late=True,
reject_on_worker_lost=True,
) )
def create_archive_task(self, archive_json: str): def create_archive_task(self, archive_json: str):
global AA_LOGGER_ID global AA_LOGGER_ID
@@ -43,6 +55,7 @@ def create_archive_task(self, archive_json: str):
# call auto-archiver # call auto-archiver
args = get_orchestrator_args(archive.group_id, False, [archive.url]) args = get_orchestrator_args(archive.group_id, False, [archive.url])
result = None result = None
orchestrator = None
try: try:
orchestrator = ArchivingOrchestrator() orchestrator = ArchivingOrchestrator()
orchestrator.logger_id = AA_LOGGER_ID # ensure single logger orchestrator.logger_id = AA_LOGGER_ID # ensure single logger
@@ -55,6 +68,8 @@ def create_archive_task(self, archive_json: str):
except Exception as e: except Exception as e:
log_error(e, "create_archive_task") log_error(e, "create_archive_task")
raise e raise e
finally:
cleanup_orchestrator(orchestrator)
assert result, f"UNABLE TO archive: {archive.url}" assert result, f"UNABLE TO archive: {archive.url}"
# prepare and insert in DB # prepare and insert in DB
@@ -67,7 +82,14 @@ def create_archive_task(self, archive_json: str):
return archive.result return archive.result
@celery.task(name="create_sheet_task", bind=True) @celery.task(
name="create_sheet_task",
bind=True,
soft_time_limit=SHEET_SOFT_TIME_LIMIT,
time_limit=SHEET_HARD_TIME_LIMIT,
acks_late=True,
reject_on_worker_lost=True,
)
def create_sheet_task(self, sheet_json: str): def create_sheet_task(self, sheet_json: str):
global AA_LOGGER_ID global AA_LOGGER_ID
sheet = schemas.SubmitSheet.model_validate_json(sheet_json) sheet = schemas.SubmitSheet.model_validate_json(sheet_json)
@@ -112,6 +134,8 @@ def create_sheet_task(self, sheet_json: str):
except SystemExit as e: except SystemExit as e:
log_error(e, "create_sheet_task: SystemExit from AA") log_error(e, "create_sheet_task: SystemExit from AA")
finally:
cleanup_orchestrator(orchestrator)
if stats["archived"] > 0: if stats["archived"] > 0:
with get_db() as session: with get_db() as session:
@@ -129,6 +153,19 @@ def create_sheet_task(self, sheet_json: str):
).model_dump() ).model_dump()
def cleanup_orchestrator(orchestrator):
    """
    Best-effort release of an orchestrator's resources so they do not leak between tasks.

    Does nothing when `orchestrator` is None or has no `extractors` attribute;
    otherwise calls `orchestrator.cleanup()`. Any Exception raised along the way
    is logged as a warning and swallowed, so cleanup never fails the caller.
    """
    if orchestrator is not None:
        try:
            # NOTE(review): presumably `extractors` only exists once setup has run,
            # so its absence means there is nothing to clean up - confirm.
            has_extractors = hasattr(orchestrator, "extractors")
            if has_extractors:
                orchestrator.cleanup()
        except Exception as err:
            logger.warning(f"Error cleaning up orchestrator: {err}")
def get_orchestrator_args( def get_orchestrator_args(
group_id: str, orchestrator_for_sheet: bool, cli_args: list = None group_id: str, orchestrator_for_sheet: bool, cli_args: list = None
) -> list: ) -> list:

View File

@@ -36,7 +36,11 @@ services:
dockerfile: docker/worker/Dockerfile dockerfile: docker/worker/Dockerfile
restart: always restart: always
env_file: .env.prod env_file: .env.prod
command: celery --app=app.worker.main.celery worker -Q high_priority,low_priority --concurrency=${CONCURRENCY} --max-tasks-per-child=100 -O fair command: celery --app=app.worker.main.celery worker -Q high_priority,low_priority --concurrency=${CONCURRENCY} --max-tasks-per-child=50 -O fair --without-heartbeat --without-mingle
deploy:
resources:
limits:
memory: ${WORKER_MEMORY_LIMIT:-4g}
volumes: volumes:
- ./logs:/aa-api/logs - ./logs:/aa-api/logs
- ./database:/aa-api/database - ./database:/aa-api/database
@@ -57,6 +61,7 @@ services:
interval: 30s interval: 30s
timeout: 10s timeout: 10s
retries: 3 retries: 3
start_period: 30s
redis: redis:
init: true init: true
@@ -64,6 +69,10 @@ services:
restart: always restart: always
env_file: .env.prod env_file: .env.prod
command: redis-server /conf/redis.conf --requirepass ${REDIS_PASSWORD} command: redis-server /conf/redis.conf --requirepass ${REDIS_PASSWORD}
deploy:
resources:
limits:
memory: ${REDIS_MEMORY_LIMIT:-1g}
volumes: volumes:
- ./redis/data:/data - ./redis/data:/data
- ./redis/config:/conf - ./redis/config:/conf
@@ -72,3 +81,4 @@ services:
interval: 30s interval: 30s
timeout: 10s timeout: 10s
retries: 3 retries: 3
start_period: 10s

View File

@@ -0,0 +1,17 @@
# Memory management - prevent Redis from consuming all system memory
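# Adjust maxmemory based on your Digital Ocean droplet size
# (rough guide: about 512mb on a 4GB droplet, about 1gb on an 8GB droplet).
# NOTE(review): the value below (1536mb) is higher than both suggestions and
# higher than the Redis container memory limit default (REDIS_MEMORY_LIMIT, 1g
# in docker-compose). Keep maxmemory below the container limit, otherwise the
# container can be OOM-killed before Redis starts evicting keys - confirm both
# values against the deployed droplet.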
maxmemory 1536mb
maxmemory-policy allkeys-lru
# Persistence - save snapshots periodically
save 900 1
save 300 10
save 60 10000
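# Enable active defragmentation to reclaim memory lost to fragmentation
# (this setting does not affect the transparent huge pages startup warning)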
activedefrag yes
# Connection limits
timeout 300
tcp-keepalive 60