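Adds logic to check whether archiving is needed, when requested by the user: if `rearchive` is false and the exact URL was already archived, the existing result is returned instead of archiving again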

This commit is contained in:
msramalho
2023-12-12 19:14:10 +00:00
parent 3ab5477e6c
commit 6874d123eb
3 changed files with 18 additions and 3 deletions

View File

@@ -23,14 +23,17 @@ def get_task(db: Session, task_id: str, email: str):
return query.first()
def search_tasks_by_url(db: Session, url: str, email: str, skip: int = 0, limit: int = 100, archived_after: datetime = None, archived_before: datetime = None):
def search_tasks_by_url(db: Session, url: str, email: str, skip: int = 0, limit: int = 100, archived_after: datetime = None, archived_before: datetime = None, absolute_search: bool = False):
# searches for partial URLs, if email is * no ownership filtering happens
query = base_query(db)
if email != ALLOW_ANY_EMAIL:
email = email.lower()
groups = get_user_groups(db, email)
query = query.filter(or_(models.Archive.public == True, models.Archive.author_id == email, models.Archive.group_id.in_(groups)))
query = query.filter(models.Archive.url.like(f'%{url}%'))
if absolute_search:
query = query.filter(models.Archive.url == url)
else:
query = query.filter(models.Archive.url.like(f'%{url}%'))
if archived_after:
query = query.filter(models.Archive.created_at >= archived_after)
if archived_before:

View File

@@ -10,6 +10,7 @@ class ArchiveCreate(BaseModel):
author_id: str | None = None
group_id: str | None = None
tags: set = set()
rearchive: bool = True
# urls: list = []

View File

@@ -13,6 +13,8 @@ from db.database import SessionLocal
from contextlib import contextmanager
import json
from security import ALLOW_ANY_EMAIL
celery = Celery(__name__)
celery.conf.broker_url = os.environ.get("CELERY_BROKER_URL", "redis://localhost:6379")
celery.conf.result_backend = os.environ.get("CELERY_RESULT_BACKEND", "redis://localhost:6379")
@@ -26,7 +28,7 @@ def get_db():
finally: session.close()
@celery.task(name="create_archive_task", bind=True, autoretry_for=(Exception,), retry_backoff=True, retry_kwargs={'max_retries': 5})
@celery.task(name="create_archive_task", bind=True, autoretry_for=(Exception,), retry_backoff=True, retry_kwargs={'max_retries': 3})
def create_archive_task(self, archive_json: str):
archive = schemas.ArchiveCreate.parse_raw(archive_json)
@@ -36,6 +38,15 @@ def create_archive_task(self, archive_json: str):
url = archive.url
logger.info(f"{url=} {archive=}")
if not archive.rearchive:
with get_db() as session:
archives = crud.search_tasks_by_url(session, url, ALLOW_ANY_EMAIL, absolute_search=True)
if len(archives):
logger.info(f"Skipping {url=} as it was already archived")
# TODO: can we achieve something better than the last result?
return archives[0].result
orchestrator = choose_orchestrator(archive.group_id, archive.author_id)
result = orchestrator.feed_item(Metadata().set_url(url))