diff --git a/src/db/crud.py b/src/db/crud.py index b3e444a..2484e5d 100644 --- a/src/db/crud.py +++ b/src/db/crud.py @@ -23,19 +23,22 @@ def get_task(db: Session, task_id: str, email: str): return query.first() -def search_tasks_by_url(db: Session, url: str, email: str, skip: int = 0, limit: int = 100, archived_after: datetime = None, archived_before: datetime = None): +def search_tasks_by_url(db: Session, url: str, email: str, skip: int = 0, limit: int = 100, archived_after: datetime = None, archived_before: datetime = None, absolute_search: bool = False): # searches for partial URLs, if email is * no ownership filtering happens query = base_query(db) if email != ALLOW_ANY_EMAIL: email = email.lower() groups = get_user_groups(db, email) query = query.filter(or_(models.Archive.public == True, models.Archive.author_id == email, models.Archive.group_id.in_(groups))) - query = query.filter(models.Archive.url.like(f'%{url}%')) + if absolute_search: + query = query.filter(models.Archive.url == url) + else: + query = query.filter(models.Archive.url.like(f'%{url}%')) if archived_after: query = query.filter(models.Archive.created_at >= archived_after) if archived_before: query = query.filter(models.Archive.created_at <= archived_before) - return query.offset(skip).limit(limit).all() + return query.order_by(models.Archive.created_at.desc()).offset(skip).limit(limit).all() def search_tasks_by_email(db: Session, email: str, skip: int = 0, limit: int = 100): diff --git a/src/db/schemas.py b/src/db/schemas.py index 14b5e81..28823cc 100644 --- a/src/db/schemas.py +++ b/src/db/schemas.py @@ -10,6 +10,7 @@ class ArchiveCreate(BaseModel): author_id: str | None = None group_id: str | None = None tags: set = set() + rearchive: bool = True # urls: list = [] diff --git a/src/main.py b/src/main.py index ac28d30..32d7783 100644 --- a/src/main.py +++ b/src/main.py @@ -1,5 +1,5 @@ from celery.result import AsyncResult -from fastapi import Body, FastAPI, Depends, Request, HTTPException +from fastapi import FastAPI, Depends, Request, HTTPException from fastapi.encoders import jsonable_encoder from fastapi.responses import JSONResponse, FileResponse from fastapi.staticfiles import StaticFiles @@ -25,7 +25,7 @@ load_dotenv() # Configuration ALLOWED_ORIGINS = os.environ.get("ALLOWED_ORIGINS", "chrome-extension://ondkcheoicfckabcnkdgbepofpjmjcmb,chrome-extension://ojcimmjndnlmmlgnjaeojoebaceokpdp").split(",") -VERSION = "0.5.5" +VERSION = "0.5.7" # min-version refers to the version of auto-archiver-extension on the webstore BREAKING_CHANGES = {"minVersion": "0.3.1", "message": "The latest update has breaking changes, please update the extension to the most recent version."} @@ -90,7 +90,7 @@ def search(skip: int = 0, limit: int = 100, db: Session = Depends(get_db), email return crud.search_tasks_by_email(db, email, skip=skip, limit=limit) @app.post("/tasks", status_code=201) -def archive_tasks(archive:schemas.ArchiveCreate, email = Depends(get_bearer_auth)): +def archive_tasks(archive:schemas.ArchiveCreate, email = Depends(get_bearer_auth_token_or_jwt)): archive.author_id = email url = archive.url logger.info(f"new {archive.public=} task for {email=} and {archive.group_id=}: {url}") diff --git a/src/worker.py b/src/worker.py index b68deb6..8f010a2 100644 --- a/src/worker.py +++ b/src/worker.py @@ -13,6 +13,8 @@ from db.database import SessionLocal from contextlib import contextmanager import json +from security import ALLOW_ANY_EMAIL + celery = Celery(__name__) celery.conf.broker_url = os.environ.get("CELERY_BROKER_URL", "redis://localhost:6379") celery.conf.result_backend = os.environ.get("CELERY_RESULT_BACKEND", "redis://localhost:6379") @@ -26,7 +28,7 @@ def get_db(): finally: session.close() -@celery.task(name="create_archive_task", bind=True, autoretry_for=(Exception,), retry_backoff=True, retry_kwargs={'max_retries': 5}) +@celery.task(name="create_archive_task", bind=True, autoretry_for=(Exception,), retry_backoff=True, retry_kwargs={'max_retries': 3}) def create_archive_task(self, archive_json: str): archive = schemas.ArchiveCreate.parse_raw(archive_json) @@ -36,6 +38,15 @@ def create_archive_task(self, archive_json: str): url = archive.url logger.info(f"{url=} {archive=}") + + if not archive.rearchive: + with get_db() as session: + archives = crud.search_tasks_by_url(session, url, archive.author_id, absolute_search=True) + if len(archives): + logger.info(f"Skipping {url=} as it was already archived") + # TODO: can we achieve something better than the last result? + return archives[0].result + orchestrator = choose_orchestrator(archive.group_id, archive.author_id) result = orchestrator.feed_item(Metadata().set_url(url))