From 6874d123ebe852e2a41dbf3634c4efa35f1f6b6f Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 12 Dec 2023 19:14:10 +0000
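Subject: [PATCH 1/4] adds logic to skip archiving when the URL was already
 archived, if requested by the user

Adds a `rearchive` flag (default True) to ArchiveCreate. When it is False,
the worker looks for existing archives with exactly the same URL and returns
the first match's result instead of archiving again. Also adds an
`absolute_search` option to search_tasks_by_url for exact URL matching and
lowers the task's max_retries from 5 to 3.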

---
 src/db/crud.py    |  7 +++++--
 src/db/schemas.py |  1 +
 src/worker.py     | 13 ++++++++++++-
 3 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/src/db/crud.py b/src/db/crud.py
index b3e444a..b202a5b 100644
--- a/src/db/crud.py
+++ b/src/db/crud.py
@@ -23,14 +23,17 @@ def get_task(db: Session, task_id: str, email: str):
     return query.first()
 
 
-def search_tasks_by_url(db: Session, url: str, email: str, skip: int = 0, limit: int = 100, archived_after: datetime = None, archived_before: datetime = None):
+def search_tasks_by_url(db: Session, url: str, email: str, skip: int = 0, limit: int = 100, archived_after: datetime = None, archived_before: datetime = None, absolute_search: bool = False):
     # searches for partial URLs, if email is * no ownership filtering happens
     query = base_query(db)
     if email != ALLOW_ANY_EMAIL:
         email = email.lower()
         groups = get_user_groups(db, email)
         query = query.filter(or_(models.Archive.public == True, models.Archive.author_id == email, models.Archive.group_id.in_(groups)))
-    query = query.filter(models.Archive.url.like(f'%{url}%'))
+    if absolute_search:
+        query = query.filter(models.Archive.url == url)
+    else:
+        query = query.filter(models.Archive.url.like(f'%{url}%'))
     if archived_after:
         query = query.filter(models.Archive.created_at >= archived_after)
     if archived_before:
diff --git a/src/db/schemas.py b/src/db/schemas.py
index 14b5e81..28823cc 100644
--- a/src/db/schemas.py
+++ b/src/db/schemas.py
@@ -10,6 +10,7 @@ class ArchiveCreate(BaseModel):
     author_id: str | None = None
     group_id: str | None = None
     tags: set = set()
+    rearchive: bool = True
     # urls: list = []
 
 
diff --git a/src/worker.py b/src/worker.py
index b68deb6..457bc9c 100644
--- a/src/worker.py
+++ b/src/worker.py
@@ -13,6 +13,8 @@ from db.database import SessionLocal
 from contextlib import contextmanager
 import json
 
+from security import ALLOW_ANY_EMAIL
+
 celery = Celery(__name__)
 celery.conf.broker_url = os.environ.get("CELERY_BROKER_URL", "redis://localhost:6379")
 celery.conf.result_backend = os.environ.get("CELERY_RESULT_BACKEND", "redis://localhost:6379")
@@ -26,7 +28,7 @@ def get_db():
     finally: session.close()
 
 
-@celery.task(name="create_archive_task", bind=True, autoretry_for=(Exception,), retry_backoff=True, retry_kwargs={'max_retries': 5})
+@celery.task(name="create_archive_task", bind=True, autoretry_for=(Exception,), retry_backoff=True, retry_kwargs={'max_retries': 3})
 def create_archive_task(self, archive_json: str):
     archive = schemas.ArchiveCreate.parse_raw(archive_json)
 
@@ -36,6 +38,15 @@ def create_archive_task(self, archive_json: str):
 
     url = archive.url
     logger.info(f"{url=} {archive=}")
+
+    if not archive.rearchive:
+        with get_db() as session:
+            archives = crud.search_tasks_by_url(session, url, ALLOW_ANY_EMAIL, absolute_search=True)
+            if len(archives):
+                logger.info(f"Skipping {url=} as it was already archived")
+                # TODO: can we achieve something better than the last result?
+                return archives[0].result
+
     orchestrator = choose_orchestrator(archive.group_id, archive.author_id)
     result = orchestrator.feed_item(Metadata().set_url(url))
 

From bb4ac31c126dc551a922b59863ea13d9580acc2b Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 12 Dec 2023 19:17:24 +0000
Subject: [PATCH 2/4] version updated to 0.5.6

---
 src/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main.py b/src/main.py
index 405e328..651f210 100644
--- a/src/main.py
+++ b/src/main.py
@@ -25,7 +25,7 @@ load_dotenv()
 
 # Configuration
 ALLOWED_ORIGINS = os.environ.get("ALLOWED_ORIGINS", "chrome-extension://ondkcheoicfckabcnkdgbepofpjmjcmb,chrome-extension://ojcimmjndnlmmlgnjaeojoebaceokpdp").split(",")
-VERSION = "0.5.5"
+VERSION = "0.5.6"
 
 # min-version refers to the version of auto-archiver-extension on the webstore
 BREAKING_CHANGES = {"minVersion": "0.3.1", "message": "The latest update has breaking changes, please update the extension to the most recent version."}

From 3d4d7979a51b7ff9139b2ab73a674aa499627549 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 12 Dec 2023 22:24:36 +0000
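Subject: [PATCH 3/4] fixes data leak

The rearchive check in the worker searched existing archives with
ALLOW_ANY_EMAIL, which disables ownership filtering in search_tasks_by_url,
so the task could return the result of an archive the requesting user has no
access to. The search now uses archive.author_id so the usual
public/author/group filter applies.

Also switches POST /tasks to the get_bearer_auth_token_or_jwt dependency,
drops the fastapi Body import and bumps VERSION to 0.5.7.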

---
 src/main.py   | 6 +++---
 src/worker.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/main.py b/src/main.py
index 651f210..ba73a4e 100644
--- a/src/main.py
+++ b/src/main.py
@@ -1,5 +1,5 @@
 from celery.result import AsyncResult
-from fastapi import Body, FastAPI, Depends, Request, HTTPException
+from fastapi import FastAPI, Depends, Request, HTTPException
 from fastapi.encoders import jsonable_encoder
 from fastapi.responses import JSONResponse, FileResponse
 from fastapi.staticfiles import StaticFiles
@@ -25,7 +25,7 @@ load_dotenv()
 
 # Configuration
 ALLOWED_ORIGINS = os.environ.get("ALLOWED_ORIGINS", "chrome-extension://ondkcheoicfckabcnkdgbepofpjmjcmb,chrome-extension://ojcimmjndnlmmlgnjaeojoebaceokpdp").split(",")
-VERSION = "0.5.6"
+VERSION = "0.5.7"
 
 # min-version refers to the version of auto-archiver-extension on the webstore
 BREAKING_CHANGES = {"minVersion": "0.3.1", "message": "The latest update has breaking changes, please update the extension to the most recent version."}
@@ -90,7 +90,7 @@ def search(skip: int = 0, limit: int = 100, db: Session = Depends(get_db), email
     return crud.search_tasks_by_email(db, email, skip=skip, limit=limit)
 
 @app.post("/tasks", status_code=201)
-def archive_tasks(archive:schemas.ArchiveCreate, email = Depends(get_bearer_auth)):
+def archive_tasks(archive:schemas.ArchiveCreate, email = Depends(get_bearer_auth_token_or_jwt)):
     archive.author_id = email
     url = archive.url
     logger.info(f"new {archive.public=} task for {email=} and {archive.group_id=}: {url}")
diff --git a/src/worker.py b/src/worker.py
index 457bc9c..8f010a2 100644
--- a/src/worker.py
+++ b/src/worker.py
@@ -41,7 +41,7 @@ def create_archive_task(self, archive_json: str):
 
     if not archive.rearchive:
         with get_db() as session:
-            archives = crud.search_tasks_by_url(session, url, ALLOW_ANY_EMAIL, absolute_search=True)
+            archives = crud.search_tasks_by_url(session, url, archive.author_id, absolute_search=True)
             if len(archives):
                 logger.info(f"Skipping {url=} as it was already archived")
                 # TODO: can we achieve something better than the last result?

From 99acfb113f94b13cf492fda0f8b008021060060a Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 12 Dec 2023 22:43:31 +0000
Subject: [PATCH 4/4] search_tasks_by_url returns most recent archives first

---
 src/db/crud.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/db/crud.py b/src/db/crud.py
index b202a5b..2484e5d 100644
--- a/src/db/crud.py
+++ b/src/db/crud.py
@@ -38,7 +38,7 @@ def search_tasks_by_url(db: Session, url: str, email: str, skip: int = 0, limit:
         query = query.filter(models.Archive.created_at >= archived_after)
     if archived_before:
         query = query.filter(models.Archive.created_at <= archived_before)
-    return query.offset(skip).limit(limit).all()
+    return query.order_by(models.Archive.created_at.desc()).offset(skip).limit(limit).all()
 
 
 def search_tasks_by_email(db: Session, email: str, skip: int = 0, limit: int = 100):