Merge pull request #33 from bellingcat/allow-query-before-archive

This commit is contained in:
Miguel Sozinho Ramalho
2023-12-13 10:30:36 +00:00
committed by GitHub
4 changed files with 22 additions and 7 deletions

View File

@@ -23,19 +23,22 @@ def get_task(db: Session, task_id: str, email: str):
return query.first()
def search_tasks_by_url(db: Session, url: str, email: str, skip: int = 0, limit: int = 100, archived_after: datetime = None, archived_before: datetime = None):
def search_tasks_by_url(db: Session, url: str, email: str, skip: int = 0, limit: int = 100, archived_after: datetime = None, archived_before: datetime = None, absolute_search: bool = False):
# searches for partial URLs, if email is * no ownership filtering happens
query = base_query(db)
if email != ALLOW_ANY_EMAIL:
email = email.lower()
groups = get_user_groups(db, email)
query = query.filter(or_(models.Archive.public == True, models.Archive.author_id == email, models.Archive.group_id.in_(groups)))
query = query.filter(models.Archive.url.like(f'%{url}%'))
if absolute_search:
query = query.filter(models.Archive.url == url)
else:
query = query.filter(models.Archive.url.like(f'%{url}%'))
if archived_after:
query = query.filter(models.Archive.created_at >= archived_after)
if archived_before:
query = query.filter(models.Archive.created_at <= archived_before)
return query.offset(skip).limit(limit).all()
return query.order_by(models.Archive.created_at.desc()).offset(skip).limit(limit).all()
def search_tasks_by_email(db: Session, email: str, skip: int = 0, limit: int = 100):

View File

@@ -10,6 +10,7 @@ class ArchiveCreate(BaseModel):
author_id: str | None = None
group_id: str | None = None
tags: set = set()
rearchive: bool = True
# urls: list = []

View File

@@ -1,5 +1,5 @@
from celery.result import AsyncResult
from fastapi import Body, FastAPI, Depends, Request, HTTPException
from fastapi import FastAPI, Depends, Request, HTTPException
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse, FileResponse
from fastapi.staticfiles import StaticFiles
@@ -25,7 +25,7 @@ load_dotenv()
# Configuration
ALLOWED_ORIGINS = os.environ.get("ALLOWED_ORIGINS", "chrome-extension://ondkcheoicfckabcnkdgbepofpjmjcmb,chrome-extension://ojcimmjndnlmmlgnjaeojoebaceokpdp").split(",")
VERSION = "0.5.5"
VERSION = "0.5.7"
# min-version refers to the version of auto-archiver-extension on the webstore
BREAKING_CHANGES = {"minVersion": "0.3.1", "message": "The latest update has breaking changes, please update the extension to the most recent version."}
@@ -90,7 +90,7 @@ def search(skip: int = 0, limit: int = 100, db: Session = Depends(get_db), email
return crud.search_tasks_by_email(db, email, skip=skip, limit=limit)
@app.post("/tasks", status_code=201)
def archive_tasks(archive:schemas.ArchiveCreate, email = Depends(get_bearer_auth)):
def archive_tasks(archive:schemas.ArchiveCreate, email = Depends(get_bearer_auth_token_or_jwt)):
archive.author_id = email
url = archive.url
logger.info(f"new {archive.public=} task for {email=} and {archive.group_id=}: {url}")

View File

@@ -13,6 +13,8 @@ from db.database import SessionLocal
from contextlib import contextmanager
import json
from security import ALLOW_ANY_EMAIL
celery = Celery(__name__)
celery.conf.broker_url = os.environ.get("CELERY_BROKER_URL", "redis://localhost:6379")
celery.conf.result_backend = os.environ.get("CELERY_RESULT_BACKEND", "redis://localhost:6379")
@@ -26,7 +28,7 @@ def get_db():
finally: session.close()
@celery.task(name="create_archive_task", bind=True, autoretry_for=(Exception,), retry_backoff=True, retry_kwargs={'max_retries': 5})
@celery.task(name="create_archive_task", bind=True, autoretry_for=(Exception,), retry_backoff=True, retry_kwargs={'max_retries': 3})
def create_archive_task(self, archive_json: str):
archive = schemas.ArchiveCreate.parse_raw(archive_json)
@@ -36,6 +38,15 @@ def create_archive_task(self, archive_json: str):
url = archive.url
logger.info(f"{url=} {archive=}")
if not archive.rearchive:
with get_db() as session:
archives = crud.search_tasks_by_url(session, url, archive.author_id, absolute_search=True)
if len(archives):
logger.info(f"Skipping {url=} as it was already archived")
# TODO: can we achieve something better than the last result?
return archives[0].result
orchestrator = choose_orchestrator(archive.group_id, archive.author_id)
result = orchestrator.feed_item(Metadata().set_url(url))