mirror of
https://github.com/bellingcat/auto-archiver-api.git
synced 2026-06-12 21:48:35 +03:00
Merge pull request #33 from bellingcat/allow-query-before-archive
This commit is contained in:
@@ -23,19 +23,22 @@ def get_task(db: Session, task_id: str, email: str):
|
|||||||
return query.first()
|
return query.first()
|
||||||
|
|
||||||
|
|
||||||
def search_tasks_by_url(db: Session, url: str, email: str, skip: int = 0, limit: int = 100, archived_after: datetime = None, archived_before: datetime = None):
|
def search_tasks_by_url(db: Session, url: str, email: str, skip: int = 0, limit: int = 100, archived_after: datetime = None, archived_before: datetime = None, absolute_search: bool = False):
|
||||||
# searches for partial URLs, if email is * no ownership filtering happens
|
# searches for partial URLs, if email is * no ownership filtering happens
|
||||||
query = base_query(db)
|
query = base_query(db)
|
||||||
if email != ALLOW_ANY_EMAIL:
|
if email != ALLOW_ANY_EMAIL:
|
||||||
email = email.lower()
|
email = email.lower()
|
||||||
groups = get_user_groups(db, email)
|
groups = get_user_groups(db, email)
|
||||||
query = query.filter(or_(models.Archive.public == True, models.Archive.author_id == email, models.Archive.group_id.in_(groups)))
|
query = query.filter(or_(models.Archive.public == True, models.Archive.author_id == email, models.Archive.group_id.in_(groups)))
|
||||||
query = query.filter(models.Archive.url.like(f'%{url}%'))
|
if absolute_search:
|
||||||
|
query = query.filter(models.Archive.url == url)
|
||||||
|
else:
|
||||||
|
query = query.filter(models.Archive.url.like(f'%{url}%'))
|
||||||
if archived_after:
|
if archived_after:
|
||||||
query = query.filter(models.Archive.created_at >= archived_after)
|
query = query.filter(models.Archive.created_at >= archived_after)
|
||||||
if archived_before:
|
if archived_before:
|
||||||
query = query.filter(models.Archive.created_at <= archived_before)
|
query = query.filter(models.Archive.created_at <= archived_before)
|
||||||
return query.offset(skip).limit(limit).all()
|
return query.order_by(models.Archive.created_at.desc()).offset(skip).limit(limit).all()
|
||||||
|
|
||||||
|
|
||||||
def search_tasks_by_email(db: Session, email: str, skip: int = 0, limit: int = 100):
|
def search_tasks_by_email(db: Session, email: str, skip: int = 0, limit: int = 100):
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ class ArchiveCreate(BaseModel):
|
|||||||
author_id: str | None = None
|
author_id: str | None = None
|
||||||
group_id: str | None = None
|
group_id: str | None = None
|
||||||
tags: set = set()
|
tags: set = set()
|
||||||
|
rearchive: bool = True
|
||||||
# urls: list = []
|
# urls: list = []
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
from celery.result import AsyncResult
|
from celery.result import AsyncResult
|
||||||
from fastapi import Body, FastAPI, Depends, Request, HTTPException
|
from fastapi import FastAPI, Depends, Request, HTTPException
|
||||||
from fastapi.encoders import jsonable_encoder
|
from fastapi.encoders import jsonable_encoder
|
||||||
from fastapi.responses import JSONResponse, FileResponse
|
from fastapi.responses import JSONResponse, FileResponse
|
||||||
from fastapi.staticfiles import StaticFiles
|
from fastapi.staticfiles import StaticFiles
|
||||||
@@ -25,7 +25,7 @@ load_dotenv()
|
|||||||
|
|
||||||
# Configuration
|
# Configuration
|
||||||
ALLOWED_ORIGINS = os.environ.get("ALLOWED_ORIGINS", "chrome-extension://ondkcheoicfckabcnkdgbepofpjmjcmb,chrome-extension://ojcimmjndnlmmlgnjaeojoebaceokpdp").split(",")
|
ALLOWED_ORIGINS = os.environ.get("ALLOWED_ORIGINS", "chrome-extension://ondkcheoicfckabcnkdgbepofpjmjcmb,chrome-extension://ojcimmjndnlmmlgnjaeojoebaceokpdp").split(",")
|
||||||
VERSION = "0.5.5"
|
VERSION = "0.5.7"
|
||||||
|
|
||||||
# min-version refers to the version of auto-archiver-extension on the webstore
|
# min-version refers to the version of auto-archiver-extension on the webstore
|
||||||
BREAKING_CHANGES = {"minVersion": "0.3.1", "message": "The latest update has breaking changes, please update the extension to the most recent version."}
|
BREAKING_CHANGES = {"minVersion": "0.3.1", "message": "The latest update has breaking changes, please update the extension to the most recent version."}
|
||||||
@@ -90,7 +90,7 @@ def search(skip: int = 0, limit: int = 100, db: Session = Depends(get_db), email
|
|||||||
return crud.search_tasks_by_email(db, email, skip=skip, limit=limit)
|
return crud.search_tasks_by_email(db, email, skip=skip, limit=limit)
|
||||||
|
|
||||||
@app.post("/tasks", status_code=201)
|
@app.post("/tasks", status_code=201)
|
||||||
def archive_tasks(archive:schemas.ArchiveCreate, email = Depends(get_bearer_auth)):
|
def archive_tasks(archive:schemas.ArchiveCreate, email = Depends(get_bearer_auth_token_or_jwt)):
|
||||||
archive.author_id = email
|
archive.author_id = email
|
||||||
url = archive.url
|
url = archive.url
|
||||||
logger.info(f"new {archive.public=} task for {email=} and {archive.group_id=}: {url}")
|
logger.info(f"new {archive.public=} task for {email=} and {archive.group_id=}: {url}")
|
||||||
|
|||||||
@@ -13,6 +13,8 @@ from db.database import SessionLocal
|
|||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
from security import ALLOW_ANY_EMAIL
|
||||||
|
|
||||||
celery = Celery(__name__)
|
celery = Celery(__name__)
|
||||||
celery.conf.broker_url = os.environ.get("CELERY_BROKER_URL", "redis://localhost:6379")
|
celery.conf.broker_url = os.environ.get("CELERY_BROKER_URL", "redis://localhost:6379")
|
||||||
celery.conf.result_backend = os.environ.get("CELERY_RESULT_BACKEND", "redis://localhost:6379")
|
celery.conf.result_backend = os.environ.get("CELERY_RESULT_BACKEND", "redis://localhost:6379")
|
||||||
@@ -26,7 +28,7 @@ def get_db():
|
|||||||
finally: session.close()
|
finally: session.close()
|
||||||
|
|
||||||
|
|
||||||
@celery.task(name="create_archive_task", bind=True, autoretry_for=(Exception,), retry_backoff=True, retry_kwargs={'max_retries': 5})
|
@celery.task(name="create_archive_task", bind=True, autoretry_for=(Exception,), retry_backoff=True, retry_kwargs={'max_retries': 3})
|
||||||
def create_archive_task(self, archive_json: str):
|
def create_archive_task(self, archive_json: str):
|
||||||
archive = schemas.ArchiveCreate.parse_raw(archive_json)
|
archive = schemas.ArchiveCreate.parse_raw(archive_json)
|
||||||
|
|
||||||
@@ -36,6 +38,15 @@ def create_archive_task(self, archive_json: str):
|
|||||||
|
|
||||||
url = archive.url
|
url = archive.url
|
||||||
logger.info(f"{url=} {archive=}")
|
logger.info(f"{url=} {archive=}")
|
||||||
|
|
||||||
|
if not archive.rearchive:
|
||||||
|
with get_db() as session:
|
||||||
|
archives = crud.search_tasks_by_url(session, url, archive.author_id, absolute_search=True)
|
||||||
|
if len(archives):
|
||||||
|
logger.info(f"Skipping {url=} as it was already archived")
|
||||||
|
# TODO: can we achieve something better than the last result?
|
||||||
|
return archives[0].result
|
||||||
|
|
||||||
orchestrator = choose_orchestrator(archive.group_id, archive.author_id)
|
orchestrator = choose_orchestrator(archive.group_id, archive.author_id)
|
||||||
result = orchestrator.feed_item(Metadata().set_url(url))
|
result = orchestrator.feed_item(Metadata().set_url(url))
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user