mirror of
https://github.com/bellingcat/auto-archiver-api.git
synced 2026-06-13 05:58:35 +03:00
closes TODOs, renames task to archive, fixes read permissions and fixes tests
This commit is contained in:
@@ -32,20 +32,18 @@ def base_query(db: Session):
|
||||
.options(load_only(models.Archive.id, models.Archive.created_at, models.Archive.url, models.Archive.result, models.Archive.store_until))
|
||||
|
||||
|
||||
def get_archive(db: Session, id: str, email: str):
|
||||
query = base_query(db).filter(models.Archive.id == id)
|
||||
if email != ALLOW_ANY_EMAIL:
|
||||
groups = get_user_group_names(db ,email)
|
||||
query = query.filter(or_(models.Archive.public == True, models.Archive.author_id == email, models.Archive.group_id.in_(groups)))
|
||||
return query.first()
|
||||
|
||||
|
||||
def search_archives_by_url(db: Session, url: str, email: str, skip: int = 0, limit: int = 100, archived_after: datetime = None, archived_before: datetime = None, absolute_search: bool = False) -> list[models.Archive]:
|
||||
# searches for partial URLs, if email is * no ownership filtering happens
|
||||
def search_archives_by_url(db: Session, url: str, email: str, read_groups: bool | set[str], read_public: bool, skip: int = 0, limit: int = 100, archived_after: datetime = None, archived_before: datetime = None, absolute_search: bool = False) -> list[models.Archive]:
|
||||
# searches for partial URLs, if email is * no ownership (or read/read_public) filtering happens
|
||||
query = base_query(db)
|
||||
if email != ALLOW_ANY_EMAIL:
|
||||
groups = get_user_group_names(db, email)
|
||||
query = query.filter(or_(models.Archive.public == True, models.Archive.author_id == email, models.Archive.group_id.in_(groups)))
|
||||
or_filters = [models.Archive.author_id == email]
|
||||
if read_public:
|
||||
or_filters.append(models.Archive.public == True)
|
||||
if read_groups == True:
|
||||
or_filters.append(models.Archive.group_id.isnot(None))
|
||||
else:
|
||||
or_filters.append(models.Archive.group_id.in_(read_groups))
|
||||
query = query.filter(or_(*or_filters))
|
||||
if absolute_search:
|
||||
query = query.filter(models.Archive.url == url)
|
||||
else:
|
||||
@@ -61,13 +59,13 @@ def search_archives_by_email(db: Session, email: str, skip: int = 0, limit: int
|
||||
return base_query(db).filter(models.Archive.author_id == email).order_by(models.Archive.created_at.desc()).offset(skip).limit(get_limit(limit)).all()
|
||||
|
||||
|
||||
def soft_delete_task(db: Session, task_id: str, email: str) -> bool:
|
||||
def soft_delete_archive(db: Session, id: str, email: str) -> bool:
|
||||
# TODO: implement hard-delete with cronjob that deletes from S3
|
||||
db_task = db.query(models.Archive).filter(models.Archive.id == task_id, models.Archive.author_id == email, models.Archive.deleted == False).first()
|
||||
if db_task:
|
||||
db_task.deleted = True
|
||||
db_archive = db.query(models.Archive).filter(models.Archive.id == id, models.Archive.author_id == email, models.Archive.deleted == False).first()
|
||||
if db_archive:
|
||||
db_archive.deleted = True
|
||||
db.commit()
|
||||
return db_task is not None
|
||||
return db_archive is not None
|
||||
|
||||
|
||||
def count_archives(db: Session):
|
||||
@@ -121,7 +119,7 @@ def get_user_group_names(db: Session, email: str) -> list[str]:
|
||||
"""
|
||||
given an email retrieves the user groups from the DB and then the email-domain groups from a global variable, the email does not need to belong to an existing user.
|
||||
"""
|
||||
#TODO: the read: [group1, group2] permissions don't currently work
|
||||
# TODO: the read: [group1, group2] permissions don't currently work
|
||||
if not email or not len(email) or "@" not in email: return []
|
||||
|
||||
# get user groups
|
||||
@@ -178,8 +176,8 @@ def upsert_user_groups(db: Session):
|
||||
reads the user_groups yaml file and inserts any new users, groups,
|
||||
along with new participation of users in groups
|
||||
"""
|
||||
logger.debug("Updating user-groups configuration.")
|
||||
filename = get_settings().USER_GROUPS_FILENAME
|
||||
logger.debug(f"Updating user-groups configuration with file {filename}.")
|
||||
|
||||
ug = UserGroups(filename)
|
||||
|
||||
|
||||
@@ -51,7 +51,7 @@ def delete_sheet(
|
||||
id: str,
|
||||
user: UserState = Depends(get_user_state),
|
||||
db: Session = Depends(get_db_dependency),
|
||||
) -> schemas.TaskDelete:
|
||||
) -> schemas.DeleteResponse:
|
||||
return JSONResponse({
|
||||
"id": id,
|
||||
"deleted": crud.delete_sheet(db, id, user.email)
|
||||
|
||||
@@ -61,22 +61,24 @@ def search_by_url(
|
||||
email: str = Depends(get_token_or_user_auth)
|
||||
) -> list[schemas.ArchiveResult]:
|
||||
|
||||
read_groups, read_public = False, False
|
||||
if email != ALLOW_ANY_EMAIL:
|
||||
user = UserState(db, email)
|
||||
if not user.read and not user.read_public:
|
||||
raise HTTPException(status_code=403, detail="User does not have read access.")
|
||||
|
||||
return crud.search_archives_by_url(db, url.strip(), email, skip=skip, limit=limit, archived_after=archived_after, archived_before=archived_before)
|
||||
read_groups = user.read
|
||||
read_public = user.read_public
|
||||
return crud.search_archives_by_url(db, url.strip(), email, read_groups, read_public, skip=skip, limit=limit, archived_after=archived_after, archived_before=archived_before)
|
||||
|
||||
|
||||
@url_router.delete("/{id}", summary="Delete a single URL archive by id.")
|
||||
def delete_task(
|
||||
def delete_archive(
|
||||
id:str,
|
||||
user: UserState = Depends(get_user_state),
|
||||
db: Session = Depends(get_db_dependency)
|
||||
) -> schemas.TaskDelete:
|
||||
) -> schemas.DeleteResponse:
|
||||
logger.info(f"deleting url archive task {id} request by {user.email}")
|
||||
return JSONResponse({
|
||||
"id": id,
|
||||
"deleted": crud.soft_delete_task(db, id, user.email)
|
||||
"deleted": crud.soft_delete_archive(db, id, user.email)
|
||||
})
|
||||
|
||||
@@ -15,6 +15,7 @@ from app.shared import schemas
|
||||
from app.shared.settings import get_settings
|
||||
from app.shared.task_messaging import get_celery
|
||||
from app.web.db import crud
|
||||
from app.web.middleware import increase_exceptions_counter
|
||||
from app.web.utils.metrics import measure_regular_metrics, redis_subscribe_worker_exceptions
|
||||
|
||||
celery = get_celery()
|
||||
@@ -60,17 +61,17 @@ async def lifespan(app: FastAPI):
|
||||
|
||||
|
||||
# CRON JOBS
|
||||
@repeat_every(seconds=get_settings().REPEAT_COUNT_METRICS_SECONDS, on_exception=logger.error)
|
||||
@repeat_every(seconds=get_settings().REPEAT_COUNT_METRICS_SECONDS, on_exception=increase_exceptions_counter)
|
||||
async def repeat_measure_regular_metrics():
|
||||
await measure_regular_metrics(get_settings().DATABASE_PATH, get_settings().REPEAT_COUNT_METRICS_SECONDS)
|
||||
|
||||
|
||||
@repeat_every(seconds=60, wait_first=120, on_exception=logger.error)
|
||||
@repeat_every(seconds=60, wait_first=120, on_exception=increase_exceptions_counter)
|
||||
async def archive_hourly_sheets_cronjob():
|
||||
await archive_sheets_cronjob("hourly", 60, datetime.datetime.now().minute)
|
||||
|
||||
|
||||
@repeat_every(seconds=3600, wait_first=120, on_exception=logger.error)
|
||||
@repeat_every(seconds=3600, wait_first=120, on_exception=increase_exceptions_counter)
|
||||
async def archive_daily_sheets_cronjob():
|
||||
await archive_sheets_cronjob("daily", 24, datetime.datetime.now().hour)
|
||||
|
||||
@@ -92,7 +93,7 @@ async def archive_sheets_cronjob(frequency: str, interval: int, current_time_uni
|
||||
DELETE_WINDOW = get_settings().DELETE_SCHEDULED_ARCHIVES_CHECK_EVERY_N_DAYS * 24 * 60 * 60
|
||||
|
||||
|
||||
@repeat_every(seconds=DELETE_WINDOW, wait_first=180, on_exception=logger.error)
|
||||
@repeat_every(seconds=DELETE_WINDOW, wait_first=180, on_exception=increase_exceptions_counter)
|
||||
async def notify_about_expired_archives():
|
||||
notify_from = datetime.datetime.now() + datetime.timedelta(days=get_settings().DELETE_SCHEDULED_ARCHIVES_CHECK_EVERY_N_DAYS)
|
||||
async with get_db_async() as db:
|
||||
@@ -135,7 +136,7 @@ async def notify_about_expired_archives():
|
||||
asyncio.create_task(delete_expired_archives())
|
||||
|
||||
|
||||
@repeat_every(max_repetitions=1, wait_first=10, seconds=0, on_exception=logger.error)
|
||||
@repeat_every(max_repetitions=1, wait_first=10, seconds=0, on_exception=increase_exceptions_counter)
|
||||
async def delete_expired_archives():
|
||||
async with get_db_async() as db:
|
||||
count_deleted = await crud.soft_delete_expired_archives(db)
|
||||
@@ -143,7 +144,7 @@ async def delete_expired_archives():
|
||||
logger.debug(f"[CRON] Deleted {count_deleted} archives.")
|
||||
|
||||
|
||||
@repeat_every(seconds=86400, wait_first=150, on_exception=logger.error)
|
||||
@repeat_every(seconds=86400, wait_first=150, on_exception=increase_exceptions_counter)
|
||||
async def delete_stale_sheets():
|
||||
STALE_DAYS = get_settings().DELETE_STALE_SHEETS_DAYS
|
||||
logger.debug(f"[CRON] Deleting stale sheets older than {STALE_DAYS} days.")
|
||||
|
||||
@@ -49,12 +49,12 @@ def app_factory(settings = get_settings()):
|
||||
# prometheus exposed in /metrics with authentication
|
||||
Instrumentator(should_group_status_codes=False, excluded_handlers=["/metrics", "/health", "/openapi.json", "/favicon.ico"]).instrument(app).expose(app, dependencies=[Depends(token_api_key_auth)])
|
||||
|
||||
# TODO: recheck this for security, currently only needed for when local_storage is used in development
|
||||
local_dir = settings.SERVE_LOCAL_ARCHIVE
|
||||
if not os.path.isdir(local_dir) and os.path.isdir(local_dir.replace("/app", ".")):
|
||||
local_dir = local_dir.replace("/app", ".")
|
||||
if len(settings.SERVE_LOCAL_ARCHIVE) > 1 and os.path.isdir(local_dir):
|
||||
logger.warning(f"MOUNTing local archive {settings.SERVE_LOCAL_ARCHIVE}")
|
||||
app.mount(settings.SERVE_LOCAL_ARCHIVE, StaticFiles(directory=local_dir), name=settings.SERVE_LOCAL_ARCHIVE)
|
||||
if settings.SERVE_LOCAL_ARCHIVE:
|
||||
local_dir = settings.SERVE_LOCAL_ARCHIVE
|
||||
if not os.path.isdir(local_dir) and os.path.isdir(local_dir.replace("/app", ".")):
|
||||
local_dir = local_dir.replace("/app", ".")
|
||||
if len(settings.SERVE_LOCAL_ARCHIVE) > 1 and os.path.isdir(local_dir):
|
||||
logger.warning(f"MOUNTing local archive, use this in development only {settings.SERVE_LOCAL_ARCHIVE}")
|
||||
app.mount(settings.SERVE_LOCAL_ARCHIVE, StaticFiles(directory=local_dir), name=settings.SERVE_LOCAL_ARCHIVE)
|
||||
|
||||
return app
|
||||
@@ -1,4 +1,5 @@
|
||||
|
||||
import traceback
|
||||
from loguru import logger
|
||||
from fastapi import Request
|
||||
from app.shared.log import log_error
|
||||
@@ -13,7 +14,18 @@ async def logging_middleware(request: Request, call_next):
|
||||
logger.info(f"{request.client.host}:{request.client.port} {request.method} {request.url._url} - HTTP {response.status_code}")
|
||||
return response
|
||||
except Exception as e:
|
||||
EXCEPTION_COUNTER.labels(type=e.__class__.__name__).inc()
|
||||
logger.info(f"{request.client.host}:{request.client.port} {request.method} {request.url._url} - {e.__class__.__name__} {e}")
|
||||
log_error(e)
|
||||
location = f"{request.method} {request.url._url}"
|
||||
await increase_exceptions_counter(e, location)
|
||||
logger.info(f"{request.client.host}:{request.client.port} {location} - {e.__class__.__name__} {e}")
|
||||
raise e
|
||||
|
||||
async def increase_exceptions_counter(e: Exception, location:str="cronjob"):
|
||||
if location == "cronjob":
|
||||
try:
|
||||
last_trace = traceback.extract_tb(e.__traceback__)[-1]
|
||||
_file, _line, func_name, _text = last_trace
|
||||
location = func_name
|
||||
except Exception as e:
|
||||
logger.error(f"Unable to get function name from cronjob exception traceback: {e}")
|
||||
EXCEPTION_COUNTER.labels(type=e.__class__.__name__, location=location).inc()
|
||||
log_error(e)
|
||||
@@ -14,7 +14,7 @@ from app.shared.task_messaging import get_redis
|
||||
EXCEPTION_COUNTER = Counter(
|
||||
"exceptions",
|
||||
"Number of times a certain exception has occurred.",
|
||||
labelnames=["type"]
|
||||
labelnames=["type", "location"]
|
||||
)
|
||||
WORKER_EXCEPTION = Counter(
|
||||
"worker_exceptions_total",
|
||||
|
||||
Reference in New Issue
Block a user