mirror of
https://github.com/bellingcat/auto-archiver-api.git
synced 2026-06-13 05:58:35 +03:00
Merge pull request #3 from bellingcat/sheet-endpoint
This commit is contained in:
@@ -55,7 +55,13 @@ Copy `.env` and `src/.env` to deployment, along with the contents of `secrets/`
|
|||||||
Then `docker compose up -d`.
|
Then `docker compose up -d`.
|
||||||
|
|
||||||
#### updating packages/app/access
|
#### updating packages/app/access
|
||||||
If pipenv packages are updated: `pipenv lock --requirements -r > requirements.txt` (manually comment line `-i https://pypi.org/simple`) and then `docker compose down` + `docker compose up --build -d` to build images with new packages.
|
If pipenv packages are updated: `pipenv lock --requirements -r > requirements.txt` (or ` pipenv requirements > requirements.txt` depending on pipenv version) (manually comment line `-i https://pypi.org/simple`) and then `docker compose down` + `docker compose up --build -d` to build images with new packages.
|
||||||
|
|
||||||
New users should be added to the `src/.env` file `ALLOWED_EMAILS` prop
|
New users should be added to the `src/.env` file `ALLOWED_EMAILS` prop
|
||||||
|
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# CALL /sheet POST endpoint
|
||||||
|
curl -XPOST -H "Authorization: Bearer GOOGLE_OAUTH_TOKEN" -H "Content-type: application/json" -d '{"sheet_id": "SHEET_ID", "header": 1}' 'http://localhost:8004/sheet'
|
||||||
|
|
||||||
|
```
|
||||||
@@ -33,6 +33,7 @@ services:
|
|||||||
|
|
||||||
redis:
|
redis:
|
||||||
image: redis:6-alpine
|
image: redis:6-alpine
|
||||||
|
# command: redis-server /conf/redis.conf # DEV ONLY
|
||||||
command: redis-server /conf/redis.conf --requirepass ${REDIS_PASSWORD}
|
command: redis-server /conf/redis.conf --requirepass ${REDIS_PASSWORD}
|
||||||
volumes:
|
volumes:
|
||||||
- "./redis/data:/data"
|
- "./redis/data:/data"
|
||||||
|
|||||||
@@ -1,4 +1,6 @@
|
|||||||
GOOGLE_CHROME_APP_ID=0000000000000000000000000000000000.apps.googleusercontent.com
|
GOOGLE_CHROME_APP_ID=0000000000000000000000000000000000.apps.googleusercontent.com
|
||||||
|
# for validating non-email limited JWT
|
||||||
|
GOOGLE_CHROME_APP_ID_PUBLIC=0000000000000000000000000000000000.apps.googleusercontent.com
|
||||||
|
|
||||||
ALLOWED_EMAILS=email1,email2
|
ALLOWED_EMAILS=email1,email2
|
||||||
ORCHESTRATION_CONFIG_DEFAULT=secrets/orchestration.yaml
|
ORCHESTRATION_CONFIG_DEFAULT=secrets/orchestration.yaml
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ aiofiles = "==0.6.0"
|
|||||||
celery = "==4.4.7"
|
celery = "==4.4.7"
|
||||||
fastapi = "*"
|
fastapi = "*"
|
||||||
flower = "==0.9.7"
|
flower = "==0.9.7"
|
||||||
jinja2 = ">=3.0.3"
|
jinja2 = "*"
|
||||||
pytest = "==6.2.4"
|
pytest = "==6.2.4"
|
||||||
redis = "==3.5.3"
|
redis = "==3.5.3"
|
||||||
requests = ">=2.25.1"
|
requests = ">=2.25.1"
|
||||||
|
|||||||
1257
src/Pipfile.lock
generated
1257
src/Pipfile.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -1,6 +1,6 @@
|
|||||||
from functools import cache
|
from functools import cache
|
||||||
from sqlalchemy.orm import Session, load_only
|
from sqlalchemy.orm import Session, load_only
|
||||||
from sqlalchemy import Column
|
from sqlalchemy import Column, or_
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from . import models, schemas
|
from . import models, schemas
|
||||||
import yaml
|
import yaml
|
||||||
@@ -13,17 +13,17 @@ def get_task(db: Session, task_id: str):
|
|||||||
def get_tasks(db: Session, skip: int = 0, limit: int = 100):
|
def get_tasks(db: Session, skip: int = 0, limit: int = 100):
|
||||||
return base_query(db).offset(skip).limit(limit).all()
|
return base_query(db).offset(skip).limit(limit).all()
|
||||||
|
|
||||||
def search_tasks_by_url(db: Session, url:str, skip: int = 0, limit: int = 100):
|
def search_tasks_by_url(db: Session, url:str, email:str, skip: int = 0, limit: int = 100):
|
||||||
return base_query(db).filter(models.Archive.url.like(f'%{url}%')).offset(skip).limit(limit).all()
|
groups = get_user_groups(db, email)
|
||||||
|
return base_query(db).filter(or_(models.Archive.public==True, models.Archive.author_id==email, models.Archive.group_id.in_(groups))).filter(models.Archive.url.like(f'%{url}%')).offset(skip).limit(limit).all()
|
||||||
|
|
||||||
def search_tasks_by_email(db: Session, email:str, skip: int = 0, limit: int = 100):
|
def search_tasks_by_email(db: Session, email:str, skip: int = 0, limit: int = 100):
|
||||||
return base_query(db).filter(models.Archive.author.has(email=email)).offset(skip).limit(limit).all()
|
return base_query(db).filter(models.Archive.author.has(email=email)).offset(skip).limit(limit).all()
|
||||||
|
|
||||||
def create_task(db: Session, task: schemas.ArchiveCreate, tags:list[models.Tag],urls:list[models.ArchiveUrl]):
|
def create_task(db: Session, task: schemas.ArchiveCreate, tags:list[models.Tag],urls:list[models.ArchiveUrl]):
|
||||||
db_task = models.Archive(id=task.id, url=task.url, author_id=task.author_id, result=task.result, group_id=task.group_id)
|
db_task = models.Archive(id=task.id, url=task.url, result=task.result, public=task.public, author_id=task.author_id, group_id=task.group_id)
|
||||||
logger.debug(tags)
|
db_task.tags = tags
|
||||||
db_task.tags = tags # will this work? TODO: test if I don't call create tag before
|
db_task.urls = urls
|
||||||
db_task.urls = urls # will this work to create ArchiveUrl? TODO: test
|
|
||||||
db.add(db_task)
|
db.add(db_task)
|
||||||
db.commit()
|
db.commit()
|
||||||
db.refresh(db_task)
|
db.refresh(db_task)
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
|
|
||||||
class ArchiveCreate(BaseModel):
|
class ArchiveCreate(BaseModel):
|
||||||
id: str | None = None
|
id: str | None = None
|
||||||
url: str
|
url: str
|
||||||
@@ -8,11 +9,10 @@ class ArchiveCreate(BaseModel):
|
|||||||
public: bool = True
|
public: bool = True
|
||||||
author_id: str | None = None
|
author_id: str | None = None
|
||||||
group_id: str | None = None
|
group_id: str | None = None
|
||||||
tags: list = []
|
tags: set = set()
|
||||||
# urls: list = []
|
# urls: list = []
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class Archive(ArchiveCreate):
|
class Archive(ArchiveCreate):
|
||||||
created_at: datetime
|
created_at: datetime
|
||||||
updated_at: datetime | None
|
updated_at: datetime | None
|
||||||
@@ -22,10 +22,12 @@ class Archive(ArchiveCreate):
|
|||||||
orm_mode = True
|
orm_mode = True
|
||||||
|
|
||||||
|
|
||||||
# class TagCreate(BaseModel):
|
class SubmitSheet(BaseModel):
|
||||||
# id: str
|
sheet_name: str | None = None
|
||||||
|
sheet_id: str | None = None
|
||||||
# class Tag(TagCreate):
|
header: int = 1
|
||||||
# created_at: datetime
|
public: bool = False
|
||||||
# # class Config:
|
author_id: str | None = None
|
||||||
# # orm_mode = True
|
group_id: str | None = None
|
||||||
|
tags: set | None = set()
|
||||||
|
columns: dict | None = {} # TODO: implement
|
||||||
|
|||||||
67
src/main.py
67
src/main.py
@@ -10,7 +10,7 @@ from dotenv import load_dotenv
|
|||||||
import traceback, os, logging
|
import traceback, os, logging
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from worker import create_archive_task, celery
|
from worker import create_archive_task, create_sheet_task, celery
|
||||||
|
|
||||||
from db import crud, models, schemas
|
from db import crud, models, schemas
|
||||||
from db.database import engine, SessionLocal
|
from db.database import engine, SessionLocal
|
||||||
@@ -21,11 +21,12 @@ load_dotenv()
|
|||||||
|
|
||||||
# Configuration
|
# Configuration
|
||||||
ALLOWED_ORIGINS = os.environ.get("ALLOWED_ORIGINS", "chrome-extension://ondkcheoicfckabcnkdgbepofpjmjcmb,chrome-extension://ojcimmjndnlmmlgnjaeojoebaceokpdp").split(",")
|
ALLOWED_ORIGINS = os.environ.get("ALLOWED_ORIGINS", "chrome-extension://ondkcheoicfckabcnkdgbepofpjmjcmb,chrome-extension://ojcimmjndnlmmlgnjaeojoebaceokpdp").split(",")
|
||||||
VERSION = "0.4.0"
|
VERSION = "0.5.0"
|
||||||
# min-version refers to the version of auto-archiver-extension on the webstore
|
|
||||||
BREAKING_CHANGES = {"minVersion": "0.3.0", "message": "The latest update has breaking changes, please update the extension to the most recent version."}
|
|
||||||
|
|
||||||
app = FastAPI()
|
# min-version refers to the version of auto-archiver-extension on the webstore
|
||||||
|
BREAKING_CHANGES = {"minVersion": "0.3.1", "message": "The latest update has breaking changes, please update the extension to the most recent version."}
|
||||||
|
|
||||||
|
app = FastAPI(title="Auto-Archiver API", version=VERSION, contact={"name":"Bellingcat", "url":"https://github.com/bellingcat/auto-archiver-api"})
|
||||||
app.add_middleware(
|
app.add_middleware(
|
||||||
CORSMiddleware,
|
CORSMiddleware,
|
||||||
allow_origins=ALLOWED_ORIGINS,
|
allow_origins=ALLOWED_ORIGINS,
|
||||||
@@ -39,7 +40,14 @@ def get_db():
|
|||||||
session = SessionLocal()
|
session = SessionLocal()
|
||||||
try: yield session
|
try: yield session
|
||||||
finally: session.close()
|
finally: session.close()
|
||||||
|
|
||||||
|
# logging configurations
|
||||||
|
logger.add("logs/api_logs.log", retention="30 days", rotation="3 days")
|
||||||
|
@app.middleware("http")
|
||||||
|
async def logging_middleware(request: Request, call_next):
|
||||||
|
response = await call_next(request)
|
||||||
|
logger.info(f"{request.client.host}:{request.client.port} {request.method} {request.url._url} - HTTP {response.status_code}")
|
||||||
|
return response
|
||||||
|
|
||||||
@app.get("/")
|
@app.get("/")
|
||||||
async def home(request: Request):
|
async def home(request: Request):
|
||||||
@@ -53,14 +61,6 @@ async def home(request: Request):
|
|||||||
except Exception as e: logger.error(e)
|
except Exception as e: logger.error(e)
|
||||||
return JSONResponse(status)
|
return JSONResponse(status)
|
||||||
|
|
||||||
# logging configurations
|
|
||||||
logger.add("logs/api_logs.log", retention="30 days", rotation="3 days")
|
|
||||||
@app.middleware("http")
|
|
||||||
async def logging_middleware(request: Request, call_next):
|
|
||||||
response = await call_next(request)
|
|
||||||
logger.info(f"{request.client.host}:{request.client.port} {request.method} {request.url._url} - HTTP {response.status_code}")
|
|
||||||
return response
|
|
||||||
|
|
||||||
|
|
||||||
# Bearer protected below
|
# Bearer protected below
|
||||||
|
|
||||||
@@ -69,22 +69,17 @@ def get_user_groups(db: Session = Depends(get_db), email = Depends(get_bearer_au
|
|||||||
return crud.get_user_groups(db, email)
|
return crud.get_user_groups(db, email)
|
||||||
|
|
||||||
@app.get("/tasks/search-url", response_model=list[schemas.Archive])
|
@app.get("/tasks/search-url", response_model=list[schemas.Archive])
|
||||||
def search(url:str, skip: int = 0, limit: int = 100, db: Session = Depends(get_db), _email = Depends(get_bearer_auth)):
|
def search(url:str, skip: int = 0, limit: int = 100, db: Session = Depends(get_db), email = Depends(get_bearer_auth)):
|
||||||
return crud.search_tasks_by_url(db, url, skip=skip, limit=limit)
|
return crud.search_tasks_by_url(db, url, email, skip=skip, limit=limit)
|
||||||
|
|
||||||
# @app.get("/tasks/search", response_model=list[schemas.Task])
|
|
||||||
# def search(skip: int = 0, limit: int = 100, db: Session = Depends(get_db), email = Depends(get_bearer_auth)):
|
|
||||||
# return crud.get_tasks(db, skip=skip, limit=limit)
|
|
||||||
|
|
||||||
@app.get("/tasks/sync", response_model=list[schemas.Archive])
|
@app.get("/tasks/sync", response_model=list[schemas.Archive])
|
||||||
def search(skip: int = 0, limit: int = 100, db: Session = Depends(get_db), email = Depends(get_bearer_auth)):
|
def search(skip: int = 0, limit: int = 100, db: Session = Depends(get_db), email = Depends(get_bearer_auth)):
|
||||||
return crud.search_tasks_by_email(db, email, skip=skip, limit=limit)
|
return crud.search_tasks_by_email(db, email, skip=skip, limit=limit)
|
||||||
|
|
||||||
@app.post("/tasks", status_code=201)
|
@app.post("/tasks", status_code=201)
|
||||||
def run_task(archive:schemas.ArchiveCreate, email = Depends(get_bearer_auth)):
|
def archive_sheet(archive:schemas.ArchiveCreate, email = Depends(get_bearer_auth)):
|
||||||
archive.author_id = email
|
archive.author_id = email
|
||||||
url = archive.url
|
url = archive.url
|
||||||
logger.warning(archive)
|
|
||||||
logger.info(f"new {archive.public=} task for {email=} and {archive.group_id=}: {url}")
|
logger.info(f"new {archive.public=} task for {email=} and {archive.group_id=}: {url}")
|
||||||
if type(url)!=str or len(url)<=5:
|
if type(url)!=str or len(url)<=5:
|
||||||
raise HTTPException(status_code=422, detail=f"Invalid URL received: {url}")
|
raise HTTPException(status_code=422, detail=f"Invalid URL received: {url}")
|
||||||
@@ -92,22 +87,10 @@ def run_task(archive:schemas.ArchiveCreate, email = Depends(get_bearer_auth)):
|
|||||||
task = create_archive_task.delay(archive.json())
|
task = create_archive_task.delay(archive.json())
|
||||||
return JSONResponse({"id": task.id})
|
return JSONResponse({"id": task.id})
|
||||||
|
|
||||||
# @app.post("/tasks", status_code=201)
|
|
||||||
# def run_task(payload = Body(...), email = Depends(get_bearer_auth)):
|
|
||||||
# url = payload.get('url')
|
|
||||||
# public = payload.get('public', True)
|
|
||||||
# group = payload.get('group', None)
|
|
||||||
# logger.info(f"new {public=} task for {email=} and {group=}: {url}")
|
|
||||||
# if type(url)!=str or len(url)<=5:
|
|
||||||
# raise HTTPException(status_code=422, detail=f"Invalid URL received: {url}")
|
|
||||||
# task = create_archive_task.delay(url=payload.get('url'), email=email, public=public, group=group)
|
|
||||||
# return JSONResponse({"id": task.id})
|
|
||||||
|
|
||||||
@app.get("/tasks/{task_id}")
|
@app.get("/tasks/{task_id}")
|
||||||
def get_status(task_id, email = Depends(get_bearer_auth)):
|
def get_status(task_id, email = Depends(get_bearer_auth)):
|
||||||
logger.info(f"status check for user {email}")
|
logger.info(f"status check for user {email}")
|
||||||
task_result = AsyncResult(task_id, app=celery)
|
task_result = AsyncResult(task_id, app=celery)
|
||||||
logger.info(task_result)
|
|
||||||
result = {
|
result = {
|
||||||
"id": task_id,
|
"id": task_id,
|
||||||
"status": task_result.status,
|
"status": task_result.status,
|
||||||
@@ -116,7 +99,10 @@ def get_status(task_id, email = Depends(get_bearer_auth)):
|
|||||||
try:
|
try:
|
||||||
if task_result.result and "error" in task_result.result:
|
if task_result.result and "error" in task_result.result:
|
||||||
result["status"] = "FAILURE"
|
result["status"] = "FAILURE"
|
||||||
except Exception as e: logger.error(traceback.format_exc())
|
except Exception as e:
|
||||||
|
logger.error(e)
|
||||||
|
logger.error(traceback.format_exc())
|
||||||
|
result["status"] = "FAILURE"
|
||||||
try:
|
try:
|
||||||
json_result = jsonable_encoder(result, exclude_unset=True)
|
json_result = jsonable_encoder(result, exclude_unset=True)
|
||||||
return JSONResponse(json_result)
|
return JSONResponse(json_result)
|
||||||
@@ -131,13 +117,22 @@ def get_status(task_id, email = Depends(get_bearer_auth)):
|
|||||||
|
|
||||||
|
|
||||||
@app.delete("/tasks/{task_id}")
|
@app.delete("/tasks/{task_id}")
|
||||||
def get_status(task_id, db: Session = Depends(get_db), email = Depends(get_bearer_auth)):
|
def delete_task(task_id, db: Session = Depends(get_db), email = Depends(get_bearer_auth)):
|
||||||
logger.info(f"deleting task {task_id} request by {email}")
|
logger.info(f"deleting task {task_id} request by {email}")
|
||||||
return JSONResponse({
|
return JSONResponse({
|
||||||
"id": task_id,
|
"id": task_id,
|
||||||
"deleted": crud.soft_delete_task(db, task_id, email)
|
"deleted": crud.soft_delete_task(db, task_id, email)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
@app.post("/sheet", status_code=201)
|
||||||
|
def archive_sheet(sheet:schemas.SubmitSheet, email = Depends(get_bearer_auth)):
|
||||||
|
logger.info(f"SHEET TASK for {sheet=}")
|
||||||
|
sheet.author_id = email
|
||||||
|
if not sheet.sheet_name and not sheet.sheet_id:
|
||||||
|
raise HTTPException(status_code=422, detail=f"sheet name or id is required")
|
||||||
|
task = create_sheet_task.delay(sheet.json())
|
||||||
|
return JSONResponse({"id": task.id})
|
||||||
|
|
||||||
# Basic protected logic to allow access to 1 static file
|
# Basic protected logic to allow access to 1 static file
|
||||||
SF = os.environ.get("STATIC_FILE", "")
|
SF = os.environ.get("STATIC_FILE", "")
|
||||||
if len(SF) > 1 and os.path.isfile(SF):
|
if len(SF) > 1 and os.path.isfile(SF):
|
||||||
|
|||||||
@@ -1,133 +1,134 @@
|
|||||||
#
|
|
||||||
# These requirements were autogenerated by pipenv
|
|
||||||
# To regenerate from the project's Pipfile, run:
|
|
||||||
#
|
|
||||||
# pipenv lock --requirements
|
|
||||||
#
|
|
||||||
|
|
||||||
# -i https://pypi.org/simple
|
# -i https://pypi.org/simple
|
||||||
aiofiles==0.6.0
|
aiofiles==0.6.0
|
||||||
|
aiohttp==3.8.4 ; python_version >= '3.6'
|
||||||
|
aiosignal==1.3.1 ; python_version >= '3.7'
|
||||||
aiosqlite==0.19.0
|
aiosqlite==0.19.0
|
||||||
alembic==1.10.3
|
alembic==1.11.1
|
||||||
amqp==2.6.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
|
amqp==2.6.1 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
|
||||||
anyio==3.6.2; python_full_version >= '3.6.2'
|
anyio==3.6.2 ; python_full_version >= '3.6.2'
|
||||||
argparse==1.4.0
|
argparse==1.4.0
|
||||||
async-generator==1.10; python_version >= '3.5'
|
async-generator==1.10 ; python_version >= '3.5'
|
||||||
attrs==23.1.0; python_version >= '3.7'
|
async-timeout==4.0.2 ; python_version >= '3.6'
|
||||||
|
attrs==23.1.0 ; python_version >= '3.7'
|
||||||
authlib==0.15.6
|
authlib==0.15.6
|
||||||
auto-archiver==0.5.6
|
auto-archiver==0.5.17
|
||||||
beautifulsoup4==4.12.2; python_version >= '3.6'
|
beautifulsoup4==4.12.2 ; python_full_version >= '3.6.0'
|
||||||
billiard==3.6.4.0
|
billiard==3.6.4.0
|
||||||
boto3==1.26.115; python_version >= '3.7'
|
blinker==1.6.2 ; python_version >= '3.7'
|
||||||
botocore==1.29.115; python_version >= '3.7'
|
boto3==1.26.138 ; python_version >= '3.7'
|
||||||
brotli==1.0.9; platform_python_implementation == 'CPython'
|
botocore==1.29.138 ; python_version >= '3.7'
|
||||||
|
brotli==1.0.9 ; platform_python_implementation == 'CPython'
|
||||||
bs4==0.0.1
|
bs4==0.0.1
|
||||||
cachetools==5.3.0; python_version ~= '3.7'
|
cachetools==5.3.0 ; python_version ~= '3.7'
|
||||||
celery==4.4.7
|
celery==4.4.7
|
||||||
certifi==2022.12.7; python_version >= '3.6'
|
certifi==2023.5.7 ; python_version >= '3.6'
|
||||||
cffi==1.15.1
|
cffi==1.15.1
|
||||||
charset-normalizer==3.0.1
|
charset-normalizer==3.1.0 ; python_full_version >= '3.7.0'
|
||||||
click==8.1.3; python_version >= '3.7'
|
click==8.1.3 ; python_version >= '3.7'
|
||||||
cloudscraper==1.2.69
|
cloudscraper==1.2.71
|
||||||
cryptography==38.0.4; python_version >= '3.6'
|
cryptography==38.0.4 ; python_version >= '3.6'
|
||||||
dataclasses-json==0.5.7; python_version >= '3.6'
|
dataclasses-json==0.5.7 ; python_version >= '3.6'
|
||||||
dateparser==1.1.8; python_version >= '3.7'
|
dateparser==1.1.8 ; python_version >= '3.7'
|
||||||
exceptiongroup==1.1.1; python_version < '3.11'
|
exceptiongroup==1.1.1 ; python_version < '3.11'
|
||||||
|
fastapi==0.95.2
|
||||||
fastapi-utils==0.2.1
|
fastapi-utils==0.2.1
|
||||||
fastapi==0.95.1
|
|
||||||
ffmpeg-python==0.2.0
|
ffmpeg-python==0.2.0
|
||||||
filelock==3.12.0; python_version >= '3.7'
|
filelock==3.12.0 ; python_version >= '3.7'
|
||||||
flask==2.2.3; python_version >= '3.7'
|
flask==2.3.2 ; python_version >= '3.8'
|
||||||
flower==0.9.7
|
flower==0.9.7
|
||||||
future==0.18.3; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'
|
frozenlist==1.3.3 ; python_version >= '3.7'
|
||||||
google-api-core==2.11.0; python_version >= '3.7'
|
future==0.18.3 ; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'
|
||||||
google-api-python-client==2.86.0; python_version >= '3.7'
|
google-api-core==2.11.0 ; python_version >= '3.7'
|
||||||
|
google-api-python-client==2.86.0 ; python_version >= '3.7'
|
||||||
|
google-auth==2.18.1 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
|
||||||
google-auth-httplib2==0.1.0
|
google-auth-httplib2==0.1.0
|
||||||
google-auth-oauthlib==1.0.0; python_version >= '3.6'
|
google-auth-oauthlib==1.0.0 ; python_version >= '3.6'
|
||||||
google-auth==2.17.3; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
|
googleapis-common-protos==1.59.0 ; python_version >= '3.7'
|
||||||
googleapis-common-protos==1.59.0; python_version >= '3.7'
|
greenlet==2.0.2 ; python_version >= '3' and (platform_machine == 'aarch64' or (platform_machine == 'ppc64le' or (platform_machine == 'x86_64' or (platform_machine == 'amd64' or (platform_machine == 'AMD64' or (platform_machine == 'win32' or platform_machine == 'WIN32'))))))
|
||||||
greenlet==2.0.2; python_version >= '3' and platform_machine == 'aarch64' or (platform_machine == 'ppc64le' or (platform_machine == 'x86_64' or (platform_machine == 'amd64' or (platform_machine == 'AMD64' or (platform_machine == 'win32' or platform_machine == 'WIN32')))))
|
gspread==5.9.0 ; python_version not in '3.0, 3.1, 3.2, 3.3' and python_version >= '3.6'
|
||||||
gspread==5.8.0; python_version not in '3.0, 3.1, 3.2, 3.3' and python_version >= '3.6'
|
h11==0.14.0 ; python_version >= '3.7'
|
||||||
h11==0.14.0; python_version >= '3.7'
|
httpcore==0.17.2 ; python_version >= '3.7'
|
||||||
httplib2==0.22.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
|
httplib2==0.22.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
|
||||||
humanize==4.6.0; python_version >= '3.7'
|
httpx==0.24.1 ; python_version >= '3.7'
|
||||||
idna==3.4; python_version >= '3.5'
|
humanize==4.6.0 ; python_version >= '3.7'
|
||||||
iniconfig==2.0.0; python_version >= '3.7'
|
idna==3.4 ; python_version >= '3.5'
|
||||||
instaloader==4.9.6; python_version >= '3.8'
|
iniconfig==2.0.0 ; python_version >= '3.7'
|
||||||
itsdangerous==2.1.2; python_version >= '3.7'
|
instaloader==4.9.6 ; python_version >= '3.8'
|
||||||
|
itsdangerous==2.1.2 ; python_version >= '3.7'
|
||||||
jinja2==3.1.2
|
jinja2==3.1.2
|
||||||
jmespath==1.0.1; python_version >= '3.7'
|
jmespath==1.0.1 ; python_version >= '3.7'
|
||||||
kombu==4.6.11; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
|
kombu==4.6.11 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
|
||||||
loguru==0.7.0
|
loguru==0.7.0
|
||||||
lxml==4.9.2; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
|
lxml==4.9.2 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
|
||||||
mako==1.2.4; python_version >= '3.7'
|
mako==1.2.4 ; python_version >= '3.7'
|
||||||
markdown-it-py==2.2.0; python_version >= '3.7'
|
markdown-it-py==2.2.0 ; python_version >= '3.7'
|
||||||
markupsafe==2.1.2; python_version >= '3.7'
|
markupsafe==2.1.2 ; python_version >= '3.7'
|
||||||
|
marshmallow==3.19.0 ; python_version >= '3.7'
|
||||||
marshmallow-enum==1.5.1
|
marshmallow-enum==1.5.1
|
||||||
marshmallow==3.19.0; python_version >= '3.7'
|
mdurl==0.1.2 ; python_version >= '3.7'
|
||||||
mdurl==0.1.2; python_version >= '3.7'
|
multidict==6.0.4 ; python_version >= '3.7'
|
||||||
mutagen==1.46.0; python_version >= '3.7'
|
mutagen==1.46.0 ; python_version >= '3.7'
|
||||||
mypy-extensions==1.0.0; python_version >= '3.5'
|
mypy-extensions==1.0.0 ; python_version >= '3.5'
|
||||||
oauth2client==4.1.3
|
oauth2client==4.1.3
|
||||||
oauthlib==3.2.2; python_version >= '3.6'
|
oauthlib==3.2.2 ; python_version >= '3.6'
|
||||||
outcome==1.2.0; python_version >= '3.7'
|
outcome==1.2.0 ; python_version >= '3.7'
|
||||||
packaging==23.1; python_version >= '3.7'
|
packaging==23.1 ; python_version >= '3.7'
|
||||||
pluggy==0.13.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
|
pluggy==0.13.1 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
|
||||||
prometheus-client==0.8.0
|
prometheus-client==0.8.0
|
||||||
protobuf==4.22.3; python_version >= '3.7'
|
protobuf==4.23.1 ; python_version >= '3.7'
|
||||||
py==1.11.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
|
py==1.11.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
|
||||||
pyaes==1.6.1
|
pyaes==1.6.1
|
||||||
pyasn1-modules==0.2.8
|
pyasn1==0.5.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
|
||||||
pyasn1==0.4.8
|
pyasn1-modules==0.3.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
|
||||||
pycparser==2.21
|
pycparser==2.21
|
||||||
pycryptodomex==3.17; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
|
pycryptodomex==3.18.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
|
||||||
pydantic==1.10.7; python_version >= '3.7'
|
pydantic==1.10.7 ; python_version >= '3.7'
|
||||||
pygments==2.15.0; python_version >= '3.7'
|
pygments==2.15.1 ; python_version >= '3.7'
|
||||||
pyparsing==3.0.9; python_version >= '3.1'
|
pyparsing==3.0.9 ; python_version >= '3.1'
|
||||||
pysocks==1.7.1
|
pysocks==1.7.1
|
||||||
pytest==6.2.4
|
pytest==6.2.4
|
||||||
python-dateutil==2.8.2; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
|
python-dateutil==2.8.2 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
|
||||||
python-dotenv==1.0.0
|
python-dotenv==1.0.0
|
||||||
python-slugify==8.0.1; python_version >= '3.7'
|
python-slugify==8.0.1 ; python_version >= '3.7'
|
||||||
python-twitter-v2==0.8.1; python_version >= '3.6' and python_version < '4'
|
python-twitter-v2==0.8.1 ; python_version >= '3.6' and python_version < '4.0'
|
||||||
pytz-deprecation-shim==0.1.0.post0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
|
|
||||||
pytz==2023.3
|
pytz==2023.3
|
||||||
pyyaml==6.0; python_version >= '3.6'
|
pyyaml==6.0 ; python_version >= '3.6'
|
||||||
redis==3.5.3
|
redis==3.5.3
|
||||||
regex==2023.3.23; python_version >= '3.8'
|
regex==2023.5.5 ; python_version >= '3.6'
|
||||||
requests-oauthlib==1.3.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
|
requests==2.31.0
|
||||||
requests-toolbelt==0.10.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
|
requests-oauthlib==1.3.1 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
|
||||||
requests==2.28.2
|
requests-toolbelt==1.0.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
|
||||||
rich==13.3.4; python_version >= '3.7'
|
rich==13.3.5 ; python_full_version >= '3.7.0'
|
||||||
rsa==4.9; python_version >= '3.6' and python_version < '4'
|
rsa==4.9 ; python_version >= '3.6' and python_version < '4'
|
||||||
s3transfer==0.6.0; python_version >= '3.7'
|
s3transfer==0.6.1 ; python_version >= '3.7'
|
||||||
selenium==4.8.3; python_version >= '3.7'
|
selenium==4.9.1 ; python_version >= '3.7'
|
||||||
six==1.16.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
|
six==1.16.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
|
||||||
sniffio==1.3.0; python_version >= '3.7'
|
sniffio==1.3.0 ; python_version >= '3.7'
|
||||||
snscrape==0.6.2.20230320; python_version ~= '3.8'
|
snscrape==0.6.2.20230320 ; python_version ~= '3.8'
|
||||||
sortedcontainers==2.4.0
|
sortedcontainers==2.4.0
|
||||||
soupsieve==2.4.1; python_version >= '3.7'
|
soupsieve==2.4.1 ; python_version >= '3.7'
|
||||||
sqlalchemy==1.4.47
|
sqlalchemy==1.4.48
|
||||||
starlette==0.26.1; python_version >= '3.7'
|
starlette==0.27.0 ; python_version >= '3.7'
|
||||||
telethon==1.28.5; python_version >= '3.5'
|
telethon==1.28.5 ; python_version >= '3.5'
|
||||||
text-unidecode==1.3
|
text-unidecode==1.3
|
||||||
tiktok-downloader==0.3.4
|
tiktok-downloader==0.3.5
|
||||||
toml==0.10.2; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'
|
toml==0.10.2 ; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2'
|
||||||
tornado==6.3; python_full_version >= '3.5.2'
|
tornado==6.3.2 ; python_version >= '3.5.2'
|
||||||
tqdm==4.65.0; python_version >= '3.7'
|
tqdm==4.65.0 ; python_version >= '3.7'
|
||||||
trio-websocket==0.10.2; python_version >= '3.7'
|
trio==0.22.0 ; python_version >= '3.7'
|
||||||
trio==0.22.0; python_version >= '3.7'
|
trio-websocket==0.10.2 ; python_version >= '3.7'
|
||||||
typing-extensions==4.5.0; python_version >= '3.7'
|
typing-extensions==4.6.0 ; python_version >= '3.7'
|
||||||
typing-inspect==0.8.0
|
typing-inspect==0.8.0
|
||||||
tzdata==2023.3; python_version >= '3.6'
|
tzlocal==5.0.1 ; python_version >= '3.7'
|
||||||
tzlocal==4.3; python_version >= '3.7'
|
uritemplate==4.1.1 ; python_version >= '3.6'
|
||||||
uritemplate==4.1.1; python_version >= '3.6'
|
urllib3==1.26.16 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
|
||||||
urllib3==1.26.14; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'
|
uvicorn==0.22.0
|
||||||
uvicorn==0.21.1
|
uwsgi==2.0.21
|
||||||
vine==1.3.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
|
vine==1.3.0 ; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
|
||||||
vk-api==11.9.9
|
vk-api==11.9.9
|
||||||
vk-url-scraper==0.3.15; python_version >= '3.7'
|
vk-url-scraper==0.3.24 ; python_version >= '3.7'
|
||||||
websockets==10.4; python_version >= '3.7'
|
websockets==11.0.3 ; python_version >= '3.7'
|
||||||
werkzeug==2.2.3; python_version >= '3.7'
|
werkzeug==2.3.4 ; python_version >= '3.8'
|
||||||
wsproto==1.2.0; python_version >= '3.7'
|
wsproto==1.2.0 ; python_full_version >= '3.7.0'
|
||||||
yt-dlp==2023.2.17; python_version >= '3.7'
|
yarl==1.9.2 ; python_version >= '3.7'
|
||||||
|
yt-dlp==2023.3.4 ; python_version >= '3.7'
|
||||||
|
|||||||
@@ -4,18 +4,21 @@ from fastapi import HTTPException, status, Depends
|
|||||||
from fastapi.security import HTTPBasic, HTTPBasicCredentials, HTTPBearer, HTTPAuthorizationCredentials
|
from fastapi.security import HTTPBasic, HTTPBasicCredentials, HTTPBearer, HTTPAuthorizationCredentials
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Configuration
|
# Configuration
|
||||||
GOOGLE_CHROME_APP_ID = os.environ.get("GOOGLE_CHROME_APP_ID")
|
CHROME_APP_IDS = set([app_id.strip() for app_id in os.environ.get("CHROME_APP_IDS", "").split(",")])
|
||||||
assert len(GOOGLE_CHROME_APP_ID)>10, "GOOGLE_CHROME_APP_ID env variable not set"
|
assert len(CHROME_APP_IDS) > 0, "CHROME_APP_IDS env variable not properly set, it's a csv"
|
||||||
ALLOWED_EMAILS = set([e.strip().lower() for e in os.environ.get("ALLOWED_EMAILS", "").split(",")])
|
for app_id in CHROME_APP_IDS:
|
||||||
assert len(GOOGLE_CHROME_APP_ID)>=1, "at least one ALLOWED_EMAILS is required from the env variable"
|
assert len(app_id) > 10, f"CHROME_APP_IDS got invalid id: {app_id} env variable not set"
|
||||||
logger.info(f"{len(ALLOWED_EMAILS)=}")
|
logger.info(f"{CHROME_APP_IDS=}")
|
||||||
|
|
||||||
|
BLOCKED_EMAILS = set([e.strip().lower() for e in os.environ.get("BLOCKED_EMAILS", "").split(",")])
|
||||||
|
logger.info(f"{len(BLOCKED_EMAILS)=}")
|
||||||
|
|
||||||
basic_security = HTTPBasic()
|
basic_security = HTTPBasic()
|
||||||
bearer_security = HTTPBearer()
|
bearer_security = HTTPBearer()
|
||||||
|
|
||||||
#--------------------- Bearer Auth
|
# --------------------- Bearer Auth
|
||||||
|
|
||||||
|
|
||||||
async def get_bearer_auth(credentials: HTTPAuthorizationCredentials = Depends(bearer_security)):
|
async def get_bearer_auth(credentials: HTTPAuthorizationCredentials = Depends(bearer_security)):
|
||||||
# validates the Bearer token in the case that it requires it
|
# validates the Bearer token in the case that it requires it
|
||||||
@@ -28,20 +31,20 @@ async def get_bearer_auth(credentials: HTTPAuthorizationCredentials = Depends(be
|
|||||||
detail=info,
|
detail=info,
|
||||||
headers={"WWW-Authenticate": "Bearer"},
|
headers={"WWW-Authenticate": "Bearer"},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def authenticate_user(access_token):
|
def authenticate_user(access_token):
|
||||||
# https://cloud.google.com/docs/authentication/token-types#access
|
# https://cloud.google.com/docs/authentication/token-types#access
|
||||||
if type(access_token)!=str or len(access_token)<10: return False, "invalid access_token"
|
if type(access_token) != str or len(access_token) < 10: return False, "invalid access_token"
|
||||||
r = requests.get("https://oauth2.googleapis.com/tokeninfo", {"access_token":access_token})
|
r = requests.get("https://oauth2.googleapis.com/tokeninfo", {"access_token": access_token})
|
||||||
if r.status_code!=200: return False, "error occurred"
|
if r.status_code != 200: return False, "error occurred"
|
||||||
try:
|
try:
|
||||||
j = r.json()
|
j = r.json()
|
||||||
if j.get("azp") != GOOGLE_CHROME_APP_ID and j.get("aud")!=GOOGLE_CHROME_APP_ID:
|
if j.get("azp") not in CHROME_APP_IDS and j.get("aud") not in CHROME_APP_IDS:
|
||||||
return False, f"token does not belong to correct APP_ID"
|
return False, f"token does not belong to valid APP_ID"
|
||||||
# if j.get("email") not in ALLOWED_EMAILS:
|
if j.get("email") in BLOCKED_EMAILS:
|
||||||
if not custom_is_email_allowed(j.get("email"), any_bellingcat_email=True):
|
|
||||||
return False, f"email '{j.get('email')}' not allowed"
|
return False, f"email '{j.get('email')}' not allowed"
|
||||||
if j.get("email_verified") != "true":
|
if j.get("email_verified") != "true":
|
||||||
return False, f"email '{j.get('email')}' not verified"
|
return False, f"email '{j.get('email')}' not verified"
|
||||||
if int(j.get("expires_in", -1)) <= 0:
|
if int(j.get("expires_in", -1)) <= 0:
|
||||||
return False, "Token expired"
|
return False, "Token expired"
|
||||||
@@ -50,12 +53,11 @@ def authenticate_user(access_token):
|
|||||||
logger.warning(f"EXCEPTION occurred: {e}")
|
logger.warning(f"EXCEPTION occurred: {e}")
|
||||||
return False, f"EXCEPTION occurred"
|
return False, f"EXCEPTION occurred"
|
||||||
|
|
||||||
def custom_is_email_allowed(email, any_bellingcat_email=False):
|
|
||||||
return email.lower() in ALLOWED_EMAILS or (any_bellingcat_email and re.match(r'^[\w.]+@bellingcat\.com$', email))
|
# --------------------- Basic Auth
|
||||||
|
SFP = os.environ.get("STATIC_FILE_PASSWORD", "") # min length is 20 chars
|
||||||
|
|
||||||
|
|
||||||
#--------------------- Basic Auth
|
|
||||||
SFP = os.environ.get("STATIC_FILE_PASSWORD", "") # min length is 20 chars
|
|
||||||
async def get_basic_auth(credentials: HTTPBasicCredentials = Depends(basic_security)):
|
async def get_basic_auth(credentials: HTTPBasicCredentials = Depends(basic_security)):
|
||||||
# validates that the Basic token in the case that it requires it
|
# validates that the Basic token in the case that it requires it
|
||||||
assert len(SFP) >= 20, "Invalid STATIC_FILE_PASSWORD, must be at least 20 chars"
|
assert len(SFP) >= 20, "Invalid STATIC_FILE_PASSWORD, must be at least 20 chars"
|
||||||
@@ -66,4 +68,4 @@ async def get_basic_auth(credentials: HTTPBasicCredentials = Depends(basic_secur
|
|||||||
status_code=status.HTTP_401_UNAUTHORIZED,
|
status_code=status.HTTP_401_UNAUTHORIZED,
|
||||||
detail="Wrong static file access credentials",
|
detail="Wrong static file access credentials",
|
||||||
headers={"WWW-Authenticate": "Basic"}
|
headers={"WWW-Authenticate": "Basic"}
|
||||||
)
|
)
|
||||||
|
|||||||
116
src/worker.py
116
src/worker.py
@@ -1,22 +1,21 @@
|
|||||||
|
|
||||||
import os, re, traceback, yaml
|
import os, traceback, yaml, datetime
|
||||||
|
from typing import List, Set
|
||||||
|
|
||||||
from celery import Celery, states
|
from celery import Celery
|
||||||
from celery.exceptions import Ignore
|
|
||||||
from celery.signals import task_failure
|
from celery.signals import task_failure
|
||||||
from auto_archiver import Config, ArchivingOrchestrator, Metadata
|
from auto_archiver import Config, ArchivingOrchestrator, Metadata
|
||||||
# from auto_archiver.enrichers import ScreenshotEnricher
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from db import crud, schemas, models
|
from db import crud, schemas, models
|
||||||
from db.database import engine, SessionLocal
|
from db.database import SessionLocal
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
import json
|
import json
|
||||||
|
|
||||||
celery = Celery(__name__)
|
celery = Celery(__name__)
|
||||||
celery.conf.broker_url = os.environ.get("CELERY_BROKER_URL", "redis://localhost:6379")
|
celery.conf.broker_url = os.environ.get("CELERY_BROKER_URL", "redis://localhost:6379")
|
||||||
celery.conf.result_backend = os.environ.get("CELERY_RESULT_BACKEND", "redis://localhost:6379")
|
celery.conf.result_backend = os.environ.get("CELERY_RESULT_BACKEND", "redis://localhost:6379")
|
||||||
USER_GROUPS_FILENAME=os.environ.get("USER_GROUPS_FILENAME", "user-groups.yaml")
|
USER_GROUPS_FILENAME = os.environ.get("USER_GROUPS_FILENAME", "user-groups.yaml")
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
@@ -25,38 +24,56 @@ def get_db():
|
|||||||
try: yield session
|
try: yield session
|
||||||
finally: session.close()
|
finally: session.close()
|
||||||
|
|
||||||
@celery.task(name="create_archive_task", bind=True, autoretry_for=(Exception,), retry_backoff=True, retry_kwargs={'max_retries': 5})
|
|
||||||
|
@celery.task(name="create_archive_task", bind=True, autoretry_for=(Exception,), retry_backoff=True, retry_kwargs={'max_retries': 5})
|
||||||
def create_archive_task(self, archive_json: str):
|
def create_archive_task(self, archive_json: str):
|
||||||
|
|
||||||
archive = schemas.ArchiveCreate.parse_raw(archive_json)
|
archive = schemas.ArchiveCreate.parse_raw(archive_json)
|
||||||
if not archive.public and archive.group_id and len(archive.group_id) > 0:
|
|
||||||
# ensure group is valid for user
|
if (em := is_group_invalid_for_user(archive.public, archive.group_id, archive.author_id)): return {"error": em}
|
||||||
with get_db() as session:
|
|
||||||
db_group = crud.get_group_for_user(session, archive.group_id, archive.author_id)
|
|
||||||
if not db_group:
|
|
||||||
logger.error(em := f"User {archive.author_id} is not part of {archive.group_id}, no permission")
|
|
||||||
return {"error": em}
|
|
||||||
|
|
||||||
url = archive.url
|
url = archive.url
|
||||||
logger.info(f"{url=}")
|
logger.info(f"{url=} {archive=}")
|
||||||
logger.info(f"{archive=}")
|
|
||||||
orchestrator = choose_orchestrator(archive.group_id, archive.author_id)
|
orchestrator = choose_orchestrator(archive.group_id, archive.author_id)
|
||||||
result = orchestrator.feed_item(Metadata().set_url(url))
|
result = orchestrator.feed_item(Metadata().set_url(url))
|
||||||
if not result:
|
|
||||||
logger.error(f"UNABLE TO archive: {url}")
|
|
||||||
return {"error": "unable to archive"}
|
|
||||||
|
|
||||||
result_json = result.to_json()
|
try:
|
||||||
with get_db() as session:
|
insert_result_into_db(result, archive.tags, archive.public, archive.group_id, archive.author_id, self.request.id)
|
||||||
# create DB URLs
|
except Exception as e:
|
||||||
db_urls = [models.ArchiveUrl(url=url, key=m.get("id", f"media_{i}")) for i, m in enumerate(result.media) for url in m.urls]
|
logger.error(e)
|
||||||
# create DB TAGs if needed
|
logger.error(traceback.format_exc())
|
||||||
db_tags = [crud.create_tag(session, tag) for tag in archive.tags]
|
return {"error": e}
|
||||||
# insert archive
|
return result.to_json()
|
||||||
db_task = crud.create_task(session, task=schemas.ArchiveCreate(id=self.request.id, url=url, result=json.loads(result_json), public=archive.public, author_id=archive.author_id, group_id=archive.group_id), tags=db_tags, urls=db_urls)
|
|
||||||
logger.debug(f"Added {db_task.id=} to database on {db_task.created_at}")
|
|
||||||
return result_json
|
|
||||||
|
|
||||||
|
|
||||||
|
@celery.task(name="create_sheet_task", bind=True, autoretry_for=(Exception,), retry_backoff=True, retry_kwargs={'max_retries': 0})
|
||||||
|
def create_sheet_task(self, sheet_json: str):
|
||||||
|
sheet = schemas.SubmitSheet.parse_raw(sheet_json)
|
||||||
|
sheet.tags.add("gsheet")
|
||||||
|
logger.info(f"SHEET START {sheet=}")
|
||||||
|
|
||||||
|
if (em := is_group_invalid_for_user(sheet.public, sheet.group_id, sheet.author_id)): return {"error": em}
|
||||||
|
|
||||||
|
config = Config()
|
||||||
|
#TODO: use choose_orchestrator and overwrite the feeder
|
||||||
|
config.parse(use_cli=False, yaml_config_filename="secrets/orchestration-sheet.yaml", overwrite_configs={"configurations": {"gsheet_feeder": {"sheet": sheet.sheet_name, "sheet_id": sheet.sheet_id, "header": sheet.header}}})
|
||||||
|
orchestrator = ArchivingOrchestrator(config)
|
||||||
|
|
||||||
|
stats = {"archived": 0, "failed": 0, "errors": []}
|
||||||
|
for result in orchestrator.feed():
|
||||||
|
try:
|
||||||
|
insert_result_into_db(result, sheet.tags, sheet.public, sheet.group_id, sheet.author_id, models.generate_uuid())
|
||||||
|
stats["archived"]+=1
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(e)
|
||||||
|
logger.error(traceback.format_exc())
|
||||||
|
stats["failed"]+=1
|
||||||
|
stats["errors"].append(e)
|
||||||
|
|
||||||
|
logger.info(f"SHEET DONE {sheet=}")
|
||||||
|
return {"success": True, "sheet": sheet.sheet_name, "sheet_id": sheet.sheet_id, "time": datetime.datetime.now().isoformat(), **stats}
|
||||||
|
|
||||||
|
|
||||||
|
@task_failure.connect(sender=create_sheet_task)
|
||||||
@task_failure.connect(sender=create_archive_task)
|
@task_failure.connect(sender=create_archive_task)
|
||||||
def task_failure_notifier(sender=None, **kwargs):
|
def task_failure_notifier(sender=None, **kwargs):
|
||||||
logger.warning("😅 From task_failure_notifier ==> Task failed successfully! ")
|
logger.warning("😅 From task_failure_notifier ==> Task failed successfully! ")
|
||||||
@@ -64,6 +81,7 @@ def task_failure_notifier(sender=None, **kwargs):
|
|||||||
logger.error(kwargs['traceback'])
|
logger.error(kwargs['traceback'])
|
||||||
logger.error("\n".join(traceback.format_list(traceback.extract_tb(kwargs['traceback']))))
|
logger.error("\n".join(traceback.format_list(traceback.extract_tb(kwargs['traceback']))))
|
||||||
|
|
||||||
|
|
||||||
def choose_orchestrator(group, email):
|
def choose_orchestrator(group, email):
|
||||||
global ORCHESTRATORS
|
global ORCHESTRATORS
|
||||||
if group not in ORCHESTRATORS: group = get_user_first_group(email)
|
if group not in ORCHESTRATORS: group = get_user_first_group(email)
|
||||||
@@ -71,6 +89,7 @@ def choose_orchestrator(group, email):
|
|||||||
logger.info(f"CHOOSE Orchestrator for {group=}, {email=}")
|
logger.info(f"CHOOSE Orchestrator for {group=}, {email=}")
|
||||||
return ArchivingOrchestrator(ORCHESTRATORS.get(group))
|
return ArchivingOrchestrator(ORCHESTRATORS.get(group))
|
||||||
|
|
||||||
|
|
||||||
def read_user_groups():
|
def read_user_groups():
|
||||||
# read yaml safely
|
# read yaml safely
|
||||||
with open(USER_GROUPS_FILENAME) as inf:
|
with open(USER_GROUPS_FILENAME) as inf:
|
||||||
@@ -80,6 +99,7 @@ def read_user_groups():
|
|||||||
logger.error(f"could not open user groups filename {USER_GROUPS_FILENAME}: {e}")
|
logger.error(f"could not open user groups filename {USER_GROUPS_FILENAME}: {e}")
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
|
|
||||||
def get_user_first_group(email):
|
def get_user_first_group(email):
|
||||||
user_groups_yaml = read_user_groups()
|
user_groups_yaml = read_user_groups()
|
||||||
groups = user_groups_yaml.get("users", {}).get(email, [])
|
groups = user_groups_yaml.get("users", {}).get(email, [])
|
||||||
@@ -94,12 +114,12 @@ def load_orchestrators():
|
|||||||
reads the orchestrators key in the config file to load different orchestrators for different groups
|
reads the orchestrators key in the config file to load different orchestrators for different groups
|
||||||
"""
|
"""
|
||||||
user_groups_yaml = read_user_groups()
|
user_groups_yaml = read_user_groups()
|
||||||
|
|
||||||
orchestrators_config = user_groups_yaml.get("orchestrators", {})
|
orchestrators_config = user_groups_yaml.get("orchestrators", {})
|
||||||
assert len(orchestrators_config), f"No orchestrators key found in {USER_GROUPS_FILENAME}. please see the example file"
|
assert len(orchestrators_config), f"No orchestrators key found in {USER_GROUPS_FILENAME}. please see the example file"
|
||||||
assert "default" in orchestrators_config, "please include a 'default' orchestrator to be used when the user has no group"
|
assert "default" in orchestrators_config, "please include a 'default' orchestrator to be used when the user has no group"
|
||||||
logger.debug(f"Found {len(orchestrators_config)} group orchestrators.")
|
logger.debug(f"Found {len(orchestrators_config)} group orchestrators.")
|
||||||
|
|
||||||
for group, config_filename in orchestrators_config.items():
|
for group, config_filename in orchestrators_config.items():
|
||||||
config = Config()
|
config = Config()
|
||||||
config.parse(use_cli=False, yaml_config_filename=config_filename)
|
config.parse(use_cli=False, yaml_config_filename=config_filename)
|
||||||
@@ -107,7 +127,35 @@ def load_orchestrators():
|
|||||||
return ORCHESTRATORS
|
return ORCHESTRATORS
|
||||||
|
|
||||||
|
|
||||||
## INIT
|
def is_group_invalid_for_user(public: bool, group_id: str, author_id: str):
|
||||||
|
"""
|
||||||
|
ensures that, if a group is specified, the user belongs to it.
|
||||||
|
if public is true the requirement is not needed
|
||||||
|
returns an error message if invalid, or False if all is good.
|
||||||
|
"""
|
||||||
|
if not public and group_id and len(group_id) > 0:
|
||||||
|
# ensure group is valid for user
|
||||||
|
with get_db() as session:
|
||||||
|
db_group = crud.get_group_for_user(session, group_id, author_id)
|
||||||
|
if not db_group:
|
||||||
|
logger.error(em := f"User {author_id} is not part of {group_id}, no permission")
|
||||||
|
return em
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def insert_result_into_db(result: Metadata, tags: Set[str], public: bool, group_id: str, author_id: str, task_id:str):
|
||||||
|
logger.info(f"INSERTING {public=} {result} into {task_id}")
|
||||||
|
assert result, "UNABLE TO archive: {url}"
|
||||||
|
with get_db() as session:
|
||||||
|
# create DB URLs
|
||||||
|
db_urls = [models.ArchiveUrl(url=url, key=m.get("id", f"media_{i}")) for i, m in enumerate(result.media) for url in m.urls]
|
||||||
|
# create DB TAGs if needed
|
||||||
|
db_tags = [crud.create_tag(session, tag) for tag in tags]
|
||||||
|
# insert archive
|
||||||
|
db_task = crud.create_task(session, task=schemas.ArchiveCreate(id=task_id, url=result.get_url(), result=json.loads(result.to_json()), public=public, author_id=author_id, group_id=group_id), tags=db_tags, urls=db_urls)
|
||||||
|
logger.debug(f"Added {db_task.id=} to database on {db_task.created_at}")
|
||||||
|
|
||||||
|
|
||||||
|
# INIT
|
||||||
ORCHESTRATORS = {}
|
ORCHESTRATORS = {}
|
||||||
load_orchestrators()
|
load_orchestrators()
|
||||||
|
|||||||
Reference in New Issue
Block a user