feat: check for proper spreadsheet service_account access before archiving, plus email notification

This commit is contained in:
msramalho
2026-03-03 13:05:27 +00:00
parent 62a3bc72f6
commit a4ec468cac
6 changed files with 582 additions and 0 deletions

133
app/shared/utils/sheets.py Normal file
View File

@@ -0,0 +1,133 @@
"""Utilities for checking Google Sheet access permissions."""
from functools import lru_cache
import requests as http_requests
import yaml
from app.shared.log import logger
@lru_cache(maxsize=32)
def get_service_account_json_path(orchestrator_sheet_path: str) -> str | None:
"""
Extract the service account JSON file path from an orchestrator sheet
YAML config.
Returns:
Path to the service account JSON file, or None if not found.
"""
if not orchestrator_sheet_path:
return None
try:
with open(orchestrator_sheet_path) as f:
orch = yaml.safe_load(f)
except Exception as e:
logger.warning(
f"Could not read orchestrator sheet config {orchestrator_sheet_path}: {e}"
)
return None
if not isinstance(orch, dict):
return None
def find_key(d: dict, key: str):
for k, v in d.items():
if k == key:
return v
if isinstance(v, dict):
if result := find_key(v, key):
return result
return None
return find_key(orch, "service_account")
def check_sheet_write_access(
service_account_json_path: str, sheet_id: str
) -> bool | None:
"""
Check if a Google service account has write (Editor) access to a Google
Sheet using the Google Drive API.
Returns:
True: Service account has write access.
False: Service account does NOT have write access (or no access).
None: Could not determine (network/auth error) — caller should
proceed with archiving and let it fail naturally.
"""
try:
from google.auth.transport.requests import Request
from google.oauth2.service_account import Credentials
creds = Credentials.from_service_account_file(
service_account_json_path,
scopes=["https://www.googleapis.com/auth/drive.metadata.readonly"],
)
creds.refresh(Request())
resp = http_requests.get(
f"https://www.googleapis.com/drive/v3/files/{sheet_id}",
params={
"fields": "capabilities/canEdit",
"supportsAllDrives": "true",
},
headers={"Authorization": f"Bearer {creds.token}"},
timeout=10,
)
if resp.status_code == 404:
return False
if resp.status_code == 403:
return False
if resp.status_code == 200:
return resp.json().get("capabilities", {}).get("canEdit", False)
logger.warning(
f"Unexpected Drive API response {resp.status_code} for sheet {sheet_id}: {resp.text}"
)
return None
except FileNotFoundError:
logger.error(
f"Service account JSON not found: {service_account_json_path}"
)
return None
except Exception as e:
logger.warning(
f"Could not check write access for sheet {sheet_id}: {e}"
)
return None
def get_sheet_access_error(
orchestrator_sheet_path: str | None,
service_account_email: str | None,
sheet_id: str,
) -> str | None:
"""
Check if the service account has write access to a Google Sheet.
Returns:
An error message string if the sheet is NOT accessible, or None if
access is OK (or the check is indeterminate).
"""
if not orchestrator_sheet_path:
return None
sa_json_path = get_service_account_json_path(orchestrator_sheet_path)
if not sa_json_path:
return None
has_access = check_sheet_write_access(sa_json_path, sheet_id)
if has_access is False:
sa_display = (
service_account_email or "the Auto Archiver service account"
)
return (
f"The Google Sheet has not been shared with the Auto Archiver "
f"service account ({sa_display}). Please share the sheet with "
f"this email address and give it Editor permissions."
)
return None

View File

@@ -0,0 +1,171 @@
from unittest.mock import mock_open, patch
from app.shared.utils.sheets import (
check_sheet_write_access,
get_service_account_json_path,
get_sheet_access_error,
)
class TestGetServiceAccountJsonPath:
def test_returns_none_for_empty_path(self):
assert get_service_account_json_path("") is None
assert get_service_account_json_path(None) is None
def test_returns_path_from_orchestrator_yaml(self):
# The test orchestration file has a service_account key
get_service_account_json_path.cache_clear()
result = get_service_account_json_path(
"app/tests/orchestration.test.yaml"
)
assert result == "app/tests/fake_service_account.json"
def test_returns_none_for_missing_file(self):
get_service_account_json_path.cache_clear()
assert get_service_account_json_path("nonexistent/path.yaml") is None
def test_returns_none_for_invalid_yaml(self):
get_service_account_json_path.cache_clear()
with patch(
"builtins.open", mock_open(read_data="!!! invalid yaml {{{")
):
result = get_service_account_json_path("some/path.yaml")
# yaml.safe_load may return a string for some invalid inputs
# The function should not crash
assert result is None or isinstance(result, str)
def test_returns_none_when_no_service_account_key(self):
get_service_account_json_path.cache_clear()
yaml_content = "steps:\n feeders:\n - cli_feeder\n"
with patch("builtins.open", mock_open(read_data=yaml_content)):
assert get_service_account_json_path("some/path.yaml") is None
def test_finds_nested_service_account_key(self):
get_service_account_json_path.cache_clear()
yaml_content = (
"configurations:\n"
" gsheet_feeder_db:\n"
" service_account: secrets/nested_sa.json\n"
)
with patch("builtins.open", mock_open(read_data=yaml_content)):
result = get_service_account_json_path("some/path.yaml")
assert result == "secrets/nested_sa.json"
class TestCheckSheetWriteAccess:
@patch("app.shared.utils.sheets.http_requests.get")
@patch(
"google.oauth2.service_account.Credentials.from_service_account_file"
)
def test_returns_true_when_can_edit(self, m_creds, m_get):
m_creds.return_value.token = "fake-token"
m_get.return_value.status_code = 200
m_get.return_value.json.return_value = {
"capabilities": {"canEdit": True}
}
result = check_sheet_write_access("sa.json", "sheet123")
assert result is True
m_get.assert_called_once()
@patch("app.shared.utils.sheets.http_requests.get")
@patch(
"google.oauth2.service_account.Credentials.from_service_account_file"
)
def test_returns_false_when_cannot_edit(self, m_creds, m_get):
m_creds.return_value.token = "fake-token"
m_get.return_value.status_code = 200
m_get.return_value.json.return_value = {
"capabilities": {"canEdit": False}
}
result = check_sheet_write_access("sa.json", "sheet123")
assert result is False
@patch("app.shared.utils.sheets.http_requests.get")
@patch(
"google.oauth2.service_account.Credentials.from_service_account_file"
)
def test_returns_false_on_404(self, m_creds, m_get):
m_creds.return_value.token = "fake-token"
m_get.return_value.status_code = 404
result = check_sheet_write_access("sa.json", "sheet123")
assert result is False
@patch("app.shared.utils.sheets.http_requests.get")
@patch(
"google.oauth2.service_account.Credentials.from_service_account_file"
)
def test_returns_false_on_403(self, m_creds, m_get):
m_creds.return_value.token = "fake-token"
m_get.return_value.status_code = 403
result = check_sheet_write_access("sa.json", "sheet123")
assert result is False
@patch("app.shared.utils.sheets.http_requests.get")
@patch(
"google.oauth2.service_account.Credentials.from_service_account_file"
)
def test_returns_none_on_unexpected_status(self, m_creds, m_get):
m_creds.return_value.token = "fake-token"
m_get.return_value.status_code = 500
m_get.return_value.text = "Internal Server Error"
result = check_sheet_write_access("sa.json", "sheet123")
assert result is None
def test_returns_none_when_file_not_found(self):
result = check_sheet_write_access("nonexistent/sa.json", "sheet123")
assert result is None
@patch(
"google.oauth2.service_account.Credentials.from_service_account_file",
side_effect=Exception("auth failed"),
)
def test_returns_none_on_auth_error(self, m_creds):
result = check_sheet_write_access("sa.json", "sheet123")
assert result is None
class TestGetSheetAccessError:
@patch("app.shared.utils.sheets.check_sheet_write_access")
@patch("app.shared.utils.sheets.get_service_account_json_path")
def test_returns_none_when_access_ok(self, m_get_path, m_check):
m_get_path.return_value = "sa.json"
m_check.return_value = True
result = get_sheet_access_error("orch.yaml", "sa@test.com", "sheet1")
assert result is None
@patch("app.shared.utils.sheets.check_sheet_write_access")
@patch("app.shared.utils.sheets.get_service_account_json_path")
def test_returns_error_when_no_access(self, m_get_path, m_check):
m_get_path.return_value = "sa.json"
m_check.return_value = False
result = get_sheet_access_error("orch.yaml", "sa@test.com", "sheet1")
assert result is not None
assert "sa@test.com" in result
assert "Editor" in result
@patch("app.shared.utils.sheets.check_sheet_write_access")
@patch("app.shared.utils.sheets.get_service_account_json_path")
def test_returns_none_when_indeterminate(self, m_get_path, m_check):
m_get_path.return_value = "sa.json"
m_check.return_value = None
result = get_sheet_access_error("orch.yaml", "sa@test.com", "sheet1")
assert result is None
def test_returns_none_when_no_orchestrator_path(self):
assert get_sheet_access_error(None, "sa@test.com", "sheet1") is None
assert get_sheet_access_error("", "sa@test.com", "sheet1") is None
@patch("app.shared.utils.sheets.get_service_account_json_path")
def test_returns_none_when_no_sa_json_path(self, m_get_path):
m_get_path.return_value = None
result = get_sheet_access_error("orch.yaml", "sa@test.com", "sheet1")
assert result is None

View File

@@ -344,3 +344,133 @@ class TestArchiveUserSheetEndpoint:
assert r.json() == {
"detail": "User cannot manually trigger sheet archiving in this group."
}
class TestSheetAccessPermissionCheck:
"""Tests for the Google Sheet write access permission check."""
ERROR_MSG = (
"The Google Sheet has not been shared with the Auto Archiver "
"service account (sa@test.iam.gserviceaccount.com). Please "
"share the sheet with this email address and give it Editor "
"permissions."
)
@patch(
"app.web.routers.sheet.get_sheet_access_error",
return_value=ERROR_MSG,
)
def test_create_sheet_no_write_access(
self, m_access, app_with_auth, db_session
):
"""Sheet creation is blocked when the SA has no write access."""
client = TestClient(app_with_auth)
data = {
"id": "no-access-sheet",
"name": "Test Sheet",
"group_id": "spaceship",
"frequency": "daily",
}
r = client.post("/sheet/create", json=data)
assert r.status_code == HTTPStatus.FORBIDDEN
assert "service account" in r.json()["detail"]
m_access.assert_called_once()
@patch(
"app.web.routers.sheet.get_sheet_access_error",
return_value=None,
)
def test_create_sheet_access_indeterminate_proceeds(
self, m_access, app_with_auth, db_session
):
"""Sheet creation proceeds when access check is indeterminate."""
client = TestClient(app_with_auth)
data = {
"id": "maybe-access-sheet",
"name": "Test Sheet",
"group_id": "spaceship",
"frequency": "daily",
}
r = client.post("/sheet/create", json=data)
assert r.status_code == HTTPStatus.CREATED
m_access.assert_called_once()
@patch(
"app.web.routers.sheet.get_sheet_access_error",
return_value=ERROR_MSG,
)
def test_archive_sheet_no_write_access(
self, m_access, app_with_auth, db_session
):
"""Manual trigger is blocked when the SA has no write access."""
db_session.add(
models.Sheet(
id="123-sheet-id",
name="Test Sheet 1",
author_id="rick@example.com",
group_id="spaceship",
frequency="hourly",
)
)
db_session.commit()
client = TestClient(app_with_auth)
r = client.post("/sheet/123-sheet-id/archive")
assert r.status_code == HTTPStatus.FORBIDDEN
assert "service account" in r.json()["detail"]
assert "Editor" in r.json()["detail"]
m_access.assert_called_once()
@patch("app.web.routers.sheet.celery", return_value=MagicMock())
@patch(
"app.web.routers.sheet.get_sheet_access_error",
return_value=None,
)
def test_archive_sheet_access_ok_proceeds(
self, m_access, m_celery, app_with_auth, db_session
):
"""Manual trigger proceeds when access check passes."""
db_session.add(
models.Sheet(
id="123-sheet-id",
name="Test Sheet 1",
author_id="rick@example.com",
group_id="spaceship",
frequency="hourly",
)
)
db_session.commit()
m_signature = MagicMock()
m_signature.apply_async.return_value = TaskResult(
id="task-123", status=STATUS_PENDING, result=""
)
m_celery.signature.return_value = m_signature
client = TestClient(app_with_auth)
r = client.post("/sheet/123-sheet-id/archive")
assert r.status_code == HTTPStatus.CREATED
m_access.assert_called_once()
m_celery.signature.assert_called_once()
@patch(
"app.web.routers.sheet.get_sheet_access_error",
return_value=ERROR_MSG,
)
def test_token_archive_sheet_no_write_access(
self, m_access, app_with_token, db_session
):
"""API token trigger is also blocked when SA has no write access."""
db_session.add(
models.Sheet(
id="token-sheet-id",
name="Token Sheet",
author_id="rick@example.com",
group_id="spaceship",
frequency="hourly",
)
)
db_session.commit()
client = TestClient(app_with_token)
r = client.post("/sheet/token-sheet-id/archive")
assert r.status_code == HTTPStatus.FORBIDDEN
assert "service account" in r.json()["detail"]

View File

@@ -20,6 +20,7 @@ from app.shared.db.database import (
from app.shared.log import logger
from app.shared.settings import get_settings
from app.shared.task_messaging import get_celery
from app.shared.utils.sheets import get_sheet_access_error
from app.web.db import crud
from app.web.middleware import increase_exceptions_counter
from app.web.utils.metrics import (
@@ -30,6 +31,11 @@ from app.web.utils.metrics import (
celery = get_celery()
# Throttle cache: track when each sheet was last notified about missing
# permissions so users are not spammed on every cron cycle.
_sheet_no_access_notified: dict[str, datetime.datetime] = {}
_NOTIFY_COOLDOWN = datetime.timedelta(hours=24)
@asynccontextmanager
async def lifespan(app: FastAPI):
@@ -113,12 +119,31 @@ async def archive_sheets_cronjob(
frequency: str, interval: int, current_time_unit: int
):
triggered_jobs = []
no_access_sheets: dict[str, list[tuple]] = defaultdict(list)
async with get_db_async() as db:
sheets = await crud.get_sheets_by_id_hash(
db, frequency, str(interval), current_time_unit
)
for s in sheets:
# Check if service account has write access to the sheet
group = await db.get(models.Group, s.group_id)
if group and group.orchestrator_sheet:
access_error = get_sheet_access_error(
group.orchestrator_sheet,
group.service_account_email,
s.id,
)
if access_error:
no_access_sheets[s.author_id].append(
(s, group.service_account_email or "")
)
logger.warning(
f"[CRON] Skipping sheet {s.id}: not shared with "
f"service account {group.service_account_email}"
)
continue
group_queue = await crud.get_group_priority_async(db, s.group_id)
task = celery.signature(
"create_sheet_task",
@@ -132,11 +157,75 @@ async def archive_sheets_cronjob(
).apply_async(**group_queue)
triggered_jobs.append({"sheet_id": s.id, "task_id": task.id})
if no_access_sheets:
await _notify_sheet_permission_issues(no_access_sheets)
logger.debug(
f"[CRON {frequency.upper()}:{current_time_unit}] Triggered {len(triggered_jobs)} sheet tasks: {triggered_jobs}"
)
async def _notify_sheet_permission_issues(
no_access_sheets: dict[str, list[tuple]],
):
"""
Send email notifications to users whose sheets are not shared with the
Auto Archiver service account. Throttled to at most one email per sheet
per 24 hours to avoid spamming.
"""
now = datetime.datetime.now()
fastmail = FastMail(get_settings().mail_config)
for email, sheet_infos in no_access_sheets.items():
# Filter to sheets that haven't been notified recently
sheets_to_notify = []
for s, sa_email in sheet_infos:
last = _sheet_no_access_notified.get(s.id)
if not last or (now - last) >= _NOTIFY_COOLDOWN:
sheets_to_notify.append((s, sa_email))
_sheet_no_access_notified[s.id] = now
if not sheets_to_notify:
continue
list_of_sheets = "\n".join(
[
f'<li><a href="https://docs.google.com/spreadsheets/d/{s.id}">'
f"{s.name}</a> &mdash; share with <code>{sa_email}</code></li>"
for s, sa_email in sheets_to_notify
]
)
message = MessageSchema(
subject="Auto Archiver: Sheet Permission Issue",
recipients=[email],
body=f"""
<html>
<body>
<p>Hi {email},</p>
<p>The following sheets could not be archived because they have
not been shared with the Auto Archiver service account with
<strong>Editor</strong> permissions:</p>
<ul>
{list_of_sheets}
</ul>
<p>Please open each sheet, click <em>Share</em>, and add the
service account email shown above as an <strong>Editor</strong>.
The sheets will be archived automatically on the next scheduled
run once access is granted.</p>
<p>Best,<br>The Auto Archiver team</p>
</body>
</html>
""",
subtype=MessageType.html,
)
await fastmail.send_message(message)
logger.debug(
f"[CRON] Email sent to {email} about {len(sheets_to_notify)} "
f"sheet(s) with permission issues."
)
# TODO: on exception should logerror but also prometheus counter
DELETE_WINDOW = (
get_settings().DELETE_SCHEDULED_ARCHIVES_CHECK_EVERY_N_DAYS * 24 * 60 * 60

View File

@@ -5,6 +5,7 @@ from fastapi.responses import JSONResponse
from sqlalchemy import exc
from sqlalchemy.orm import Session
from app.shared.db import models
from app.shared.db.database import get_db_dependency
from app.shared.schemas import (
DeleteResponse,
@@ -13,6 +14,7 @@ from app.shared.schemas import (
SubmitSheet,
)
from app.shared.task_messaging import get_celery
from app.shared.utils.sheets import get_sheet_access_error
from app.web.config import ALLOW_ANY_EMAIL
from app.web.db import crud
from app.web.db.user_state import UserState
@@ -53,6 +55,22 @@ def create_sheet(
detail="Invalid frequency selected for this group.",
)
# Check if the service account has write access to the Google Sheet
group = (
db.query(models.Group).filter(models.Group.id == sheet.group_id).first()
)
if group:
access_error = get_sheet_access_error(
group.orchestrator_sheet,
group.service_account_email,
sheet.id,
)
if access_error:
raise HTTPException(
status_code=HTTPStatus.FORBIDDEN,
detail=access_error,
)
try:
return crud.create_sheet(
db,
@@ -138,6 +156,22 @@ def archive_user_sheet(
group_queue = user.priority_group(sheet.group_id)
author_id = user.email
# Check if the service account has write access to the Google Sheet
group = (
db.query(models.Group).filter(models.Group.id == sheet.group_id).first()
)
if group:
access_error = get_sheet_access_error(
group.orchestrator_sheet,
group.service_account_email,
sheet_id,
)
if access_error:
raise HTTPException(
status_code=HTTPStatus.FORBIDDEN,
detail=access_error,
)
task = celery.signature(
"create_sheet_task",
args=[

View File

@@ -13,6 +13,7 @@ from app.shared.log import log_error
from app.shared.settings import get_settings
from app.shared.task_messaging import get_celery, get_redis
from app.shared.utils.misc import get_all_urls
from app.shared.utils.sheets import get_sheet_access_error
from app.worker.worker_log import logger, setup_celery_logger
@@ -98,6 +99,30 @@ def create_sheet_task(self, sheet_json: str):
)
logger.info(f"[queue={queue_name}] SHEET START {sheet=}")
# Early check: does the service account have write access to the sheet?
with get_db() as session:
group = worker_crud.get_group(session, sheet.group_id)
if group and group.orchestrator_sheet:
access_error = get_sheet_access_error(
group.orchestrator_sheet,
group.service_account_email,
sheet.sheet_id,
)
if access_error:
logger.warning(
f"SHEET SKIPPED {sheet.sheet_id}: {access_error}"
)
return schemas.CelerySheetTask(
success=False,
sheet_id=sheet.sheet_id,
time=datetime.datetime.now().isoformat(),
stats={
"archived": 0,
"failed": 0,
"errors": [access_error],
},
).model_dump()
args = get_orchestrator_args(
sheet.group_id, True, [constants.SHEET_ID, sheet.sheet_id]
)