diff --git a/app/shared/utils/sheets.py b/app/shared/utils/sheets.py new file mode 100644 index 0000000..d2b23a9 --- /dev/null +++ b/app/shared/utils/sheets.py @@ -0,0 +1,133 @@ +"""Utilities for checking Google Sheet access permissions.""" + +from functools import lru_cache + +import requests as http_requests +import yaml + +from app.shared.log import logger + + +@lru_cache(maxsize=32) +def get_service_account_json_path(orchestrator_sheet_path: str) -> str | None: + """ + Extract the service account JSON file path from an orchestrator sheet + YAML config. + + Returns: + Path to the service account JSON file, or None if not found. + """ + if not orchestrator_sheet_path: + return None + + try: + with open(orchestrator_sheet_path) as f: + orch = yaml.safe_load(f) + except Exception as e: + logger.warning( + f"Could not read orchestrator sheet config {orchestrator_sheet_path}: {e}" + ) + return None + + if not isinstance(orch, dict): + return None + + def find_key(d: dict, key: str): + for k, v in d.items(): + if k == key: + return v + if isinstance(v, dict): + if result := find_key(v, key): + return result + return None + + return find_key(orch, "service_account") + + +def check_sheet_write_access( + service_account_json_path: str, sheet_id: str +) -> bool | None: + """ + Check if a Google service account has write (Editor) access to a Google + Sheet using the Google Drive API. + + Returns: + True: Service account has write access. + False: Service account does NOT have write access (or no access). + None: Could not determine (network/auth error) — caller should + proceed with archiving and let it fail naturally. + """ + try: + from google.auth.transport.requests import Request + from google.oauth2.service_account import Credentials + + creds = Credentials.from_service_account_file( + service_account_json_path, + scopes=["https://www.googleapis.com/auth/drive.metadata.readonly"], + ) + creds.refresh(Request()) + + resp = http_requests.get( + f"https://www.googleapis.com/drive/v3/files/{sheet_id}", + params={ + "fields": "capabilities/canEdit", + "supportsAllDrives": "true", + }, + headers={"Authorization": f"Bearer {creds.token}"}, + timeout=10, + ) + + if resp.status_code == 404: + return False + if resp.status_code == 403: + return False + if resp.status_code == 200: + return resp.json().get("capabilities", {}).get("canEdit", False) + + logger.warning( + f"Unexpected Drive API response {resp.status_code} for sheet {sheet_id}: {resp.text}" + ) + return None + + except FileNotFoundError: + logger.error( + f"Service account JSON not found: {service_account_json_path}" + ) + return None + except Exception as e: + logger.warning( + f"Could not check write access for sheet {sheet_id}: {e}" + ) + return None + + +def get_sheet_access_error( + orchestrator_sheet_path: str | None, + service_account_email: str | None, + sheet_id: str, +) -> str | None: + """ + Check if the service account has write access to a Google Sheet. + + Returns: + An error message string if the sheet is NOT accessible, or None if + access is OK (or the check is indeterminate). + """ + if not orchestrator_sheet_path: + return None + + sa_json_path = get_service_account_json_path(orchestrator_sheet_path) + if not sa_json_path: + return None + + has_access = check_sheet_write_access(sa_json_path, sheet_id) + if has_access is False: + sa_display = ( + service_account_email or "the Auto Archiver service account" + ) + return ( + f"The Google Sheet has not been shared with the Auto Archiver " + f"service account ({sa_display}). Please share the sheet with " + f"this email address and give it Editor permissions." + ) + return None diff --git a/app/tests/shared/utils/test_sheets.py b/app/tests/shared/utils/test_sheets.py new file mode 100644 index 0000000..ceb9160 --- /dev/null +++ b/app/tests/shared/utils/test_sheets.py @@ -0,0 +1,171 @@ +from unittest.mock import mock_open, patch + +from app.shared.utils.sheets import ( + check_sheet_write_access, + get_service_account_json_path, + get_sheet_access_error, +) + + +class TestGetServiceAccountJsonPath: + def test_returns_none_for_empty_path(self): + assert get_service_account_json_path("") is None + assert get_service_account_json_path(None) is None + + def test_returns_path_from_orchestrator_yaml(self): + # The test orchestration file has a service_account key + get_service_account_json_path.cache_clear() + result = get_service_account_json_path( + "app/tests/orchestration.test.yaml" + ) + assert result == "app/tests/fake_service_account.json" + + def test_returns_none_for_missing_file(self): + get_service_account_json_path.cache_clear() + assert get_service_account_json_path("nonexistent/path.yaml") is None + + def test_returns_none_for_invalid_yaml(self): + get_service_account_json_path.cache_clear() + with patch( + "builtins.open", mock_open(read_data="!!! invalid yaml {{{") + ): + result = get_service_account_json_path("some/path.yaml") + # yaml.safe_load may return a string for some invalid inputs + # The function should not crash + assert result is None or isinstance(result, str) + + def test_returns_none_when_no_service_account_key(self): + get_service_account_json_path.cache_clear() + yaml_content = "steps:\n feeders:\n - cli_feeder\n" + with patch("builtins.open", mock_open(read_data=yaml_content)): + assert get_service_account_json_path("some/path.yaml") is None + + def test_finds_nested_service_account_key(self): + get_service_account_json_path.cache_clear() + yaml_content = ( + "configurations:\n" + " gsheet_feeder_db:\n" + " service_account: secrets/nested_sa.json\n" + ) + with patch("builtins.open", mock_open(read_data=yaml_content)): + result = get_service_account_json_path("some/path.yaml") + assert result == "secrets/nested_sa.json" + + +class TestCheckSheetWriteAccess: + @patch("app.shared.utils.sheets.http_requests.get") + @patch( + "google.oauth2.service_account.Credentials.from_service_account_file" + ) + def test_returns_true_when_can_edit(self, m_creds, m_get): + m_creds.return_value.token = "fake-token" + m_get.return_value.status_code = 200 + m_get.return_value.json.return_value = { + "capabilities": {"canEdit": True} + } + + result = check_sheet_write_access("sa.json", "sheet123") + assert result is True + m_get.assert_called_once() + + @patch("app.shared.utils.sheets.http_requests.get") + @patch( + "google.oauth2.service_account.Credentials.from_service_account_file" + ) + def test_returns_false_when_cannot_edit(self, m_creds, m_get): + m_creds.return_value.token = "fake-token" + m_get.return_value.status_code = 200 + m_get.return_value.json.return_value = { + "capabilities": {"canEdit": False} + } + + result = check_sheet_write_access("sa.json", "sheet123") + assert result is False + + @patch("app.shared.utils.sheets.http_requests.get") + @patch( + "google.oauth2.service_account.Credentials.from_service_account_file" + ) + def test_returns_false_on_404(self, m_creds, m_get): + m_creds.return_value.token = "fake-token" + m_get.return_value.status_code = 404 + + result = check_sheet_write_access("sa.json", "sheet123") + assert result is False + + @patch("app.shared.utils.sheets.http_requests.get") + @patch( + "google.oauth2.service_account.Credentials.from_service_account_file" + ) + def test_returns_false_on_403(self, m_creds, m_get): + m_creds.return_value.token = "fake-token" + m_get.return_value.status_code = 403 + + result = check_sheet_write_access("sa.json", "sheet123") + assert result is False + + @patch("app.shared.utils.sheets.http_requests.get") + @patch( + "google.oauth2.service_account.Credentials.from_service_account_file" + ) + def test_returns_none_on_unexpected_status(self, m_creds, m_get): + m_creds.return_value.token = "fake-token" + m_get.return_value.status_code = 500 + m_get.return_value.text = "Internal Server Error" + + result = check_sheet_write_access("sa.json", "sheet123") + assert result is None + + def test_returns_none_when_file_not_found(self): + result = check_sheet_write_access("nonexistent/sa.json", "sheet123") + assert result is None + + @patch( + "google.oauth2.service_account.Credentials.from_service_account_file", + side_effect=Exception("auth failed"), + ) + def test_returns_none_on_auth_error(self, m_creds): + result = check_sheet_write_access("sa.json", "sheet123") + assert result is None + + +class TestGetSheetAccessError: + @patch("app.shared.utils.sheets.check_sheet_write_access") + @patch("app.shared.utils.sheets.get_service_account_json_path") + def test_returns_none_when_access_ok(self, m_get_path, m_check): + m_get_path.return_value = "sa.json" + m_check.return_value = True + + result = get_sheet_access_error("orch.yaml", "sa@test.com", "sheet1") + assert result is None + + @patch("app.shared.utils.sheets.check_sheet_write_access") + @patch("app.shared.utils.sheets.get_service_account_json_path") + def test_returns_error_when_no_access(self, m_get_path, m_check): + m_get_path.return_value = "sa.json" + m_check.return_value = False + + result = get_sheet_access_error("orch.yaml", "sa@test.com", "sheet1") + assert result is not None + assert "sa@test.com" in result + assert "Editor" in result + + @patch("app.shared.utils.sheets.check_sheet_write_access") + @patch("app.shared.utils.sheets.get_service_account_json_path") + def test_returns_none_when_indeterminate(self, m_get_path, m_check): + m_get_path.return_value = "sa.json" + m_check.return_value = None + + result = get_sheet_access_error("orch.yaml", "sa@test.com", "sheet1") + assert result is None + + def test_returns_none_when_no_orchestrator_path(self): + assert get_sheet_access_error(None, "sa@test.com", "sheet1") is None + assert get_sheet_access_error("", "sa@test.com", "sheet1") is None + + @patch("app.shared.utils.sheets.get_service_account_json_path") + def test_returns_none_when_no_sa_json_path(self, m_get_path): + m_get_path.return_value = None + + result = get_sheet_access_error("orch.yaml", "sa@test.com", "sheet1") + assert result is None diff --git a/app/tests/web/routers/test_sheet.py b/app/tests/web/routers/test_sheet.py index 2885598..37e8694 100644 --- a/app/tests/web/routers/test_sheet.py +++ b/app/tests/web/routers/test_sheet.py @@ -344,3 +344,133 @@ class TestArchiveUserSheetEndpoint: assert r.json() == { "detail": "User cannot manually trigger sheet archiving in this group." } + + +class TestSheetAccessPermissionCheck: + """Tests for the Google Sheet write access permission check.""" + + ERROR_MSG = ( + "The Google Sheet has not been shared with the Auto Archiver " + "service account (sa@test.iam.gserviceaccount.com). Please " + "share the sheet with this email address and give it Editor " + "permissions." + ) + + @patch( + "app.web.routers.sheet.get_sheet_access_error", + return_value=ERROR_MSG, + ) + def test_create_sheet_no_write_access( + self, m_access, app_with_auth, db_session + ): + """Sheet creation is blocked when the SA has no write access.""" + client = TestClient(app_with_auth) + data = { + "id": "no-access-sheet", + "name": "Test Sheet", + "group_id": "spaceship", + "frequency": "daily", + } + r = client.post("/sheet/create", json=data) + assert r.status_code == HTTPStatus.FORBIDDEN + assert "service account" in r.json()["detail"] + m_access.assert_called_once() + + @patch( + "app.web.routers.sheet.get_sheet_access_error", + return_value=None, + ) + def test_create_sheet_access_indeterminate_proceeds( + self, m_access, app_with_auth, db_session + ): + """Sheet creation proceeds when access check is indeterminate.""" + client = TestClient(app_with_auth) + data = { + "id": "maybe-access-sheet", + "name": "Test Sheet", + "group_id": "spaceship", + "frequency": "daily", + } + r = client.post("/sheet/create", json=data) + assert r.status_code == HTTPStatus.CREATED + m_access.assert_called_once() + + @patch( + "app.web.routers.sheet.get_sheet_access_error", + return_value=ERROR_MSG, + ) + def test_archive_sheet_no_write_access( + self, m_access, app_with_auth, db_session + ): + """Manual trigger is blocked when the SA has no write access.""" + db_session.add( + models.Sheet( + id="123-sheet-id", + name="Test Sheet 1", + author_id="rick@example.com", + group_id="spaceship", + frequency="hourly", + ) + ) + db_session.commit() + client = TestClient(app_with_auth) + r = client.post("/sheet/123-sheet-id/archive") + assert r.status_code == HTTPStatus.FORBIDDEN + assert "service account" in r.json()["detail"] + assert "Editor" in r.json()["detail"] + m_access.assert_called_once() + + @patch("app.web.routers.sheet.celery", return_value=MagicMock()) + @patch( + "app.web.routers.sheet.get_sheet_access_error", + return_value=None, + ) + def test_archive_sheet_access_ok_proceeds( + self, m_access, m_celery, app_with_auth, db_session + ): + """Manual trigger proceeds when access check passes.""" + db_session.add( + models.Sheet( + id="123-sheet-id", + name="Test Sheet 1", + author_id="rick@example.com", + group_id="spaceship", + frequency="hourly", + ) + ) + db_session.commit() + + m_signature = MagicMock() + m_signature.apply_async.return_value = TaskResult( + id="task-123", status=STATUS_PENDING, result="" + ) + m_celery.signature.return_value = m_signature + + client = TestClient(app_with_auth) + r = client.post("/sheet/123-sheet-id/archive") + assert r.status_code == HTTPStatus.CREATED + m_access.assert_called_once() + m_celery.signature.assert_called_once() + + @patch( + "app.web.routers.sheet.get_sheet_access_error", + return_value=ERROR_MSG, + ) + def test_token_archive_sheet_no_write_access( + self, m_access, app_with_token, db_session + ): + """API token trigger is also blocked when SA has no write access.""" + db_session.add( + models.Sheet( + id="token-sheet-id", + name="Token Sheet", + author_id="rick@example.com", + group_id="spaceship", + frequency="hourly", + ) + ) + db_session.commit() + client = TestClient(app_with_token) + r = client.post("/sheet/token-sheet-id/archive") + assert r.status_code == HTTPStatus.FORBIDDEN + assert "service account" in r.json()["detail"] diff --git a/app/web/events.py b/app/web/events.py index 8e07c61..9f9c0b3 100644 --- a/app/web/events.py +++ b/app/web/events.py @@ -20,6 +20,7 @@ from app.shared.db.database import ( from app.shared.log import logger from app.shared.settings import get_settings from app.shared.task_messaging import get_celery +from app.shared.utils.sheets import get_sheet_access_error from app.web.db import crud from app.web.middleware import increase_exceptions_counter from app.web.utils.metrics import ( @@ -30,6 +31,11 @@ from app.web.utils.metrics import ( celery = get_celery() +# Throttle cache: track when each sheet was last notified about missing +# permissions so users are not spammed on every cron cycle. +_sheet_no_access_notified: dict[str, datetime.datetime] = {} +_NOTIFY_COOLDOWN = datetime.timedelta(hours=24) + @asynccontextmanager async def lifespan(app: FastAPI): @@ -113,12 +119,31 @@ async def archive_sheets_cronjob( frequency: str, interval: int, current_time_unit: int ): triggered_jobs = [] + no_access_sheets: dict[str, list[tuple]] = defaultdict(list) async with get_db_async() as db: sheets = await crud.get_sheets_by_id_hash( db, frequency, str(interval), current_time_unit ) for s in sheets: + # Check if service account has write access to the sheet + group = await db.get(models.Group, s.group_id) + if group and group.orchestrator_sheet: + access_error = get_sheet_access_error( + group.orchestrator_sheet, + group.service_account_email, + s.id, + ) + if access_error: + no_access_sheets[s.author_id].append( + (s, group.service_account_email or "") + ) + logger.warning( + f"[CRON] Skipping sheet {s.id}: not shared with " + f"service account {group.service_account_email}" + ) + continue + group_queue = await crud.get_group_priority_async(db, s.group_id) task = celery.signature( "create_sheet_task", @@ -132,11 +157,75 @@ async def archive_sheets_cronjob( ).apply_async(**group_queue) triggered_jobs.append({"sheet_id": s.id, "task_id": task.id}) + + if no_access_sheets: + await _notify_sheet_permission_issues(no_access_sheets) + logger.debug( f"[CRON {frequency.upper()}:{current_time_unit}] Triggered {len(triggered_jobs)} sheet tasks: {triggered_jobs}" ) +async def _notify_sheet_permission_issues( + no_access_sheets: dict[str, list[tuple]], +): + """ + Send email notifications to users whose sheets are not shared with the + Auto Archiver service account. Throttled to at most one email per sheet + per 24 hours to avoid spamming. + """ + now = datetime.datetime.now() + fastmail = FastMail(get_settings().mail_config) + + for email, sheet_infos in no_access_sheets.items(): + # Filter to sheets that haven't been notified recently + sheets_to_notify = [] + for s, sa_email in sheet_infos: + last = _sheet_no_access_notified.get(s.id) + if not last or (now - last) >= _NOTIFY_COOLDOWN: + sheets_to_notify.append((s, sa_email)) + _sheet_no_access_notified[s.id] = now + + if not sheets_to_notify: + continue + + list_of_sheets = "\n".join( + [ + f'
  • ' + f"{s.name} — share with {sa_email}
  • " + for s, sa_email in sheets_to_notify + ] + ) + message = MessageSchema( + subject="Auto Archiver: Sheet Permission Issue", + recipients=[email], + body=f""" + + +

    Hi {email},

    +

    The following sheets could not be archived because they have + not been shared with the Auto Archiver service account with + Editor permissions:

    + +

    Please open each sheet, click Share, and add the + service account email shown above as an Editor. + The sheets will be archived automatically on the next scheduled + run once access is granted.

    +

    Best,
    The Auto Archiver team

    + + + """, + subtype=MessageType.html, + ) + await fastmail.send_message(message) + logger.debug( + f"[CRON] Email sent to {email} about {len(sheets_to_notify)} " + f"sheet(s) with permission issues." + ) + + # TODO: on exception should logerror but also prometheus counter DELETE_WINDOW = ( get_settings().DELETE_SCHEDULED_ARCHIVES_CHECK_EVERY_N_DAYS * 24 * 60 * 60 diff --git a/app/web/routers/sheet.py b/app/web/routers/sheet.py index 9bac061..03c4a29 100644 --- a/app/web/routers/sheet.py +++ b/app/web/routers/sheet.py @@ -5,6 +5,7 @@ from fastapi.responses import JSONResponse from sqlalchemy import exc from sqlalchemy.orm import Session +from app.shared.db import models from app.shared.db.database import get_db_dependency from app.shared.schemas import ( DeleteResponse, @@ -13,6 +14,7 @@ from app.shared.schemas import ( SubmitSheet, ) from app.shared.task_messaging import get_celery +from app.shared.utils.sheets import get_sheet_access_error from app.web.config import ALLOW_ANY_EMAIL from app.web.db import crud from app.web.db.user_state import UserState @@ -53,6 +55,22 @@ def create_sheet( detail="Invalid frequency selected for this group.", ) + # Check if the service account has write access to the Google Sheet + group = ( + db.query(models.Group).filter(models.Group.id == sheet.group_id).first() + ) + if group: + access_error = get_sheet_access_error( + group.orchestrator_sheet, + group.service_account_email, + sheet.id, + ) + if access_error: + raise HTTPException( + status_code=HTTPStatus.FORBIDDEN, + detail=access_error, + ) + try: return crud.create_sheet( db, @@ -138,6 +156,22 @@ def archive_user_sheet( group_queue = user.priority_group(sheet.group_id) author_id = user.email + # Check if the service account has write access to the Google Sheet + group = ( + db.query(models.Group).filter(models.Group.id == sheet.group_id).first() + ) + if group: + access_error = get_sheet_access_error( + group.orchestrator_sheet, + group.service_account_email, + sheet_id, + ) + if access_error: + raise HTTPException( + status_code=HTTPStatus.FORBIDDEN, + detail=access_error, + ) + task = celery.signature( "create_sheet_task", args=[ diff --git a/app/worker/main.py b/app/worker/main.py index 36569ca..11671c1 100644 --- a/app/worker/main.py +++ b/app/worker/main.py @@ -13,6 +13,7 @@ from app.shared.log import log_error from app.shared.settings import get_settings from app.shared.task_messaging import get_celery, get_redis from app.shared.utils.misc import get_all_urls +from app.shared.utils.sheets import get_sheet_access_error from app.worker.worker_log import logger, setup_celery_logger @@ -98,6 +99,30 @@ def create_sheet_task(self, sheet_json: str): ) logger.info(f"[queue={queue_name}] SHEET START {sheet=}") + # Early check: does the service account have write access to the sheet? + with get_db() as session: + group = worker_crud.get_group(session, sheet.group_id) + if group and group.orchestrator_sheet: + access_error = get_sheet_access_error( + group.orchestrator_sheet, + group.service_account_email, + sheet.sheet_id, + ) + if access_error: + logger.warning( + f"SHEET SKIPPED {sheet.sheet_id}: {access_error}" + ) + return schemas.CelerySheetTask( + success=False, + sheet_id=sheet.sheet_id, + time=datetime.datetime.now().isoformat(), + stats={ + "archived": 0, + "failed": 0, + "errors": [access_error], + }, + ).model_dump() + args = get_orchestrator_args( sheet.group_id, True, [constants.SHEET_ID, sheet.sheet_id] )