diff --git a/.env.alembic b/.env.alembic index 8691557..da29332 100644 --- a/.env.alembic +++ b/.env.alembic @@ -1,5 +1,5 @@ CHROME_APP_IDS='["1234567890"]' ALLOWED_ORIGINS='["allowed"]' BLOCKED_EMAILS='[]' -DATABASE_PATH="sqlite:///./database/auto-archiver.db" +DATABASE_PATH="sqlite:///./app/database/auto-archiver.db" API_BEARER_TOKEN=THIS_API_TOKEN_SHOULD_NEVER_BE_USED \ No newline at end of file diff --git a/app/migrations/versions/63ac79df4ad0_add_new_service_account_email_column_to_.py b/app/migrations/versions/63ac79df4ad0_add_new_service_account_email_column_to_.py new file mode 100644 index 0000000..7067746 --- /dev/null +++ b/app/migrations/versions/63ac79df4ad0_add_new_service_account_email_column_to_.py @@ -0,0 +1,36 @@ +"""add new service_account_email column to groups table + +Revision ID: 63ac79df4ad0 +Revises: 02b2f6d17ed0 +Create Date: 2025-02-11 21:53:23.293274 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = '63ac79df4ad0' +down_revision = '02b2f6d17ed0' +branch_labels = None +depends_on = None + +NEW_COL = "service_account_email" +TABLE = "groups" + + +def upgrade() -> None: + conn = op.get_bind() + inspector = sa.inspect(conn) + columns = [col['name'] for col in inspector.get_columns(TABLE)] + + if NEW_COL not in columns: + op.add_column(TABLE, sa.Column(NEW_COL, sa.String, nullable=True, default=None)) + + +def downgrade() -> None: + conn = op.get_bind() + inspector = sa.inspect(conn) + columns = [col['name'] for col in inspector.get_columns(TABLE)] + if NEW_COL in columns: + op.drop_column(TABLE, NEW_COL) diff --git a/app/shared/db/models.py b/app/shared/db/models.py index 5447531..0e12c7b 100644 --- a/app/shared/db/models.py +++ b/app/shared/db/models.py @@ -87,6 +87,7 @@ class Group(Base): orchestrator = Column(String, default=None) orchestrator_sheet = Column(String, default=None) permissions = Column(JSON, default={}) + service_account_email = Column(String, default=None) domains = Column(JSON, default=[]) archives = relationship("Archive", back_populates="group") diff --git a/app/shared/settings.py b/app/shared/settings.py index 427de7e..039f4fd 100644 --- a/app/shared/settings.py +++ b/app/shared/settings.py @@ -33,11 +33,12 @@ class Settings(BaseSettings): # redis REDIS_PASSWORD: str = "" + REDIS_HOSTNAME: str = "localhost" @property def CELERY_BROKER_URL(self)-> str: if self.REDIS_PASSWORD: - return f"redis://:{self.REDIS_PASSWORD}@localhost:6379" - return "redis://localhost:6379" + return f"redis://:{self.REDIS_PASSWORD}@{self.REDIS_HOSTNAME}:6379" + return f"redis://{self.REDIS_HOSTNAME}:6379" REDIS_EXCEPTIONS_CHANNEL: str = "exceptions-channel" # observability diff --git a/app/shared/task_messaging.py b/app/shared/task_messaging.py index 6f57352..52dcba3 100644 --- a/app/shared/task_messaging.py +++ b/app/shared/task_messaging.py @@ -15,6 +15,4 @@ def get_celery(name:str="") -> Celery: def get_redis() -> redis.Redis: - from loguru import logger - logger.debug(get_settings().CELERY_BROKER_URL) return redis.Redis.from_url(get_settings().CELERY_BROKER_URL) diff --git a/app/shared/user_groups.py b/app/shared/user_groups.py index 8647ed9..592e012 100644 --- a/app/shared/user_groups.py +++ b/app/shared/user_groups.py @@ -1,7 +1,8 @@ +import json import os import yaml from loguru import logger -from pydantic import BaseModel, field_validator, Field, model_validator +from pydantic import BaseModel, computed_field, field_validator, Field, model_validator from typing import Dict, List, Set from typing_extensions import Self @@ -74,11 +75,39 @@ class GroupModel(BaseModel): permissions: GroupPermissions @field_validator('orchestrator', 'orchestrator_sheet', mode='before') - def validate_priority(cls, v): + def validate_orchestrator(cls, v): if not os.path.exists(v): raise ValueError(f"Orchestrator file not found with this path: {v}") return v + @computed_field + @property + def service_account_email(self) -> str: + if hasattr(self, "_service_account_email"): + return self._service_account_email + orch = yaml.safe_load(open(self.orchestrator_sheet)) + + def find_service_account_email(d): + for k, v in d.items(): + if k == "service_account": + return v + if isinstance(v, dict): + if result := find_service_account_email(v): + return result + return False + + service_account_json = find_service_account_email(orch) + if not service_account_json: + raise ValueError(f"service_account key not found in orchestrator sheet file: {self.orchestrator_sheet}.") + + with open(service_account_json) as f: + self._service_account_email = json.load(f).get("client_email") + + if not self._service_account_email: + raise ValueError(f"Service account email not found in {service_account_json}.") + + return self._service_account_email + class UserGroupModel(BaseModel): users: Dict[str, List[str]] = Field(default_factory=dict) @@ -137,4 +166,4 @@ class UserGroupModel(BaseModel): class GroupInfo(GroupPermissions): description: str = "" - service_account_emails: list[str] = [] + service_account_email: str = "" diff --git a/app/tests/fake_service_account.json b/app/tests/fake_service_account.json new file mode 100644 index 0000000..3d41bd9 --- /dev/null +++ b/app/tests/fake_service_account.json @@ -0,0 +1,3 @@ +{ + "client_email": "fake_service_account@fake_service_account.iam.gserviceaccount.com" +} \ No newline at end of file diff --git a/app/tests/orchestration.test.yaml b/app/tests/orchestration.test.yaml index cb79ea9..4ee1880 100644 --- a/app/tests/orchestration.test.yaml +++ b/app/tests/orchestration.test.yaml @@ -12,6 +12,8 @@ steps: - console_db configurations: + gsheet_feeder: + service_account: "app/tests/fake_service_account.json" cli_feeder: urls: - "url1" diff --git a/app/tests/web/db/test_crud.py b/app/tests/web/db/test_crud.py index b49fefd..dbe3dec 100644 --- a/app/tests/web/db/test_crud.py +++ b/app/tests/web/db/test_crud.py @@ -277,13 +277,14 @@ def test_upsert_group(test_data, db_session): assert db_session.query(models.Group).count() == 4 - repeatable_params = ["desc 1", "orch.yaml", "sheet.yaml", {"read": ["all"]}, ["example.com"]] + repeatable_params = ["desc 1", "orch.yaml", "sheet.yaml", "service_account_email@example.com", {"read": ["all"]}, ["example.com"]] assert (g1 := crud.upsert_group(db_session, "spaceship", *repeatable_params)) is not None assert g1.id == "spaceship" assert g1.description == "desc 1" assert g1.orchestrator == "orch.yaml" assert g1.orchestrator_sheet == "sheet.yaml" + assert g1.service_account_email == "service_account_email@example.com" assert g1.permissions == {"read": ["all"]} assert g1.domains == ["example.com"] assert len(g1.users) == 2 diff --git a/app/web/db/crud.py b/app/web/db/crud.py index c5ea771..5e1f976 100644 --- a/app/web/db/crud.py +++ b/app/web/db/crud.py @@ -129,15 +129,16 @@ def get_user_groups(email: str) -> list[str]: # --------------- INIT User-Groups -def upsert_group(db: Session, group_name: str, description: str, orchestrator: str, orchestrator_sheet: str, permissions: dict, domains: list) -> models.Group: +def upsert_group(db: Session, group_name: str, description: str, orchestrator: str, orchestrator_sheet: str, service_account_email:str, permissions: dict, domains: list) -> models.Group: db_group = db.query(models.Group).filter(models.Group.id == group_name).first() if db_group is None: - db_group = models.Group(id=group_name, description=description, orchestrator=orchestrator, orchestrator_sheet=orchestrator_sheet, permissions=permissions, domains=domains) + db_group = models.Group(id=group_name, description=description, orchestrator=orchestrator, orchestrator_sheet=orchestrator_sheet, service_account_email=service_account_email, permissions=permissions, domains=domains) db.add(db_group) else: db_group.description = description db_group.orchestrator = orchestrator db_group.orchestrator_sheet = orchestrator_sheet + db_group.service_account_email = service_account_email db_group.permissions = permissions db_group.domains = domains db.commit() @@ -180,7 +181,8 @@ def upsert_user_groups(db: Session): import json # upsert groups and save a map of groupid -> dbobject for group_id, g in ug.groups.items(): - upsert_group(db, group_id, g.description, g.orchestrator, g.orchestrator_sheet, json.loads(g.permissions.model_dump_json()), list(group_domains.get(group_id, []))) + logger.debug(f"GROUP {group_id} => {g.service_account_email}") + upsert_group(db, group_id, g.description, g.orchestrator, g.orchestrator_sheet, g.service_account_email, json.loads(g.permissions.model_dump_json()), list(group_domains.get(group_id, []))) db_groups: dict[str, models.Group] = {g.id: g for g in db.query(models.Group).all()} # integrity checks diff --git a/app/web/db/user_state.py b/app/web/db/user_state.py index dc7b112..a97df37 100644 --- a/app/web/db/user_state.py +++ b/app/web/db/user_state.py @@ -39,7 +39,7 @@ class UserState: ) for group in self.user_groups: if not group.permissions: continue - self._permissions[group.id] = GroupInfo(**group.permissions, description=group.description) + self._permissions[group.id] = GroupInfo(**group.permissions, description=group.description, service_account_email=group.service_account_email) return self._permissions @property diff --git a/docker-compose.yml b/docker-compose.yml index 7c5cbd9..420ef6d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -11,8 +11,8 @@ services: restart: always env_file: .env.prod environment: - CELERY_BROKER_URL: redis://:${REDIS_PASSWORD}@redis:6379/0 ENVIRONMENT_FILE: .env.prod + REDIS_HOSTNAME: redis ports: - "127.0.0.1:8004:8000" #TODO: should prod have the --reload flag? @@ -42,7 +42,7 @@ services: - /var/run/docker.sock:/var/run/docker.sock - crawls:/crawls # BROWSERTRIX_HOME_HOST:BROWSERTRIX_HOME_CONTAINER, do not change /crawls environment: - CELERY_BROKER_URL: redis://:${REDIS_PASSWORD}@redis:6379/0 + REDIS_HOSTNAME: redis ENVIRONMENT_FILE: .env.prod WACZ_ENABLE_DOCKER: 1 # Enable calling docker from this container BROWSERTRIX_HOME_HOST: auto-archiver-api_crawls