diff --git a/.gitignore b/.gitignore index 1df91d8..ec92013 100644 --- a/.gitignore +++ b/.gitignore @@ -23,4 +23,5 @@ local_archive local_archive_test *db-wal *db-shm -copy-files.sh \ No newline at end of file +copy-files.sh +.pytest_cache \ No newline at end of file diff --git a/README.md b/README.md index 8582798..4a6e688 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,55 @@ An api that uses celery workers to process URL archive requests via [bellingcat/auto-archiver](https://github.com/bellingcat/auto-archiver), it allows authentication via Google OAuth Apps and enables CORS, everything runs on docker but development can be done without docker (except for redis). +## User, Domains, Groups, and permissions management +There are 2 ways to access the API: +1. via an API token which has full control/privileges to archive/search +2. via a Google Auth token which goes through the user access model + +#### User access model +The permissions are defined solely via the `user-groups.yaml` file +- users belong to groups which determine their access level/quotas/orchestration setup + - users are assigned to groups explicitly (via email) + - users are assigned to groups implicitly (via email domains) + - domains are associated to groups + - users that are not explicitly or implicitly in the system belong to the `default` group, restrict their permissions if you do not wish them to be able to search/archive + - if a user is assigned to a group which is not explicitly defined, a warning will be logged; this may be necessary if you discontinue a given group while the database still has entries for it +- groups determine + - which orchestrator to use for single URL archives and for spreadsheet archives + - a set of permissions + - `read` can be `["all"]`, `[]` or a list of group names, meaning people in this group can access either all archives, none, or those belonging to the explicitly listed groups. 
+ - the group itself must be included in the list, otherwise the user cannot search archives of that group + - `archive_url` a boolean that enables the user to archive links in this group + - `archive_sheet` a boolean that enables the user to archive spreadsheets + - `sheet_frequency` a list of options for the sheet archiving frequency, currently max permissions is `["hourly", "daily"]` + - `max_sheets` defines the maximum number of spreadsheets someone can have in total (`-1` means no limit) + - `max_archive_lifespan_months` defines the lifespan of an archive before being deleted from S3, users will be notified 1 month in advance with instructions to download (TODO) + - `max_monthly_urls` how many total URLs someone can archive per month (`-1` means no limit) + - `max_monthly_mbs` how many MBs of data someone can archive per month (`-1` means no limit) + - `priority` one of `high` or `low`, this will be used to give archiving priority + - group names are all lower-case + + +To figure out: +- workshop participants should be able to test this. `public` +- how can people bring their own storage/api keys? +- how to implement lifespan of archives? 6 months lifespan example. they should expect a way to download all archives locally. +- how to deactivate unused sheets and notify? +- how to mark URLs for deletion, and then do a hard delete? +- what actions can people take: + - URL (P=needs permission, O=open) + - P archive + - P search + - O find own links + - DISABLED find by id + - P delete archive (soft) + - Sheets + - P create a new sheet + - O get my sheets + - O delete a sheet + - P archive a sheet now + + ## Development http://localhost:8004 diff --git a/src/db/crud.py b/src/db/crud.py index c48f6ae..1368bc0 100644 --- a/src/db/crud.py +++ b/src/db/crud.py @@ -8,6 +8,7 @@ from datetime import datetime, timedelta from core.config import ALLOW_ANY_EMAIL from db.database import get_db from shared.settings import get_settings +from shared.user_groups import UserGroups from . 
import models, schemas import yaml @@ -202,13 +203,7 @@ def upsert_user_groups(db: Session): logger.debug("Updating user-groups configuration.") filename = get_settings().USER_GROUPS_FILENAME - # read yaml safely - try: - with open(filename) as inf: - user_groups_yaml = yaml.safe_load(inf) - except Exception as e: - logger.error(f"could not open user groups filename {filename}: {e}") - raise e + ug = UserGroups(filename) # delete all user-groups relationships db.query(models.association_table_user_groups).delete() @@ -219,33 +214,26 @@ def upsert_user_groups(db: Session): # create a map of group_id -> domains and another of domain -> groups group_domains = defaultdict(set) domain_groups = defaultdict(list) - for domain, explicit_groups in user_groups_yaml.get("domains", {}).items(): + for domain, explicit_groups in ug.domains.items(): domain_groups[domain] = list(set(explicit_groups)) for group in explicit_groups: group_domains[group].add(domain) - + import json # upsert groups and save a map of groupid -> dbobject - for group_id, g in user_groups_yaml.get("groups", {}).items(): - upsert_group(db, group_id, g["description"], g["orchestrator"], g["orchestrator_sheet"], g["permissions"], list(group_domains.get(group_id, []))) + for group_id, g in ug.groups.items(): + upsert_group(db, group_id, g.description, g.orchestrator, g.orchestrator_sheet, json.loads(g.permissions.model_dump_json()), list(group_domains.get(group_id, []))) db_groups: dict[str, models.Group] = {g.id: g for g in db.query(models.Group).all()} # integrity checks for group_in_domains in group_domains: if group_in_domains not in db_groups: logger.error(f"[CONFIG] Group '{group_in_domains}' does not exist in the database: domains setting will not work.") - if group_in_domains not in user_groups_yaml.get("groups", {}): - logger.error(f"[CONFIG] Group '{group_in_domains}' does not exist in the config file: domains setting will not work.") # reinsert users in their EXPLICITLY DEFINED groups # domain 
groups are check live, as there may be new users that are not explicitly registered but belong to a domain - for email, explicit_groups in user_groups_yaml.get("users", {}).items(): + for email, explicit_groups in ug.users.items(): explicit_groups = explicit_groups or [] - email = email.lower().strip() - if '@' not in email: - logger.error(f'[CONFIG] Invalid user email {email}, skipping.') - continue - - logger.info(f"{display_email_pii(email)} => {explicit_groups}") + logger.info(f"EXPLICIT {display_email_pii(email)} => {explicit_groups}") # upsert active user db_user = upsert_user(db, email, active=True) diff --git a/src/db/user_state.py b/src/db/user_state.py index 691574a..99c68ac 100644 --- a/src/db/user_state.py +++ b/src/db/user_state.py @@ -32,14 +32,12 @@ class UserState: @property def allowed_frequencies(self): - if not hasattr(self, '_allowed_frequencies'): - self._allowed_frequencies = set() + if not hasattr(self, '_sheet_frequency'): + self._sheet_frequency = set() for group in self.user_groups: if not group.permissions: continue - self._allowed_frequencies.add(group.permissions.get("allowed_frequency", None)) - if "hourly" in self._allowed_frequencies: - self._allowed_frequencies.add("daily") - return self._allowed_frequencies + self._sheet_frequency.update(group.permissions.get("sheet_frequency", None)) + return self._sheet_frequency @property def sheet_quota(self): @@ -51,11 +49,11 @@ class UserState: self._sheet_quota = 0 for group in self.user_groups: if not group.permissions: continue - active_sheets = group.permissions.get("active_sheets", 0) - if active_sheets == -1: + max_sheets = group.permissions.get("max_sheets", 0) + if max_sheets == -1: self._sheet_quota = -1 return self._sheet_quota - self._sheet_quota = max(self._sheet_quota, active_sheets) + self._sheet_quota = max(self._sheet_quota, max_sheets) return self._sheet_quota @@ -72,16 +70,16 @@ class UserState: return user_sheets < self.sheet_quota - def has_quota_monthly_urls(self) -> 
bool: + def has_quota_max_monthly_urls(self) -> bool: """ checks if a user has reached their monthly url quota """ quota = 0 for group in self.user_groups: if not group.permissions: continue - monthly_urls = group.permissions.get("monthly_urls", 0) - if monthly_urls == -1: return True - quota = max(quota, monthly_urls) + max_monthly_urls = group.permissions.get("max_monthly_urls", 0) + if max_monthly_urls == -1: return True + quota = max(quota, max_monthly_urls) current_month = datetime.now().month current_year = datetime.now().year @@ -93,16 +91,16 @@ class UserState: return user_urls < quota - def has_quota_monthly_mbs(self) -> bool: + def has_quota_max_monthly_mbs(self) -> bool: """ checks if a user has reached their monthly mb quota """ quota = 0 for group in self.user_groups: if not group.permissions: continue - monthly_mbs = group.permissions.get("monthly_mbs", 0) - if monthly_mbs == -1: return True - quota = max(quota, monthly_mbs) + max_monthly_mbs = group.permissions.get("max_monthly_mbs", 0) + if max_monthly_mbs == -1: return True + quota = max(quota, max_monthly_mbs) current_month = datetime.now().month current_year = datetime.now().year diff --git a/src/endpoints/default.py b/src/endpoints/default.py index efb6cf6..62d1174 100644 --- a/src/endpoints/default.py +++ b/src/endpoints/default.py @@ -49,8 +49,8 @@ def get_user_groups( "groups": user.user_groups_names, "allowedFrequencies": list(user.allowed_frequencies), "sheet_quota": user.sheet_quota, - "monthly_urls": user.monthly_urls, - "monthly_mbs": user.monthly_mbs, + "max_monthly_urls": user.max_monthly_urls, #TODO + "max_monthly_mbs": user.max_monthly_mbs, # TODO #TODO: should this return }) diff --git a/src/shared/user_groups.py b/src/shared/user_groups.py new file mode 100644 index 0000000..d4ee02f --- /dev/null +++ b/src/shared/user_groups.py @@ -0,0 +1,124 @@ +import yaml +from loguru import logger +from pydantic import BaseModel, field_validator, Field, model_validator +from typing import Dict, 
List, Set  # tail of the `from typing import Dict,` statement that begins on the previous line
from typing_extensions import Self


class UserGroups:
    """Load and validate the user-groups YAML configuration.

    After a successful construction the validated configuration is exposed
    through three attributes: ``users``, ``domains`` and ``groups``.
    """

    def __init__(self, filename):
        """Read ``filename`` and validate its content.

        Propagates any exception raised by ``read_yaml`` or
        ``validate_and_load``.
        """
        user_groups = self.read_yaml(filename)
        self.validate_and_load(user_groups)

    def read_yaml(self, user_groups_filename):
        """Return the content of the file parsed with ``yaml.safe_load``.

        I/O failures (missing/unreadable file) and YAML syntax errors are
        logged and then re-raised unchanged.
        """
        # The ``with`` statement sits inside the ``try`` so that a failing
        # ``open()`` is logged too: the message is about *opening* the file.
        try:
            with open(user_groups_filename) as inf:
                return yaml.safe_load(inf)
        except (OSError, yaml.YAMLError) as e:
            logger.error(f"could not open user groups filename {user_groups_filename}: {e}")
            raise

    def validate_and_load(self, user_groups):
        """Validate the parsed YAML and store ``users``/``domains``/``groups``.

        ``user_groups`` is unpacked as keyword arguments into
        ``UserGroupModel``; any failure is logged and re-raised.
        """
        try:
            configs = UserGroupModel(**user_groups)
            self.users = configs.users
            self.domains = configs.domains
            self.groups = configs.groups
        except Exception as e:
            logger.error(f"Validation error: {e}")
            raise


class GroupPermissions(BaseModel):
    """Permissions and quotas attached to one group.

    For the ``max_*`` fields ``-1`` stands for "no limit"; values below ``-1``
    are rejected.
    """

    # group names whose archives members of this group may read
    read: Set[str] = Field(default_factory=set)
    archive_url: bool = False
    archive_sheet: bool = False
    # subset of {"daily", "hourly"}; an empty set by default
    sheet_frequency: Set[str] = Field(default_factory=set)
    max_sheets: int = 0
    max_archive_lifespan_months: int = 12
    max_monthly_urls: int = 0
    max_monthly_mbs: int = 0
    # "low" or "high"
    priority: str = "low"

    # mode='after': the value is already a validated int here, so a
    # non-numeric YAML value is reported as a pydantic validation error
    # instead of crashing on the `<` comparison with a raw TypeError.
    @field_validator('max_sheets', 'max_archive_lifespan_months', 'max_monthly_urls', 'max_monthly_mbs', mode='after')
    @classmethod
    def validate_max_values(cls, v):
        """Reject values smaller than ``-1``."""
        if v < -1:
            raise ValueError("max_* values should be positive integers or -1 (for no limit).")
        return v

    @field_validator('sheet_frequency', mode='before')
    @classmethod
    def validate_sheet_frequency(cls, v):
        """Require a non-empty collection made only of allowed frequencies."""
        if not v:
            raise ValueError("sheet_frequency should have at least one value.")
        allowed = ["daily", "hourly"]
        for k in v:
            if k not in allowed:
                raise ValueError(f"Invalid sheet frequency: '{k}', expected one of {allowed}")
        return v

    # mode='after': the value is a validated str here, so `.lower()` cannot
    # fail with AttributeError on a non-string YAML value.
    @field_validator('priority', mode='after')
    @classmethod
    def validate_priority(cls, v):
        """Lower-case the priority and require it to be 'low' or 'high'."""
        v = v.lower()
        if v not in ["low", "high"]:
            raise ValueError("priority must be either 'low' or 'high'.")
        return v


class GroupModel(BaseModel):
    """One entry of the ``groups`` section: metadata plus its permissions."""

    description: str
    orchestrator: str
    orchestrator_sheet: str
    permissions: GroupPermissions

+class UserGroupModel(BaseModel): + users: Dict[str, List[str]] = Field(default_factory=dict) + domains: Dict[str, List[str]] = Field(default_factory=dict) + groups: Dict[str, GroupModel] = Field(default_factory=dict) + + @field_validator('users', mode='before') + @classmethod + def validate_emails(cls, v): + for email in v.keys(): + if '@' not in email: + raise ValueError(f"Invalid user, it should be an address: {email}") + if not v[email]: + raise ValueError(f"User {email} has no explicitly listed groups, only include them here if they should be in a group.") + return {k.lower().strip(): list(set([g.lower().strip() for g in v])) for k, v in v.items()} + + @field_validator('domains', mode='before') + @classmethod + def validate_domains(cls, v): + for domain, members in v.items(): + if '.' not in domain: + raise ValueError(f"Invalid domain, it should contain a dot: {domain}") + if not members: + raise ValueError(f"Domain {domain} should have at least one member.") + return {k.lower().strip(): list(set([g.lower().strip() for g in v])) for k, v in v.items()} + + @field_validator('groups', mode='before') + @classmethod + def validate_groups(cls, v): + if "default" not in v.keys(): + raise ValueError("Please include a 'default' group.") + if "all" in v.keys(): + raise ValueError("'all' is a reserved group name.") + for group in v.keys(): + if not group == group.lower(): + raise ValueError(f"Group names should be lowercase: {group}") + return v + + @model_validator(mode='after') + def check_groups_consistency(self) -> Self: + groups_in_domains = set([g for domain in self.domains for g in self.domains[domain]]) + groups_in_users = set([g for user in self.users for g in self.users[user]]) + configured_groups = set(self.groups.keys()) + + # groups mentioned in domains and users should be defined, but this is not a ValueError since historical DB data may require it + if groups_in_domains - configured_groups: + logger.warning(f"These groups are associated to DOMAINS but not 
defined in the GROUPS section, the domains settings may not work as expected: {groups_in_domains - configured_groups}") + if groups_in_users - configured_groups: + logger.warning(f"These groups are associated to USERS but not defined in the GROUPS section, the users settings may not work as expected: {groups_in_users - configured_groups}") + + return self diff --git a/src/tests/db/test_crud.py b/src/tests/db/test_crud.py index a465ee2..79c6894 100644 --- a/src/tests/db/test_crud.py +++ b/src/tests/db/test_crud.py @@ -57,8 +57,8 @@ def test_data(db_session): assert db_session.query(models.Group).count() == 0 from db import crud crud.upsert_user_groups(db_session) - assert db_session.query(models.Group).count() == 3 - assert db_session.query(models.User).count() == 4 + assert db_session.query(models.Group).count() == 4 + assert db_session.query(models.User).count() == 3 def test_get_archive(test_data, db_session): @@ -263,10 +263,10 @@ def test_count_archive_urls(test_data, db_session): def test_count_users(test_data, db_session): from db import crud - assert crud.count_users(db_session) == 4 + assert crud.count_users(db_session) == 3 db_session.query(models.User).filter(models.User.email == "rick@example.com").delete() db_session.commit() - assert crud.count_users(db_session) == 3 + assert crud.count_users(db_session) == 2 def test_count_by_users_since(test_data, db_session): @@ -313,7 +313,7 @@ def test_is_active_user(test_data, db_session): assert crud.is_active_user(db_session, "ANYONE@birdy.com") == True assert crud.is_active_user(db_session, "rick@example.com") == True assert crud.is_active_user(db_session, "RICK@example.com") == True - assert crud.is_active_user(db_session, "summer@herself.com") == True + assert crud.is_active_user(db_session, "summer@herself.com") == False assert crud.is_active_user(db_session, "rick@not-in-groups.com") == False @@ -369,7 +369,7 @@ def test_get_group(test_data, db_session): def test_create_or_get_user(test_data, db_session): 
from db import crud - assert db_session.query(models.User).count() == 4 + assert db_session.query(models.User).count() == 3 # already exists assert (u1 := crud.create_or_get_user(db_session, "rick@example.com")) is not None @@ -386,13 +386,13 @@ def test_create_or_get_user(test_data, db_session): assert u3.email == "not-active@example.com" assert u3.is_active == False - assert db_session.query(models.User).count() == 6 + assert db_session.query(models.User).count() == 5 def test_upsert_group(test_data, db_session): from db import crud - assert db_session.query(models.Group).count() == 3 + assert db_session.query(models.Group).count() == 4 repeatable_params = ["desc 1", "orch.yaml", "sheet.yaml", {"read": ["all"]}, ["example.com"]] @@ -415,7 +415,7 @@ def test_upsert_group(test_data, db_session): assert g3.id == "this-is-a-new-group" assert len(g3.users) == 0 - assert db_session.query(models.Group).count() == 4 + assert db_session.query(models.Group).count() == 5 def test_upsert_user_groups(db_session): diff --git a/src/tests/endpoints/test_default.py b/src/tests/endpoints/test_default.py index 9455bcb..fa371d1 100644 --- a/src/tests/endpoints/test_default.py +++ b/src/tests/endpoints/test_default.py @@ -128,7 +128,7 @@ async def test_prometheus_metrics(test_data, client_with_token, get_settings): assert 'disk_utilization{type="database"}' in r2.text assert 'database_metrics{query="count_archives"} 100.0' in r2.text assert 'database_metrics{query="count_archive_urls"} 1000.0' in r2.text - assert 'database_metrics{query="count_users"} 4.0' in r2.text + assert 'database_metrics{query="count_users"} 3.0' in r2.text assert 'database_metrics_counter_total{query="count_by_user",user="rick@example.com"} 34.0' in r2.text assert 'database_metrics_counter_total{query="count_by_user",user="morty@example.com"} 33.0' in r2.text assert 'database_metrics_counter_total{query="count_by_user",user="jerry@example.com"} 33.0' in r2.text @@ -139,7 +139,7 @@ async def 
test_prometheus_metrics(test_data, client_with_token, get_settings): r3 = client_with_token.get("/metrics") assert 'database_metrics{query="count_archives"} 100.0' in r3.text assert 'database_metrics{query="count_archive_urls"} 1000.0' in r3.text - assert 'database_metrics{query="count_users"} 4.0' in r3.text + assert 'database_metrics{query="count_users"} 3.0' in r3.text assert 'database_metrics_counter_total{query="count_by_user",user="rick@example.com"} 34.0' in r3.text assert 'database_metrics_counter_total{query="count_by_user",user="morty@example.com"} 33.0' in r3.text assert 'database_metrics_counter_total{query="count_by_user",user="jerry@example.com"} 33.0' in r3.text diff --git a/src/tests/endpoints/test_url.py b/src/tests/endpoints/test_url.py index 5cccbf2..b23b07e 100644 --- a/src/tests/endpoints/test_url.py +++ b/src/tests/endpoints/test_url.py @@ -5,7 +5,6 @@ from db.schemas import ArchiveCreate, TaskResult def test_archive_url_unauthenticated(client, test_no_auth): test_no_auth(client.post, "/url/archive") - test_no_auth(client.get, "/url/archive") @patch("worker.main.create_archive_task.delay", return_value=TaskResult(id="123-456-789", status="PENDING", result="")) @@ -118,11 +117,11 @@ def test_latest(client_with_auth, db_session): assert len(response.json()) == 2 -def test_lookup_unauthenticated(client, test_no_auth): - test_no_auth(client.get, "/url/123-456-789") - - # # TODO: find out where/if this is used, tests are also disabled + +# def test_lookup_unauthenticated(client, test_no_auth): +# test_no_auth(client.get, "/url/123-456-789") + # def test_lookup(client_with_auth, db_session): # response = client_with_auth.get("/url/lookup-123-456-789") # assert response.status_code == 404 diff --git a/src/tests/user-groups.test.yaml b/src/tests/user-groups.test.yaml index c1160c2..b2abf43 100644 --- a/src/tests/user-groups.test.yaml +++ b/src/tests/user-groups.test.yaml @@ -7,8 +7,8 @@ users: - spaceship jerry@example.com: - the-jerrys-club - 
summer@herself.com: - badyemail.com: + # summer@herself.com: + # badyemail.com: domains: example.com: @@ -32,27 +32,53 @@ groups: orchestrator_sheet: tests/orchestration.test.yaml permissions: read: ["all"] - active_sheets: -1 - monthly_urls: -1 - monthly_mbs: -1 - allowed_frequency: "hourly" + archive_url: true + archive_sheet: true + sheet_frequency: ["hourly", "daily"] + max_sheets: -1 + max_archive_lifespan_months: -1 + max_monthly_urls: -1 + max_monthly_mbs: -1 + priority: "high" interdimensional: description: "Interdimensional travelers" orchestrator: tests/orchestration.test.yaml orchestrator_sheet: tests/orchestration.test.yaml permissions: read: ["interdimensional", "animated-characters"] - active_sheets: 5 - monthly_urls: 1000 - monthly_mbs: 1000 - allowed_frequency: "hourly" + archive_url: true + archive_sheet: true + sheet_frequency: ["hourly", "daily"] + max_sheets: 5 + max_archive_lifespan_months: 12 + max_monthly_urls: 1000 + max_monthly_mbs: 1000 + priority: "high" animated-characters: description: "Animated characters" orchestrator: tests/orchestration.test.yaml orchestrator_sheet: tests/orchestration.test.yaml permissions: read: ["animated-characters"] - active_sheets: 1 - monthly_urls: 2 - monthly_mbs: 10 - allowed_frequency: "daily" \ No newline at end of file + archive_url: true + archive_sheet: true + sheet_frequency: ["daily"] + max_sheets: 1 + max_archive_lifespan_months: 12 + max_monthly_urls: 2 + max_monthly_mbs: 10 + priority: "low" + default: + description: "Public access" + orchestrator: tests/orchestration.test.yaml + orchestrator_sheet: tests/orchestration.test.yaml + permissions: + read: [] + archive_url: true + archive_sheet: true + sheet_frequency: ["daily"] + max_sheets: 1 + max_archive_lifespan_months: 12 + max_monthly_urls: 1 + max_monthly_mbs: 1 + priority: "low" \ No newline at end of file