diff --git a/.claude/worktrees/agent-a090b6ec b/.claude/worktrees/agent-a090b6ec new file mode 160000 index 0000000..a75d81a --- /dev/null +++ b/.claude/worktrees/agent-a090b6ec @@ -0,0 +1 @@ +Subproject commit a75d81a8d64e49d8a1fdd140e814fb24bebb6111 diff --git a/.claude/worktrees/agent-a11dddbd b/.claude/worktrees/agent-a11dddbd new file mode 160000 index 0000000..8d97b26 --- /dev/null +++ b/.claude/worktrees/agent-a11dddbd @@ -0,0 +1 @@ +Subproject commit 8d97b263ece69367d1061129036370bcf43d14cf diff --git a/.claude/worktrees/agent-a19eb2f7 b/.claude/worktrees/agent-a19eb2f7 new file mode 160000 index 0000000..d98513b --- /dev/null +++ b/.claude/worktrees/agent-a19eb2f7 @@ -0,0 +1 @@ +Subproject commit d98513bf55e0f7eac882f449a3d1622fe1394eb2 diff --git a/.claude/worktrees/agent-a1a93bb2 b/.claude/worktrees/agent-a1a93bb2 new file mode 160000 index 0000000..6ab411c --- /dev/null +++ b/.claude/worktrees/agent-a1a93bb2 @@ -0,0 +1 @@ +Subproject commit 6ab411cda230324bc12c5b65ecec5921b17aaa1a diff --git a/.claude/worktrees/agent-a1ab7cd2/.claude/worktrees/agent-a30fab90/.claude/worktrees/agent-a3b639bf/.claude/worktrees/agent-a9511329/.claude/worktrees/agent-aed10f3e/.claude/worktrees/agent-a44a25be b/.claude/worktrees/agent-a1ab7cd2/.claude/worktrees/agent-a30fab90/.claude/worktrees/agent-a3b639bf/.claude/worktrees/agent-a9511329/.claude/worktrees/agent-aed10f3e/.claude/worktrees/agent-a44a25be new file mode 160000 index 0000000..0ff9edc --- /dev/null +++ b/.claude/worktrees/agent-a1ab7cd2/.claude/worktrees/agent-a30fab90/.claude/worktrees/agent-a3b639bf/.claude/worktrees/agent-a9511329/.claude/worktrees/agent-aed10f3e/.claude/worktrees/agent-a44a25be @@ -0,0 +1 @@ +Subproject commit 0ff9edc6c1ca2679840bdfc95604b8615537eb0a diff --git a/.claude/worktrees/agent-a2637f83 b/.claude/worktrees/agent-a2637f83 new file mode 160000 index 0000000..3d3c57f --- /dev/null +++ b/.claude/worktrees/agent-a2637f83 @@ -0,0 +1 @@ +Subproject commit 
3d3c57fff27abf35950529d113042ea6a4f2b820 diff --git a/.claude/worktrees/agent-a27c3406 b/.claude/worktrees/agent-a27c3406 new file mode 160000 index 0000000..61a9d52 --- /dev/null +++ b/.claude/worktrees/agent-a27c3406 @@ -0,0 +1 @@ +Subproject commit 61a9d527ee67fb07db46fdfb5db2acb9023416e2 diff --git a/.claude/worktrees/agent-a2e54e09 b/.claude/worktrees/agent-a2e54e09 new file mode 160000 index 0000000..d0396bb --- /dev/null +++ b/.claude/worktrees/agent-a2e54e09 @@ -0,0 +1 @@ +Subproject commit d0396bb3848306fced1e050254b04343dbdc3e60 diff --git a/.claude/worktrees/agent-a2fe7ff3 b/.claude/worktrees/agent-a2fe7ff3 new file mode 160000 index 0000000..223c23e --- /dev/null +++ b/.claude/worktrees/agent-a2fe7ff3 @@ -0,0 +1 @@ +Subproject commit 223c23e6720e74c31e565e50635162bb830e8be1 diff --git a/.claude/worktrees/agent-a309b50b/.claude/worktrees/agent-a1113d5a b/.claude/worktrees/agent-a309b50b/.claude/worktrees/agent-a1113d5a new file mode 160000 index 0000000..1013caf --- /dev/null +++ b/.claude/worktrees/agent-a309b50b/.claude/worktrees/agent-a1113d5a @@ -0,0 +1 @@ +Subproject commit 1013caf843739dfb0ae2676cd3d9190754e84984 diff --git a/.claude/worktrees/agent-a309b50b/.claude/worktrees/agent-ad901ba0 b/.claude/worktrees/agent-a309b50b/.claude/worktrees/agent-ad901ba0 new file mode 160000 index 0000000..abfc2f8 --- /dev/null +++ b/.claude/worktrees/agent-a309b50b/.claude/worktrees/agent-ad901ba0 @@ -0,0 +1 @@ +Subproject commit abfc2f8319807e979448eff7b19f3b06bc42d95f diff --git a/.claude/worktrees/agent-a309b50b/.claude/worktrees/agent-adad8c10 b/.claude/worktrees/agent-a309b50b/.claude/worktrees/agent-adad8c10 new file mode 160000 index 0000000..95ee768 --- /dev/null +++ b/.claude/worktrees/agent-a309b50b/.claude/worktrees/agent-adad8c10 @@ -0,0 +1 @@ +Subproject commit 95ee76826691012f7fc7c9be30a20f2ec173bda0 diff --git a/.claude/worktrees/agent-a5bf4f07 b/.claude/worktrees/agent-a5bf4f07 new file mode 160000 index 0000000..43aeb89 --- /dev/null +++ 
b/.claude/worktrees/agent-a5bf4f07 @@ -0,0 +1 @@ +Subproject commit 43aeb8985d722d20027e57c3305f57eee16a7691 diff --git a/.claude/worktrees/agent-a5d8d812 b/.claude/worktrees/agent-a5d8d812 new file mode 160000 index 0000000..6303308 --- /dev/null +++ b/.claude/worktrees/agent-a5d8d812 @@ -0,0 +1 @@ +Subproject commit 630330820702401f6a5f1bc7d02b669bbbf523c2 diff --git a/.claude/worktrees/agent-a6700ee2 b/.claude/worktrees/agent-a6700ee2 new file mode 160000 index 0000000..d8a54f2 --- /dev/null +++ b/.claude/worktrees/agent-a6700ee2 @@ -0,0 +1 @@ +Subproject commit d8a54f2c164a477c9afbe86f55d00db3d1f16b08 diff --git a/.claude/worktrees/agent-a7f84823 b/.claude/worktrees/agent-a7f84823 new file mode 160000 index 0000000..21d5551 --- /dev/null +++ b/.claude/worktrees/agent-a7f84823 @@ -0,0 +1 @@ +Subproject commit 21d5551aa42cf4e12efd68ba3dbfc706f0e86cb6 diff --git a/.claude/worktrees/agent-abce7711 b/.claude/worktrees/agent-abce7711 new file mode 160000 index 0000000..c595fef --- /dev/null +++ b/.claude/worktrees/agent-abce7711 @@ -0,0 +1 @@ +Subproject commit c595fef148d4820307d51df70ccae7195cac9462 diff --git a/.claude/worktrees/agent-ac81d6ab b/.claude/worktrees/agent-ac81d6ab new file mode 160000 index 0000000..cae714b --- /dev/null +++ b/.claude/worktrees/agent-ac81d6ab @@ -0,0 +1 @@ +Subproject commit cae714b4887336af12643d1e7ddec36bd40a74c5 diff --git a/.claude/worktrees/agent-ad7ef8d3 b/.claude/worktrees/agent-ad7ef8d3 new file mode 160000 index 0000000..792ac8d --- /dev/null +++ b/.claude/worktrees/agent-ad7ef8d3 @@ -0,0 +1 @@ +Subproject commit 792ac8d54bb6cbd6c1059cc3b4870cf43f5b14fa diff --git a/.claude/worktrees/agent-ae6d1042/.claude/worktrees/agent-a0a11e9a b/.claude/worktrees/agent-ae6d1042/.claude/worktrees/agent-a0a11e9a new file mode 160000 index 0000000..a639cde --- /dev/null +++ b/.claude/worktrees/agent-ae6d1042/.claude/worktrees/agent-a0a11e9a @@ -0,0 +1 @@ +Subproject commit a639cdea02bbe724e636d79df28d64576cff02a5 diff --git 
a/.claude/worktrees/agent-aefa9208 b/.claude/worktrees/agent-aefa9208 new file mode 160000 index 0000000..a2347f1 --- /dev/null +++ b/.claude/worktrees/agent-aefa9208 @@ -0,0 +1 @@ +Subproject commit a2347f150a7f389db4794c5c19944163bcccbc4b diff --git a/.planning/REQUIREMENTS.md b/.planning/REQUIREMENTS.md index 2b6156d..7b691dc 100644 --- a/.planning/REQUIREMENTS.md +++ b/.planning/REQUIREMENTS.md @@ -173,11 +173,11 @@ Requirements for initial release. Each maps to roadmap phases. ### OSINT/Recon — Frontend & JS Leaks -- [ ] **RECON-JS-01**: JavaScript source map extraction and scanning -- [ ] **RECON-JS-02**: Webpack/Vite bundle scanning for inlined env vars -- [ ] **RECON-JS-03**: Exposed .env file scanning on web servers -- [ ] **RECON-JS-04**: Exposed Swagger/OpenAPI documentation scanning -- [ ] **RECON-JS-05**: Vercel/Netlify deploy preview JS bundle scanning +- [x] **RECON-JS-01**: JavaScript source map extraction and scanning +- [x] **RECON-JS-02**: Webpack/Vite bundle scanning for inlined env vars +- [x] **RECON-JS-03**: Exposed .env file scanning on web servers +- [x] **RECON-JS-04**: Exposed Swagger/OpenAPI documentation scanning +- [x] **RECON-JS-05**: Vercel/Netlify deploy preview JS bundle scanning ### OSINT/Recon — Log Aggregators diff --git a/.planning/phases/14-osint_ci_cd_logs_web_archives_frontend_leaks/14-03-SUMMARY.md b/.planning/phases/14-osint_ci_cd_logs_web_archives_frontend_leaks/14-03-SUMMARY.md new file mode 100644 index 0000000..8805e75 --- /dev/null +++ b/.planning/phases/14-osint_ci_cd_logs_web_archives_frontend_leaks/14-03-SUMMARY.md @@ -0,0 +1,152 @@ +--- +phase: 14-osint_ci_cd_logs_web_archives_frontend_leaks +plan: 03 +subsystem: recon +tags: [sourcemaps, webpack, dotenv, swagger, openapi, vercel, netlify, frontend-leaks] + +requires: + - phase: 10-osint-code-hosting + provides: "ReconSource interface, Client, BuildQueries, LimiterRegistry patterns" + - phase: 13-osint-package-registries + provides: "RegisterAll with 40 sources 
baseline" +provides: + - "SourceMapSource for probing .map files for original source with API keys" + - "WebpackSource for scanning JS bundles for inlined env vars" + - "EnvLeakSource for detecting exposed .env files on web servers" + - "SwaggerSource for finding API keys in OpenAPI example/default fields" + - "DeployPreviewSource for scanning Vercel/Netlify previews for leaked env vars" + - "RegisterAll extended to 45 sources" +affects: [14-04, 14-05, 15, 16] + +tech-stack: + added: [] + patterns: ["Multi-path probing pattern for credentialless web asset scanning"] + +key-files: + created: + - pkg/recon/sources/sourcemap.go + - pkg/recon/sources/sourcemap_test.go + - pkg/recon/sources/webpack.go + - pkg/recon/sources/webpack_test.go + - pkg/recon/sources/envleak.go + - pkg/recon/sources/envleak_test.go + - pkg/recon/sources/swagger.go + - pkg/recon/sources/swagger_test.go + - pkg/recon/sources/deploypreview.go + - pkg/recon/sources/deploypreview_test.go + modified: + - pkg/recon/sources/register.go + - pkg/recon/sources/register_test.go + - pkg/recon/sources/integration_test.go + +key-decisions: + - "Multi-path probing: each source probes multiple common paths per query rather than single endpoint" + - "Nil Limiters in tests: skip rate limiting in httptest to keep tests fast (<1s)" + - "RegisterAll extended to 45 sources (40 Phase 10-13 + 5 Phase 14 frontend leak sources)" + +patterns-established: + - "Multi-path probing pattern: sources that probe multiple common URL paths per domain/query hint" + - "Regex-based content scanning: compile-time regex patterns for detecting secrets in response bodies" + +requirements-completed: [RECON-JS-01, RECON-JS-02, RECON-JS-03, RECON-JS-04, RECON-JS-05] + +duration: 5min +completed: 2026-04-06 +--- + +# Phase 14 Plan 03: Frontend Leak Sources Summary + +**Five credentialless frontend leak scanners: source maps, webpack bundles, exposed .env files, Swagger docs, and deploy preview environments** + +## Performance + +- 
**Duration:** 5 min +- **Started:** 2026-04-06T10:13:15Z +- **Completed:** 2026-04-06T10:18:15Z +- **Tasks:** 2 +- **Files modified:** 13 + +## Accomplishments +- SourceMapSource probes 7 common .map paths, parses JSON sourcesContent for API key patterns +- WebpackSource scans JS bundles for NEXT_PUBLIC_/REACT_APP_/VITE_ prefixed env var leaks +- EnvLeakSource probes 8 common .env paths with multiline regex matching for secret key=value lines +- SwaggerSource parses OpenAPI JSON docs for API keys in example/default fields +- DeployPreviewSource scans Vercel/Netlify preview URLs for __NEXT_DATA__ and env var patterns +- RegisterAll extended from 40 to 45 sources + +## Task Commits + +Each task was committed atomically: + +1. **Task 1: SourceMapSource, WebpackSource, EnvLeakSource + tests** - `b57bd5e` (feat) +2. **Task 2: SwaggerSource, DeployPreviewSource + tests** - `7d8a418` (feat) +3. **RegisterAll wiring** - `0a8be81` (feat) + +## Files Created/Modified +- `pkg/recon/sources/sourcemap.go` - Source map file probing and content scanning +- `pkg/recon/sources/sourcemap_test.go` - httptest-based tests for source map scanning +- `pkg/recon/sources/webpack.go` - Webpack/Vite bundle env var detection +- `pkg/recon/sources/webpack_test.go` - httptest-based tests for webpack scanning +- `pkg/recon/sources/envleak.go` - Exposed .env file detection +- `pkg/recon/sources/envleak_test.go` - httptest-based tests for .env scanning +- `pkg/recon/sources/swagger.go` - Swagger/OpenAPI doc API key extraction +- `pkg/recon/sources/swagger_test.go` - httptest-based tests for Swagger scanning +- `pkg/recon/sources/deploypreview.go` - Vercel/Netlify deploy preview scanning +- `pkg/recon/sources/deploypreview_test.go` - httptest-based tests for deploy preview scanning +- `pkg/recon/sources/register.go` - Extended RegisterAll to 45 sources +- `pkg/recon/sources/register_test.go` - Updated test expectations to 45 +- `pkg/recon/sources/integration_test.go` - Updated integration test 
count to 45 + +## Decisions Made +- Multi-path probing: each source probes multiple common URL paths per query rather than constructing real domain URLs (sources are lead generators) +- Nil Limiters in sweep tests: rate limiter adds 3s per path probe making tests take 20+ seconds; skip in unit tests, test rate limiting separately +- envKeyValuePattern uses (?im) multiline flag for proper line-anchored matching in .env file content + +## Deviations from Plan + +### Auto-fixed Issues + +**1. [Rule 1 - Bug] Fixed multiline regex in EnvLeakSource** +- **Found during:** Task 1 (EnvLeakSource tests) +- **Issue:** envKeyValuePattern used ^ anchor without (?m) multiline flag, failing to match lines in multi-line .env content +- **Fix:** Added (?m) flag to regex: `(?im)^[A-Z_]*(API[_]?KEY|SECRET|...)` +- **Files modified:** pkg/recon/sources/envleak.go +- **Verification:** TestEnvLeak_Sweep_ExtractsFindings passes +- **Committed in:** b57bd5e (Task 1 commit) + +**2. [Rule 1 - Bug] Removed unused imports in sourcemap.go** +- **Found during:** Task 1 (compilation) +- **Issue:** "fmt" and "strings" imported but unused +- **Fix:** Removed unused imports +- **Files modified:** pkg/recon/sources/sourcemap.go +- **Committed in:** b57bd5e (Task 1 commit) + +**3. [Rule 2 - Missing Critical] Extended RegisterAll and updated integration tests** +- **Found during:** After Task 2 (wiring sources) +- **Issue:** New sources needed registration in RegisterAll; existing tests hardcoded 40 source count +- **Fix:** Added 5 sources to RegisterAll, updated register_test.go and integration_test.go +- **Files modified:** pkg/recon/sources/register.go, register_test.go, integration_test.go +- **Committed in:** 0a8be81 + +--- + +**Total deviations:** 3 auto-fixed (2 bugs, 1 missing critical) +**Impact on plan:** All fixes necessary for correctness. No scope creep. + +## Issues Encountered +None beyond the auto-fixed deviations above. 
+ +## User Setup Required +None - all five sources are credentialless. + +## Known Stubs +None - all sources are fully implemented with real scanning logic. + +## Next Phase Readiness +- 45 sources now registered in RegisterAll +- Frontend leak scanning vectors covered: source maps, webpack bundles, .env files, Swagger docs, deploy previews +- Ready for remaining Phase 14 plans (CI/CD log sources, web archive sources) + +--- +*Phase: 14-osint_ci_cd_logs_web_archives_frontend_leaks* +*Completed: 2026-04-06* diff --git a/RESEARCH_REPORT.md b/RESEARCH_REPORT.md new file mode 100644 index 0000000..8ed53cb --- /dev/null +++ b/RESEARCH_REPORT.md @@ -0,0 +1,548 @@ +# API Key Scanner Market Research Report +**Date: April 4, 2026** + +--- + +## Table of Contents +1. [Existing Open-Source API Key Scanners](#1-existing-open-source-api-key-scanners) +2. [LLM-Specific API Key Tools](#2-llm-specific-api-key-tools) +3. [Top LLM API Providers (100+)](#3-top-llm-api-providers) +4. [API Key Patterns by Provider](#4-api-key-patterns-by-provider) +5. [Key Validation Approaches](#5-key-validation-approaches) +6. [Market Gaps & Opportunities](#6-market-gaps--opportunities) + +--- + +## 1. 
Existing Open-Source API Key Scanners + +### 1.1 TruffleHog +- **GitHub:** https://github.com/trufflesecurity/trufflehog +- **Stars:** ~25,500 +- **Language:** Go +- **Detectors:** 800+ secret types +- **Approach:** Detector-based (each detector is a small Go program for a specific credential type) +- **Detection methods:** + - Pattern matching via dedicated detectors + - Active verification against live APIs + - Permission/scope analysis (~20 credential types) +- **AI/LLM detectors confirmed:** OpenAI, OpenAI Admin Key, Anthropic +- **Scanning sources:** Git repos, GitHub orgs, S3 buckets, GCS, Docker images, Jenkins, Elasticsearch, Postman, Slack, local filesystems +- **Key differentiator:** Verification — not just "this looks like a key" but "this is an active key with these permissions" +- **Limitations:** + - Heavy/slow compared to regex-only scanners + - Not all 800+ detectors have verification + - LLM provider coverage still incomplete (no confirmed Cohere, Mistral, Groq detectors) + +### 1.2 Gitleaks +- **GitHub:** https://github.com/gitleaks/gitleaks +- **Stars:** ~25,800 +- **Language:** Go +- **Rules:** 150+ regex patterns in `gitleaks.toml` +- **Approach:** Regex pattern matching with optional entropy checks +- **Detection methods:** + - Regex patterns defined in TOML config + - Keyword matching + - Entropy thresholds + - Allowlists for false positive reduction +- **AI/LLM rules confirmed:** + - `anthropic-admin-api-key`: `sk-ant-admin01-[a-zA-Z0-9_\-]{93}AA` + - `anthropic-api-key`: `sk-ant-api03-[a-zA-Z0-9_\-]{93}AA` + - `openai-api-key`: Updated to include `sk-proj-` and `sk-svcacct-` formats + - `cohere-api-token`: Keyword-based detection + - `huggingface-access-token`: `hf_[a-z]{34}` + - `huggingface-organization-api-token`: `api_org_[a-z]{34}` +- **Key differentiator:** Fast, simple, excellent as pre-commit hook +- **Limitations:** + - No active verification of detected keys + - Regex-only means higher false positive rate for generic patterns + - 
Limited LLM provider coverage beyond the four providers above (Anthropic, OpenAI, Cohere, Hugging Face) +- **Note:** Gitleaks creator launched "Betterleaks" in 2026 as a successor built for the agentic era + +### 1.3 detect-secrets (Yelp) +- **GitHub:** https://github.com/Yelp/detect-secrets +- **Stars:** ~4,300 +- **Language:** Python +- **Plugins:** 27 built-in detectors +- **Approach:** Baseline methodology — tracks known secrets and flags new ones +- **Detection methods:** + - Regex-based plugins (structured secrets) + - High entropy string detection (Base64, Hex) + - Keyword detection (variable name matching) + - Optional ML-based gibberish detector (v1.1+) +- **AI/LLM plugins confirmed:** + - `OpenAIDetector` plugin exists + - No dedicated Anthropic, Cohere, Mistral, or Groq plugins +- **Key differentiator:** Baseline approach — only flags NEW secrets, not historical ones; enterprise-friendly +- **Limitations:** + - Minimal LLM provider coverage + - Only limited active verification + - Fewer patterns than TruffleHog or Gitleaks + - Python-only (slower than Go/Rust alternatives) + +### 1.4 Nosey Parker (Praetorian) +- **GitHub:** https://github.com/praetorian-inc/noseyparker +- **Stars:** ~2,300 +- **Language:** Rust +- **Rules:** 188 high-precision regex rules +- **Approach:** Hybrid regex + ML denoising +- **Detection methods:** + - 188 tested regex rules tuned for low false positives + - ML model for false positive reduction (10-1000x improvement) + - Deduplication/grouping of findings +- **Performance:** GB/s scanning speeds, tested on 20TB+ datasets +- **Key differentiator:** ML-enhanced denoising, extreme performance +- **Status:** RETIRED — replaced by Titus (https://github.com/praetorian-inc/titus) +- **Limitations:** + - No specific LLM provider rules documented + - No active verification + - Project discontinued + +### 1.5 GitGuardian +- **Website:** https://www.gitguardian.com +- **Type:** Commercial + free tier for public repos +- **Detectors:** 450+ secret types +- **Approach:** Regex + AI-powered false
positive reduction +- **Detection methods:** + - Specific prefix-based detectors + - Fine-tuned code-LLM for false positive filtering + - Validity checking for supported detectors +- **AI/LLM coverage:** + - Groq API Key (prefixed, with validity check) + - OpenAI, Anthropic, HuggingFace (confirmed) + - AI-related leaked secrets up 81% YoY in 2025 + - 1,275,105 leaked AI service secrets detected in 2025 +- **Key differentiator:** AI-powered false positive reduction, massive scale (scans all public GitHub) +- **Limitations:** + - Commercial/proprietary for private repos + - Regex patterns not publicly disclosed + +### 1.6 GitHub Secret Scanning (Native) +- **Type:** Built into GitHub +- **Approach:** Provider-partnered pattern matching + Copilot AI +- **AI/LLM patterns supported (with push protection and validity status):** + +| Provider | Pattern | Push Protection | Validity Check | +|----------|---------|:-:|:-:| +| Anthropic | `anthropic_admin_api_key` | Yes | Yes | +| Anthropic | `anthropic_api_key` | Yes | Yes | +| Anthropic | `anthropic_session_id` | Yes | No | +| Cohere | `cohere_api_key` | Yes | No | +| DeepSeek | `deepseek_api_key` | No | Yes | +| Google | `google_gemini_api_key` | No | No | +| Groq | `groq_api_key` | Yes | Yes | +| Hugging Face | `hf_org_api_key` | Yes | No | +| Hugging Face | `hf_user_access_token` | Yes | Yes | +| Mistral AI | `mistral_ai_api_key` | No | No | +| OpenAI | `openai_api_key` | Yes | Yes | +| Replicate | `replicate_api_token` | Yes | Yes | +| xAI | `xai_api_key` | Yes | Yes | +| Azure | `azure_openai_key` | Yes | No | + +- **Recent developments (March 2026):** + - Added 37 new secret detectors including Langchain + - Extended scanning to AI coding agents via MCP + - Copilot uses GPT-3.5-Turbo + GPT-4 for unstructured secret detection (94% FP reduction) + - Base64-encoded secret detection with push protection + +### 1.7 Other Notable Tools + +| Tool | Stars | Language | Patterns | Key Feature | 
+|------|-------|----------|----------|-------------| +| **KeyHacks** (streaak) | 6,100 | Markdown/Shell | 100+ services | Validation curl commands for bug bounty | +| **keyhacks.sh** (gwen001) | ~500 | Bash | 50+ | Automated version of KeyHacks | +| **Secrets Patterns DB** (mazen160) | 1,400 | YAML/Regex | 1,600+ | Largest open-source regex DB, exports to TruffleHog/Gitleaks format | +| **secret-regex-list** (h33tlit) | ~1,000 | Regex | 100+ | Regex patterns for scraping secrets | +| **regextokens** (odomojuli) | ~300 | Regex | 50+ | OAuth/API token regex patterns | +| **Betterleaks** | New (2026) | Go | — | Gitleaks successor for agentic era | + +--- + +## 2. LLM-Specific API Key Tools + +### 2.1 Dedicated LLM Key Validators + +| Tool | URL | Providers | Approach | +|------|-----|-----------|----------| +| **TestMyAPIKey.com** | testmyapikey.com | OpenAI, Anthropic Claude, + 13 others | Client-side regex + live API validation | +| **SecurityWall Checker** | securitywall.co/tools/api-key-checker | 455+ patterns, 350+ services (incl. OpenAI, Anthropic) | Client-side regex, generates curl commands | +| **VibeFactory Scanner** | vibefactory.ai/api-key-security-scanner | 150+ types (incl. 
OpenAI) | Scans deployed websites for exposed keys | +| **KeyLeak Detector** | github.com/Amal-David/keyleak-detector | Multiple | Headless browser + network interception | +| **OpenAI Key Tester** | trevorfox.com/api-key-tester/openai | OpenAI, Anthropic | Direct API validation | +| **Chatbot API Tester** | apikeytester.netlify.app | OpenAI, DeepSeek, OpenRouter | Endpoint validation | +| **SecurityToolkits** | securitytoolkits.com/tools/apikey-validator | Multiple | API key/token checker | + +### 2.2 LLM Gateways with Key Validation + +These tools validate keys as part of their proxy/gateway functionality: + +| Tool | Stars | Providers | Validation Approach | +|------|-------|-----------|---------------------| +| **LiteLLM** | ~18k | 107 providers | AuthenticationError mapping from all providers | +| **OpenRouter** | — | 60+ providers, 500+ models | Unified API key, provider-level validation | +| **Portkey AI** | ~5k | 30+ providers | AI gateway with key validation | +| **LLM-API-Key-Proxy** | ~200 | OpenAI, Anthropic compatible | Self-hosted proxy with key validation | + +### 2.3 Key Gap: No Comprehensive LLM-Focused Scanner + +**Critical finding:** There is NO dedicated open-source tool that: +1. Detects API keys from all major LLM providers (50+) +2. Validates them against live APIs +3. Reports provider, model access, rate limits, and spend +4. Covers both legacy and new key formats + +The closest tools are: +- TruffleHog (broadest verification, but only ~3 confirmed LLM detectors) +- GitHub Secret Scanning (14 AI-related patterns, but GitHub-only) +- GitGuardian (broad AI coverage, but commercial) + +--- + +## 3. 
Top LLM API Providers + +### Tier 1: Major Cloud & Frontier Model Providers +| # | Provider | Key Product | Notes | +|---|----------|-------------|-------| +| 1 | **OpenAI** | GPT-5, GPT-4o, o-series | Market leader | +| 2 | **Anthropic** | Claude Opus 4, Sonnet, Haiku | Enterprise focus | +| 3 | **Google (Gemini/Vertex AI)** | Gemini 2.5 Pro/Flash | 2M token context | +| 4 | **AWS Bedrock** | Multi-model (Claude, Llama, etc.) | AWS ecosystem | +| 5 | **Azure OpenAI** | GPT-4o, o-series | Enterprise SLA 99.9% | +| 6 | **Google AI Studio** | Gemini API | Developer-friendly | +| 7 | **xAI** | Grok 4.1 | 2M context, low cost | + +### Tier 2: Specialized & Competitive Providers +| # | Provider | Key Product | Notes | +|---|----------|-------------|-------| +| 8 | **Mistral AI** | Mistral Large, Codestral | European, open-weight | +| 9 | **Cohere** | Command R+ | Enterprise RAG focus | +| 10 | **DeepSeek** | DeepSeek R1, V3 | Ultra-low cost reasoning | +| 11 | **Perplexity** | Sonar Pro | Search-augmented LLM | +| 12 | **Together AI** | 200+ open-source models | Low latency inference | +| 13 | **Groq** | LPU inference | Fastest inference speeds | +| 14 | **Fireworks AI** | Open-source model hosting | Sub-100ms latency | +| 15 | **Replicate** | Model hosting platform | Pay-per-use | +| 16 | **Cerebras** | Wafer-scale inference | Ultra-fast inference | +| 17 | **SambaNova** | Enterprise inference | Custom silicon | +| 18 | **AI21** | Jamba models | Long context | +| 19 | **Stability AI** | Stable Diffusion, text models | Image + text | +| 20 | **NVIDIA NIM** | Optimized model serving | GPU-optimized | + +### Tier 3: Infrastructure, Platform & Gateway Providers +| # | Provider | Key Product | Notes | +|---|----------|-------------|-------| +| 21 | **Cloudflare Workers AI** | Edge inference | Edge computing | +| 22 | **Vercel AI** | AI SDK, v0 | Frontend-focused | +| 23 | **OpenRouter** | Multi-model gateway | 500+ models | +| 24 | **HuggingFace** | Inference API, 300+ 
models | Open-source hub | +| 25 | **DeepInfra** | Inference platform | Cost-effective | +| 26 | **Novita AI** | 200+ production APIs | Multi-modal | +| 27 | **Baseten** | Model serving | Custom deployments | +| 28 | **Anyscale** | Ray-based inference | Scalable | +| 29 | **Lambda AI** | GPU cloud + inference | | +| 30 | **OctoAI** | Optimized inference | | +| 31 | **Databricks** | DBRX, model serving | Data + AI | +| 32 | **Snowflake** | Cortex AI | Data warehouse + AI | +| 33 | **Oracle OCI** | OCI AI | Enterprise | +| 34 | **SAP Generative AI Hub** | Enterprise AI | SAP ecosystem | +| 35 | **IBM WatsonX** | Granite models | Enterprise | + +### Tier 4: Chinese & Regional Providers +| # | Provider | Key Product | Notes | +|---|----------|-------------|-------| +| 36 | **Alibaba (Qwen/Dashscope)** | Qwen 2.5/3 series | Top Chinese open-source | +| 37 | **Baidu (Wenxin/ERNIE)** | ERNIE 4.0 | Chinese market leader | +| 38 | **ByteDance (Doubao)** | Doubao models | TikTok parent | +| 39 | **Zhipu AI** | GLM-4.5 | ChatGLM lineage | +| 40 | **Baichuan** | Baichuan 4 | Domain-specific (law, finance) | +| 41 | **Moonshot AI (Kimi)** | Kimi K1.5/K2 | 128K context | +| 42 | **01.AI (Yi)** | Yi-Large, Yi-34B | Founded by Kai-Fu Lee | +| 43 | **MiniMax** | MiniMax models | Chinese AI tiger | +| 44 | **StepFun** | Step models | Chinese AI tiger | +| 45 | **Tencent (Hunyuan)** | Hunyuan models | WeChat ecosystem | +| 46 | **iFlyTek (Spark)** | Spark models | Voice/NLP specialist | +| 47 | **SenseNova (SenseTime)** | SenseNova models | Vision + language | +| 48 | **Volcano Engine (ByteDance)** | Cloud AI services | ByteDance cloud | +| 49 | **Nebius AI** | Inference platform | Yandex spinoff | + +### Tier 5: Emerging, Niche & Specialized Providers +| # | Provider | Key Product | Notes | +|---|----------|-------------|-------| +| 50 | **Aleph Alpha** | Luminous models | EU-focused, compliance | +| 51 | **Comet API** | ML experiment tracking | | +| 52 | **Writer** | Palmyra models |
Enterprise content | +| 53 | **Reka AI** | Reka Core/Flash | Multimodal | +| 54 | **Upstage** | Solar models | Korean provider | +| 55 | **FriendliAI** | Inference optimization | | +| 56 | **Forefront AI** | Model hosting | | +| 57 | **GooseAI** | GPT-NeoX hosting | Low cost | +| 58 | **NLP Cloud** | Model hosting | | +| 59 | **Predibase** | Fine-tuning platform | LoRA specialist | +| 60 | **Clarifai** | Vision + LLM | | +| 61 | **AiLAYER** | AI platform | | +| 62 | **AIMLAPI** | Multi-model API | | +| 63 | **Corcel** | Decentralized inference | Bittensor-based | +| 64 | **HyperBee AI** | AI platform | | +| 65 | **Lamini** | Fine-tuning + inference | | +| 66 | **Monster API** | GPU inference | | +| 67 | **Neets.ai** | TTS + LLM | | +| 68 | **Featherless AI** | Inference | | +| 69 | **Hyperbolic** | Inference platform | | +| 70 | **Inference.net** | Open-source inference | | +| 71 | **Galadriel** | Decentralized AI | | +| 72 | **PublicAI** | Community inference | | +| 73 | **Bytez** | Model hosting | | +| 74 | **Chutes** | Inference | | +| 75 | **GMI Cloud** | GPU cloud + inference | | +| 76 | **Nscale** | Inference platform | | +| 77 | **Scaleway** | European cloud AI | | +| 78 | **OVHCloud AI** | European cloud AI | | +| 79 | **Heroku AI** | PaaS AI add-on | | +| 80 | **Sarvam.ai** | Indian AI models | | + +### Tier 6: Self-Hosted & Local Inference +| # | Provider | Key Product | Notes | +|---|----------|-------------|-------| +| 81 | **Ollama** | Local LLM runner | No API key needed | +| 82 | **LM Studio** | Desktop LLM | No API key needed | +| 83 | **vLLM** | Inference engine | Self-hosted | +| 84 | **Llamafile** | Single-file LLM | Self-hosted | +| 85 | **Xinference** | Inference platform | Self-hosted | +| 86 | **Triton Inference Server** | NVIDIA serving | Self-hosted | +| 87 | **LlamaGate** | Gateway | Self-hosted | +| 88 | **Docker Model Runner** | Container inference | Self-hosted | + +### Tier 7: Aggregators, Gateways & Middleware +| # | Provider | Key 
Product | Notes | +|---|----------|-------------|-------| +| 89 | **LiteLLM** | AI gateway (107 providers) | Open-source | +| 90 | **Portkey** | AI gateway | Observability | +| 91 | **Helicone** | LLM observability | Proxy-based | +| 92 | **Bifrost** | AI gateway (Go) | Fastest gateway | +| 93 | **Kong AI Gateway** | API management | Enterprise | +| 94 | **Vercel AI Gateway** | Edge AI | | +| 95 | **Cloudflare AI Gateway** | Edge AI | | +| 96 | **Agenta** | LLM ops platform | | +| 97 | **Straico** | Multi-model | | +| 98 | **AI302** | Gateway | | +| 99 | **AIHubMix** | Gateway | | +| 100 | **Zenmux** | Gateway | | +| 101 | **Poe** | Multi-model chat | Quora | +| 102 | **Gitee AI** | Chinese GitHub AI | | +| 103 | **GitHub Models** | GitHub-hosted inference | | +| 104 | **GitHub Copilot** | Code completion | | +| 105 | **ModelScope** | Chinese model hub | Alibaba | +| 106 | **Voyage AI** | Embeddings | | +| 107 | **Jina AI** | Embeddings + search | | +| 108 | **Deepgram** | Speech-to-text | | +| 109 | **ElevenLabs** | Text-to-speech | | +| 110 | **Black Forest Labs** | Image generation (FLUX) | | +| 111 | **Fal AI** | Image/video generation | | +| 112 | **RunwayML** | Video generation | | +| 113 | **Recraft** | Image generation | | +| 114 | **DataRobot** | ML platform | | +| 115 | **Weights & Biases** | ML ops + inference | | +| 116 | **CompactifAI** | Model compression | | +| 117 | **GradientAI** | Fine-tuning | | +| 118 | **Topaz** | AI platform | | +| 119 | **Synthetic** | Data generation | | +| 120 | **Infiniai** | Inference | | +| 121 | **Higress** | AI gateway | Alibaba | +| 122 | **PPIO** | Inference | | +| 123 | **Qiniu** | Chinese cloud AI | | +| 124 | **NanoGPT** | Lightweight inference | | +| 125 | **Morph** | AI platform | | +| 126 | **Milvus** | Vector DB + AI | | +| 127 | **XiaoMi MiMo** | Xiaomi AI | | +| 128 | **Petals** | Distributed inference | | +| 129 | **ZeroOne** | AI platform | | +| 130 | **Lemonade** | AI platform | | +| 131 | **Taichu** | 
Chinese AI | | +| 132 | **Amazon Nova** | AWS native models | | + +--- + +## 4. API Key Patterns by Provider + +### 4.1 Confirmed Key Prefixes & Formats + +| Provider | Prefix | Regex Pattern | Confidence | +|----------|--------|---------------|------------| +| **OpenAI (legacy)** | `sk-` | `sk-[a-zA-Z0-9]{48}` | High | +| **OpenAI (project)** | `sk-proj-` | `sk-proj-[a-zA-Z0-9_-]{80,}` | High | +| **OpenAI (service account)** | `sk-svcacct-` | `sk-svcacct-[a-zA-Z0-9_-]{80,}` | High | +| **OpenAI (legacy user)** | `sk-None-` | `sk-None-[a-zA-Z0-9_-]{80,}` | High | +| **Anthropic (API)** | `sk-ant-api03-` | `sk-ant-api03-[a-zA-Z0-9_\-]{93}AA` | High | +| **Anthropic (Admin)** | `sk-ant-admin01-` | `sk-ant-admin01-[a-zA-Z0-9_\-]{93}AA` | High | +| **Google AI / Gemini** | `AIza` | `AIza[0-9A-Za-z\-_]{35}` | High | +| **HuggingFace (user)** | `hf_` | `hf_[a-zA-Z]{34}` | High | +| **HuggingFace (org)** | `api_org_` | `api_org_[a-zA-Z]{34}` | High | +| **Groq** | `gsk_` | `gsk_[a-zA-Z0-9]{48,}` | High | +| **Replicate** | `r8_` | `r8_[a-zA-Z0-9]{40}` | High | +| **Fireworks AI** | `fw_` | `fw_[a-zA-Z0-9_-]{40,}` | Medium | +| **Perplexity** | `pplx-` | `pplx-[a-zA-Z0-9]{48}` | High | +| **AWS (general)** | `AKIA` | `AKIA[0-9A-Z]{16}` | High | +| **GitHub PAT** | `ghp_` | `ghp_[a-zA-Z0-9]{36}` | High | +| **Stripe (secret)** | `sk_live_` | `sk_live_[0-9a-zA-Z]{24}` | High | + +### 4.2 Providers with No Known Distinct Prefix + +These providers use generic-looking API keys without distinguishing prefixes, making detection harder: + +| Provider | Key Format | Detection Approach | +|----------|-----------|-------------------| +| **Mistral AI** | Generic alphanumeric | Keyword-based (`MISTRAL_API_KEY`) | +| **Cohere** | Generic alphanumeric | Keyword-based (`COHERE_API_KEY`, `CO_API_KEY`) | +| **Together AI** | Generic alphanumeric | Keyword-based | +| **DeepSeek** | `sk-` prefix (same as OpenAI legacy) | Keyword context needed | +| **Azure OpenAI** | 32-char hex | 
Keyword-based | +| **Stability AI** | `sk-` prefix | Keyword context needed | +| **AI21** | Generic alphanumeric | Keyword-based | +| **Cerebras** | Generic alphanumeric | Keyword-based | +| **SambaNova** | Generic alphanumeric | Keyword-based | + +### 4.3 Detection Difficulty Tiers + +**Easy (unique prefix):** OpenAI (sk-proj-, sk-svcacct-), Anthropic (sk-ant-), HuggingFace (hf_), Groq (gsk_), Replicate (r8_), Perplexity (pplx-), AWS (AKIA) + +**Medium (shared or short prefix):** OpenAI legacy (sk-), DeepSeek (sk-), Stability (sk-), Fireworks (fw_), Google (AIza) + +**Hard (no prefix, keyword-only):** Mistral, Cohere, Together AI, Azure OpenAI, AI21, Cerebras, most Chinese providers + +--- + +## 5. Key Validation Approaches + +### 5.1 Common Validation Endpoints + +| Provider | Validation Method | Endpoint | Cost | +|----------|-------------------|----------|------| +| **OpenAI** | List models | `GET /v1/models` | Free (no tokens consumed) | +| **Anthropic** | Send minimal message | `POST /v1/messages` (tiny prompt) | Minimal cost (~1 token) | +| **Google Gemini** | List models | `GET /v1/models` | Free | +| **Cohere** | Token check | `POST /v1/tokenize` or `/v1/generate` | Minimal | +| **HuggingFace** | Whoami | `GET /api/whoami` | Free | +| **Groq** | List models | `GET /v1/models` | Free | +| **Replicate** | Get account | `GET /v1/account` | Free | +| **Mistral** | List models | `GET /v1/models` | Free | +| **AWS** | STS GetCallerIdentity | `POST sts.amazonaws.com` | Free | +| **Azure OpenAI** | List deployments | `GET /openai/deployments` | Free | + +### 5.2 Validation Strategy Patterns + +1. **Passive detection (regex only):** Fastest, highest false positive rate. Used by Gitleaks, detect-secrets baseline mode. + +2. **Passive + entropy:** Combines regex with entropy scoring. Reduces false positives for generic patterns. Used by detect-secrets with entropy plugins. + +3. **Active verification (API call):** Makes lightweight API call to confirm key is live. 
Used by TruffleHog, GitHub secret scanning. Eliminates false positives but requires network access. + +4. **Deep analysis (permission enumeration):** Beyond verification, enumerates what the key can access. Used by TruffleHog for ~20 credential types. Most actionable but slowest. + +### 5.3 How Existing Tools Validate + +| Tool | Passive | Entropy | Active Verification | Permission Analysis | +|------|:-------:|:-------:|:-------------------:|:-------------------:| +| TruffleHog | Yes | No | Yes (800+ detectors) | Yes (~20 types) | +| Gitleaks | Yes | Optional | No | No | +| detect-secrets | Yes | Yes | Limited | No | +| Nosey Parker | Yes | ML-based | No | No | +| GitGuardian | Yes | Yes | Yes (selected) | Limited | +| GitHub Scanning | Yes | AI-based | Yes (selected) | No | +| SecurityWall | Yes | No | Generates curl cmds | No | +| KeyHacks | No | No | Manual curl cmds | Limited | + +--- + +## 6. Market Gaps & Opportunities + +### 6.1 Underserved Areas + +1. **LLM-specific comprehensive scanner:** No tool covers all 50+ LLM API providers with both detection and validation. + +2. **New key format coverage:** OpenAI's `sk-proj-` and `sk-svcacct-` formats are recent; many scanners only detect legacy `sk-` format. Gitleaks only added these in late 2025 via PR #1780. + +3. **Chinese/regional provider detection:** Almost zero coverage for Qwen, Baichuan, Zhipu, Moonshot, Yi, ERNIE, Doubao API keys in any scanner. + +4. **Key metadata extraction:** No tool extracts org, project, rate limits, or spend from detected LLM keys. + +5. **Agentic AI context:** With AI agents increasingly using API keys, there's a growing need for scanners that understand multi-key configurations (e.g., an agent with OpenAI + Anthropic + Serp API keys). + +6. **Vibe coding exposure:** VibeFactory's scanner addresses the problem of API keys exposed in frontend JavaScript by vibe-coded apps, but this is still nascent. 
+ +### 6.2 Scale of the Problem + +- **28 million credentials leaked on GitHub in 2025** (Snyk) +- **1,275,105 leaked AI service secrets in 2025** (GitGuardian), up 81% YoY +- **8 of 10 fastest-growing leaked secret categories are AI-related** (GitGuardian) +- Fastest growing: Brave Search API (+1,255%), Firecrawl (+796%), Supabase (+992%) +- Groq API keys alone are leaked at a rate of **42.28 per million commits** (GitGuardian) + +### 6.3 Competitive Landscape Summary + +``` + Verification Depth + | + TruffleHog | ████████████████ (800+ detectors, deep analysis) + GitGuardian | ████████████ (450+ detectors, commercial) + GitHub | ██████████ (AI-powered, platform-locked) + Gitleaks | ████ (150+ regex, no verification) + detect-sec | ███ (27 plugins, baseline approach) + NoseyParker | ██ (188 rules, ML denoising, retired) + | + +------ LLM Provider Coverage ------> + + None of these tools provide >15 LLM provider detectors. + The market opportunity is a scanner focused on 50-100+ LLM providers + with active verification, permission analysis, and cost estimation. 
+``` + +--- + +## Sources + +### Open-Source Scanner Tools +- [TruffleHog - GitHub](https://github.com/trufflesecurity/trufflehog) +- [TruffleHog Detectors](https://trufflesecurity.com/detectors) +- [Gitleaks - GitHub](https://github.com/gitleaks/gitleaks) +- [Gitleaks Config (gitleaks.toml)](https://github.com/gitleaks/gitleaks/blob/master/config/gitleaks.toml) +- [detect-secrets - GitHub](https://github.com/Yelp/detect-secrets) +- [Nosey Parker - GitHub](https://github.com/praetorian-inc/noseyparker) +- [KeyHacks - GitHub](https://github.com/streaak/keyhacks) +- [Secrets Patterns DB - GitHub](https://github.com/mazen160/secrets-patterns-db) +- [regextokens - GitHub](https://github.com/odomojuli/regextokens) +- [Betterleaks - Gitleaks Successor](https://www.aikido.dev/blog/betterleaks-gitleaks-successor) + +### Comparison & Analysis +- [TruffleHog vs Gitleaks Comparison (Jit)](https://www.jit.io/resources/appsec-tools/trufflehog-vs-gitleaks-a-detailed-comparison-of-secret-scanning-tools) +- [Best Secret Scanning Tools 2025 (Aikido)](https://www.aikido.dev/blog/top-secret-scanning-tools) +- [8 Best Secret Scanning Tools 2026 (AppSec Santa)](https://appsecsanta.com/sast-tools/secret-scanning-tools) +- [Secret Scanning Tools 2026 (GitGuardian)](https://blog.gitguardian.com/secret-scanning-tools/) + +### API Key Patterns & Validation +- [OpenAI API Key Format Discussion](https://community.openai.com/t/regex-s-to-validate-api-key-and-org-id-format/44619) +- [OpenAI sk-proj Key Format](https://community.openai.com/t/how-to-create-an-api-secret-key-with-prefix-sk-only-always-creates-sk-proj-keys/1263531) +- [Gitleaks OpenAI Regex PR #1780](https://github.com/gitleaks/gitleaks/pull/1780) +- [GitHub Leaked API Keys Patterns](https://gist.github.com/win3zz/0a1c70589fcbea64dba4588b93095855) +- [GitGuardian Groq API Key Detector](https://docs.gitguardian.com/secrets-detection/secrets-detection-engine/detectors/specifics/groq_api_key) + +### LLM Key Validation Tools +- 
[TestMyAPIKey.com](https://www.testmyapikey.com/) +- [SecurityWall API Key Checker](https://securitywall.co/tools/api-key-checker) +- [VibeFactory API Key Scanner](https://vibefactory.ai/api-key-security-scanner) +- [KeyLeak Detector - GitHub](https://github.com/Amal-David/keyleak-detector) + +### LLM Provider Lists +- [LiteLLM Providers (107)](https://docs.litellm.ai/docs/providers) +- [Langbase Supported Providers](https://langbase.com/docs/supported-models-and-providers) +- [LLM-Interface API Keys Doc](https://github.com/samestrin/llm-interface/blob/main/docs/api-keys.md) +- [Artificial Analysis Provider Leaderboard](https://artificialanalysis.ai/leaderboards/providers) +- [Top LLM API Providers 2026 (Future AGI)](https://futureagi.substack.com/p/top-11-llm-api-providers-in-2026) + +### GitHub Secret Scanning +- [GitHub Supported Secret Scanning Patterns](https://docs.github.com/en/code-security/secret-scanning/introduction/supported-secret-scanning-patterns) +- [GitHub Adds 37 New Detectors (March 2026)](https://devops.com/github-adds-37-new-secret-detectors-in-march-extends-scanning-to-ai-coding-agents/) +- [GitHub Secret Scanning Coverage Update](https://github.blog/changelog/2026-03-31-github-secret-scanning-nine-new-types-and-more/) + +### Market Data +- [State of Secrets Sprawl 2026 (GitGuardian/Hacker News)](https://thehackernews.com/2026/03/the-state-of-secrets-sprawl-2026-9.html) +- [Why 28M Credentials Leaked on GitHub in 2025 (Snyk)](https://snyk.io/articles/state-of-secrets/) +- [GitGuardian AI Security](https://www.gitguardian.com/agentic-ai-security) diff --git a/docs/superpowers/specs/2026-04-04-keyhunter-design.md b/docs/superpowers/specs/2026-04-04-keyhunter-design.md new file mode 100644 index 0000000..392a9e1 --- /dev/null +++ b/docs/superpowers/specs/2026-04-04-keyhunter-design.md @@ -0,0 +1,556 @@ +# KeyHunter - Design Specification + +## Overview + +KeyHunter is a comprehensive, modular API key scanner built in Go, focused on detecting and 
validating API keys from 100+ LLM/AI providers. It combines native scanning capabilities with external tool integration (TruffleHog, Gitleaks), OSINT/recon modules, a web dashboard, and Telegram bot notifications. + +## Architecture + +**Approach:** Plugin-based architecture. Core scanner engine with providers defined as YAML files (compile-time embedded). Single binary distribution. + +### Directory Structure + +``` +keyhunter/ +├── cmd/keyhunter/ # CLI entrypoint (cobra) +├── pkg/ +│ ├── engine/ # Core scanning engine +│ │ ├── scanner.go # Orchestrator - takes input, runs providers +│ │ ├── matcher.go # Regex + entropy matching +│ │ └── verifier.go # Active key verification (--verify flag) +│ ├── provider/ # Provider registry & loader +│ │ ├── registry.go # Loads and manages providers +│ │ ├── types.go # Provider interface definitions +│ │ └── builtin/ # Compile-time embedded provider YAMLs +│ ├── input/ # Input source adapters +│ │ ├── file.go # File/directory scanning +│ │ ├── git.go # Git history/diff scanning +│ │ ├── stdin.go # Pipe/stdin support +│ │ ├── url.go # URL fetch +│ │ └── remote.go # GitHub/GitLab API, paste sites +│ ├── output/ # Output formatters +│ │ ├── table.go # Colored terminal table +│ │ ├── json.go # JSON export +│ │ ├── sarif.go # SARIF (CI/CD compatible) +│ │ └── csv.go # CSV export +│ ├── adapter/ # External tool parsers +│ │ ├── trufflehog.go # TruffleHog JSON output parser +│ │ └── gitleaks.go # Gitleaks JSON output parser +│ ├── recon/ # OSINT/Recon engine (80+ sources) +│ │ ├── engine.go # Recon orchestrator +│ │ ├── ratelimit.go # Rate limiting & politeness +│ │ │ +│ │ │ # --- IoT & Internet Search Engines --- +│ │ ├── shodan.go # Shodan API client +│ │ ├── censys.go # Censys API client +│ │ ├── zoomeye.go # ZoomEye (Chinese IoT scanner) +│ │ ├── fofa.go # FOFA (Chinese IoT scanner) +│ │ ├── netlas.go # Netlas.io (HTTP body search) +│ │ ├── binaryedge.go # BinaryEdge scanner +│ │ │ +│ │ │ # --- Code Hosting & Snippets --- +│ 
│ ├── github.go # GitHub code search / dorks +│ │ ├── gitlab.go # GitLab search +│ │ ├── gist.go # GitHub Gist search +│ │ ├── bitbucket.go # Bitbucket code search +│ │ ├── codeberg.go # Codeberg/Gitea search +│ │ ├── gitea.go # Self-hosted Gitea instances +│ │ ├── replit.go # Replit public repls +│ │ ├── codesandbox.go # CodeSandbox projects +│ │ ├── stackblitz.go # StackBlitz projects +│ │ ├── codepen.go # CodePen pens +│ │ ├── jsfiddle.go # JSFiddle snippets +│ │ ├── glitch.go # Glitch public projects +│ │ ├── observable.go # Observable notebooks +│ │ ├── huggingface.go # HuggingFace Spaces/repos +│ │ ├── kaggle.go # Kaggle notebooks/datasets +│ │ ├── jupyter.go # nbviewer / Jupyter notebooks +│ │ ├── gitpod.go # Gitpod workspace snapshots +│ │ │ +│ │ │ # --- Search Engine Dorking --- +│ │ ├── google.go # Google Custom Search / SerpAPI dorking +│ │ ├── bing.go # Bing Web Search API dorking +│ │ ├── duckduckgo.go # DuckDuckGo search +│ │ ├── yandex.go # Yandex XML Search +│ │ ├── brave.go # Brave Search API +│ │ │ +│ │ │ # --- Paste Sites --- +│ │ ├── paste.go # Multi-paste aggregator (pastebin, dpaste, paste.ee, rentry, hastebin, ix.io, etc.) 
+│ │ │ +│ │ │ # --- Package Registries --- +│ │ ├── npm.go # npm registry scanning +│ │ ├── pypi.go # PyPI package scanning +│ │ ├── rubygems.go # RubyGems scanning +│ │ ├── crates.go # crates.io (Rust) +│ │ ├── maven.go # Maven Central (Java) +│ │ ├── nuget.go # NuGet (.NET) +│ │ ├── packagist.go # Packagist (PHP) +│ │ ├── goproxy.go # Go module proxy +│ │ │ +│ │ │ # --- Container & Infra --- +│ │ ├── docker.go # Docker Hub image/layer scanning +│ │ ├── kubernetes.go # Exposed K8s dashboards & configs +│ │ ├── terraform.go # Terraform state files & registry +│ │ ├── helm.go # Artifact Hub / Helm charts +│ │ ├── ansible.go # Ansible Galaxy collections +│ │ │ +│ │ │ # --- Cloud Storage --- +│ │ ├── s3.go # AWS S3 bucket enumeration +│ │ ├── gcs.go # Google Cloud Storage buckets +│ │ ├── azureblob.go # Azure Blob Storage +│ │ ├── spaces.go # DigitalOcean Spaces +│ │ ├── backblaze.go # Backblaze B2 +│ │ ├── minio.go # Self-hosted MinIO instances +│ │ ├── grayhat.go # GrayHatWarfare (bucket search engine) +│ │ │ +│ │ │ # --- CI/CD Log Leaks --- +│ │ ├── travisci.go # Travis CI public build logs +│ │ ├── circleci.go # CircleCI build logs +│ │ ├── ghactions.go # GitHub Actions workflow logs +│ │ ├── jenkins.go # Exposed Jenkins instances +│ │ ├── gitlabci.go # GitLab CI/CD pipeline logs +│ │ │ +│ │ │ # --- Web Archives --- +│ │ ├── wayback.go # Wayback Machine CDX API +│ │ ├── commoncrawl.go # CommonCrawl index & WARC +│ │ │ +│ │ │ # --- Forums & Documentation --- +│ │ ├── stackoverflow.go # Stack Overflow / Stack Exchange API +│ │ ├── reddit.go # Reddit search +│ │ ├── hackernews.go # HN Algolia API +│ │ ├── devto.go # dev.to articles +│ │ ├── medium.go # Medium articles +│ │ ├── telegram_recon.go # Telegram public channels +│ │ ├── discord.go # Discord indexed content +│ │ │ +│ │ │ # --- Collaboration Tools --- +│ │ ├── notion.go # Notion public pages +│ │ ├── confluence.go # Confluence public spaces +│ │ ├── trello.go # Trello public boards +│ │ ├── googledocs.go # 
Google Docs/Sheets public +│ │ │ +│ │ │ # --- Frontend & JS Leaks --- +│ │ ├── sourcemaps.go # JS source map extraction +│ │ ├── webpack.go # Webpack/Vite bundle scanning +│ │ ├── dotenv_web.go # Exposed .env files on web servers +│ │ ├── swagger.go # Exposed Swagger/OpenAPI docs +│ │ ├── deploys.go # Vercel/Netlify preview deployments +│ │ │ +│ │ │ # --- Log Aggregators --- +│ │ ├── elasticsearch.go # Exposed Elasticsearch/Kibana +│ │ ├── grafana.go # Exposed Grafana dashboards +│ │ ├── sentry.go # Exposed Sentry instances +│ │ │ +│ │ │ # --- Threat Intelligence --- +│ │ ├── virustotal.go # VirusTotal file/URL search +│ │ ├── intelx.go # Intelligence X aggregated search +│ │ ├── urlhaus.go # URLhaus abuse.ch +│ │ │ +│ │ │ # --- Mobile Apps --- +│ │ ├── apk.go # APK download & decompile scanning +│ │ │ +│ │ │ # --- DNS/Subdomain --- +│ │ ├── crtsh.go # Certificate Transparency (crt.sh) +│ │ ├── subdomain.go # Subdomain config endpoint probing +│ │ │ +│ │ │ # --- API Marketplaces --- +│ │ ├── postman.go # Postman public collections/workspaces +│ │ ├── swaggerhub.go # SwaggerHub published APIs +│ │ └── rapidapi.go # RapidAPI public endpoints +│ │ +│ ├── dorks/ # Dork management +│ │ ├── loader.go # YAML dork loader +│ │ ├── runner.go # Dork execution engine +│ │ └── builtin/ # Embedded dork YAMLs +│ ├── notify/ # Notification modules +│ │ ├── telegram.go # Telegram bot +│ │ ├── webhook.go # Generic webhook +│ │ └── slack.go # Slack +│ └── web/ # Web dashboard +│ ├── server.go # Embedded HTTP server +│ ├── api.go # REST API +│ └── static/ # Frontend assets (htmx + tailwind) +├── providers/ # Provider YAML definitions (embedded) +│ ├── openai.yaml +│ ├── anthropic.yaml +│ └── ... 
(108 providers) +├── dorks/ # Dork YAML definitions (embedded) +│ ├── github.yaml # GitHub code search dorks +│ ├── gitlab.yaml # GitLab search dorks +│ ├── shodan.yaml # Shodan IoT dorks +│ ├── censys.yaml # Censys dorks +│ ├── zoomeye.yaml # ZoomEye dorks +│ ├── fofa.yaml # FOFA dorks +│ ├── google.yaml # Google dorking queries +│ ├── bing.yaml # Bing dorking queries +│ └── generic.yaml # Multi-source keyword dorks +├── configs/ # Example config files +└── docs/ +``` + +### Data Flow + +``` +Input Source -> Scanner Engine -> Provider Matcher -> (optional) Verifier -> Output Formatter + Notifier + -> SQLite DB (persist) + -> Web Dashboard (serve) +``` + +## Provider YAML Schema + +```yaml +id: string # Unique provider ID +name: string # Display name +category: enum # frontier | mid-tier | emerging | chinese | infrastructure | gateway | self-hosted +website: string # API base URL +confidence: enum # high | medium | low + +patterns: + - id: string # Unique pattern ID + name: string # Human-readable name + regex: string # Detection regex + confidence: enum # high | medium | low + description: string # Pattern description + +keywords: []string # Pre-filtering keywords (performance optimization) + +verify: + enabled: bool + method: string # HTTP method + url: string # Verification endpoint + headers: map # Headers with {{key}} template + success_codes: []int + failure_codes: []int + extract: # Additional info extraction on success + - field: string + path: string # JSON path + +metadata: + docs: string # API docs URL + key_url: string # Key management URL + env_vars: []string # Common environment variable names + revoke_url: string # Key revocation URL +``` + +## CLI Command Structure + +### Core Commands + +```bash +# Scanning +keyhunter scan path