merge: phase 14-03 frontend leaks
This commit is contained in:
1
.claude/worktrees/agent-a090b6ec
Submodule
1
.claude/worktrees/agent-a090b6ec
Submodule
Submodule .claude/worktrees/agent-a090b6ec added at a75d81a8d6
1
.claude/worktrees/agent-a11dddbd
Submodule
1
.claude/worktrees/agent-a11dddbd
Submodule
Submodule .claude/worktrees/agent-a11dddbd added at 8d97b263ec
1
.claude/worktrees/agent-a19eb2f7
Submodule
1
.claude/worktrees/agent-a19eb2f7
Submodule
Submodule .claude/worktrees/agent-a19eb2f7 added at d98513bf55
1
.claude/worktrees/agent-a1a93bb2
Submodule
1
.claude/worktrees/agent-a1a93bb2
Submodule
Submodule .claude/worktrees/agent-a1a93bb2 added at 6ab411cda2
Submodule .claude/worktrees/agent-a1ab7cd2/.claude/worktrees/agent-a30fab90/.claude/worktrees/agent-a3b639bf/.claude/worktrees/agent-a9511329/.claude/worktrees/agent-aed10f3e/.claude/worktrees/agent-a44a25be added at 0ff9edc6c1
1
.claude/worktrees/agent-a2637f83
Submodule
1
.claude/worktrees/agent-a2637f83
Submodule
Submodule .claude/worktrees/agent-a2637f83 added at 3d3c57fff2
1
.claude/worktrees/agent-a27c3406
Submodule
1
.claude/worktrees/agent-a27c3406
Submodule
Submodule .claude/worktrees/agent-a27c3406 added at 61a9d527ee
1
.claude/worktrees/agent-a2e54e09
Submodule
1
.claude/worktrees/agent-a2e54e09
Submodule
Submodule .claude/worktrees/agent-a2e54e09 added at d0396bb384
1
.claude/worktrees/agent-a2fe7ff3
Submodule
1
.claude/worktrees/agent-a2fe7ff3
Submodule
Submodule .claude/worktrees/agent-a2fe7ff3 added at 223c23e672
Submodule .claude/worktrees/agent-a309b50b/.claude/worktrees/agent-a1113d5a added at 1013caf843
Submodule .claude/worktrees/agent-a309b50b/.claude/worktrees/agent-ad901ba0 added at abfc2f8319
Submodule .claude/worktrees/agent-a309b50b/.claude/worktrees/agent-adad8c10 added at 95ee768266
1
.claude/worktrees/agent-a5bf4f07
Submodule
1
.claude/worktrees/agent-a5bf4f07
Submodule
Submodule .claude/worktrees/agent-a5bf4f07 added at 43aeb8985d
1
.claude/worktrees/agent-a5d8d812
Submodule
1
.claude/worktrees/agent-a5d8d812
Submodule
Submodule .claude/worktrees/agent-a5d8d812 added at 6303308207
1
.claude/worktrees/agent-a6700ee2
Submodule
1
.claude/worktrees/agent-a6700ee2
Submodule
Submodule .claude/worktrees/agent-a6700ee2 added at d8a54f2c16
1
.claude/worktrees/agent-a7f84823
Submodule
1
.claude/worktrees/agent-a7f84823
Submodule
Submodule .claude/worktrees/agent-a7f84823 added at 21d5551aa4
1
.claude/worktrees/agent-abce7711
Submodule
1
.claude/worktrees/agent-abce7711
Submodule
Submodule .claude/worktrees/agent-abce7711 added at c595fef148
1
.claude/worktrees/agent-ac81d6ab
Submodule
1
.claude/worktrees/agent-ac81d6ab
Submodule
Submodule .claude/worktrees/agent-ac81d6ab added at cae714b488
1
.claude/worktrees/agent-ad7ef8d3
Submodule
1
.claude/worktrees/agent-ad7ef8d3
Submodule
Submodule .claude/worktrees/agent-ad7ef8d3 added at 792ac8d54b
Submodule .claude/worktrees/agent-ae6d1042/.claude/worktrees/agent-a0a11e9a added at a639cdea02
1
.claude/worktrees/agent-aefa9208
Submodule
1
.claude/worktrees/agent-aefa9208
Submodule
Submodule .claude/worktrees/agent-aefa9208 added at a2347f150a
@@ -173,11 +173,11 @@ Requirements for initial release. Each maps to roadmap phases.
|
||||
|
||||
### OSINT/Recon — Frontend & JS Leaks
|
||||
|
||||
- [ ] **RECON-JS-01**: JavaScript source map extraction and scanning
|
||||
- [ ] **RECON-JS-02**: Webpack/Vite bundle scanning for inlined env vars
|
||||
- [ ] **RECON-JS-03**: Exposed .env file scanning on web servers
|
||||
- [ ] **RECON-JS-04**: Exposed Swagger/OpenAPI documentation scanning
|
||||
- [ ] **RECON-JS-05**: Vercel/Netlify deploy preview JS bundle scanning
|
||||
- [x] **RECON-JS-01**: JavaScript source map extraction and scanning
|
||||
- [x] **RECON-JS-02**: Webpack/Vite bundle scanning for inlined env vars
|
||||
- [x] **RECON-JS-03**: Exposed .env file scanning on web servers
|
||||
- [x] **RECON-JS-04**: Exposed Swagger/OpenAPI documentation scanning
|
||||
- [x] **RECON-JS-05**: Vercel/Netlify deploy preview JS bundle scanning
|
||||
|
||||
### OSINT/Recon — Log Aggregators
|
||||
|
||||
|
||||
@@ -0,0 +1,152 @@
|
||||
---
|
||||
phase: 14-osint_ci_cd_logs_web_archives_frontend_leaks
|
||||
plan: 03
|
||||
subsystem: recon
|
||||
tags: [sourcemaps, webpack, dotenv, swagger, openapi, vercel, netlify, frontend-leaks]
|
||||
|
||||
requires:
|
||||
- phase: 10-osint-code-hosting
|
||||
provides: "ReconSource interface, Client, BuildQueries, LimiterRegistry patterns"
|
||||
- phase: 13-osint-package-registries
|
||||
provides: "RegisterAll with 40 sources baseline"
|
||||
provides:
|
||||
- "SourceMapSource for probing .map files for original source with API keys"
|
||||
- "WebpackSource for scanning JS bundles for inlined env vars"
|
||||
- "EnvLeakSource for detecting exposed .env files on web servers"
|
||||
- "SwaggerSource for finding API keys in OpenAPI example/default fields"
|
||||
- "DeployPreviewSource for scanning Vercel/Netlify previews for leaked env vars"
|
||||
- "RegisterAll extended to 45 sources"
|
||||
affects: [14-04, 14-05, 15, 16]
|
||||
|
||||
tech-stack:
|
||||
added: []
|
||||
patterns: ["Multi-path probing pattern for credentialless web asset scanning"]
|
||||
|
||||
key-files:
|
||||
created:
|
||||
- pkg/recon/sources/sourcemap.go
|
||||
- pkg/recon/sources/sourcemap_test.go
|
||||
- pkg/recon/sources/webpack.go
|
||||
- pkg/recon/sources/webpack_test.go
|
||||
- pkg/recon/sources/envleak.go
|
||||
- pkg/recon/sources/envleak_test.go
|
||||
- pkg/recon/sources/swagger.go
|
||||
- pkg/recon/sources/swagger_test.go
|
||||
- pkg/recon/sources/deploypreview.go
|
||||
- pkg/recon/sources/deploypreview_test.go
|
||||
modified:
|
||||
- pkg/recon/sources/register.go
|
||||
- pkg/recon/sources/register_test.go
|
||||
- pkg/recon/sources/integration_test.go
|
||||
|
||||
key-decisions:
|
||||
- "Multi-path probing: each source probes multiple common paths per query rather than single endpoint"
|
||||
- "Nil Limiters in tests: skip rate limiting in httptest to keep tests fast (<1s)"
|
||||
- "RegisterAll extended to 45 sources (40 Phase 10-13 + 5 Phase 14 frontend leak sources)"
|
||||
|
||||
patterns-established:
|
||||
- "Multi-path probing pattern: sources that probe multiple common URL paths per domain/query hint"
|
||||
- "Regex-based content scanning: compile-time regex patterns for detecting secrets in response bodies"
|
||||
|
||||
requirements-completed: [RECON-JS-01, RECON-JS-02, RECON-JS-03, RECON-JS-04, RECON-JS-05]
|
||||
|
||||
duration: 5min
|
||||
completed: 2026-04-06
|
||||
---
|
||||
|
||||
# Phase 14 Plan 03: Frontend Leak Sources Summary
|
||||
|
||||
**Five credentialless frontend leak scanners: source maps, webpack bundles, exposed .env files, Swagger docs, and deploy preview environments**
|
||||
|
||||
## Performance
|
||||
|
||||
- **Duration:** 5 min
|
||||
- **Started:** 2026-04-06T10:13:15Z
|
||||
- **Completed:** 2026-04-06T10:18:15Z
|
||||
- **Tasks:** 2
|
||||
- **Files modified:** 13
|
||||
|
||||
## Accomplishments
|
||||
- SourceMapSource probes 7 common .map paths, parses JSON sourcesContent for API key patterns
|
||||
- WebpackSource scans JS bundles for NEXT_PUBLIC_/REACT_APP_/VITE_ prefixed env var leaks
|
||||
- EnvLeakSource probes 8 common .env paths with multiline regex matching for secret key=value lines
|
||||
- SwaggerSource parses OpenAPI JSON docs for API keys in example/default fields
|
||||
- DeployPreviewSource scans Vercel/Netlify preview URLs for __NEXT_DATA__ and env var patterns
|
||||
- RegisterAll extended from 40 to 45 sources
|
||||
|
||||
## Task Commits
|
||||
|
||||
Each task was committed atomically:
|
||||
|
||||
1. **Task 1: SourceMapSource, WebpackSource, EnvLeakSource + tests** - `b57bd5e` (feat)
|
||||
2. **Task 2: SwaggerSource, DeployPreviewSource + tests** - `7d8a418` (feat)
|
||||
3. **RegisterAll wiring** - `0a8be81` (feat)
|
||||
|
||||
## Files Created/Modified
|
||||
- `pkg/recon/sources/sourcemap.go` - Source map file probing and content scanning
|
||||
- `pkg/recon/sources/sourcemap_test.go` - httptest-based tests for source map scanning
|
||||
- `pkg/recon/sources/webpack.go` - Webpack/Vite bundle env var detection
|
||||
- `pkg/recon/sources/webpack_test.go` - httptest-based tests for webpack scanning
|
||||
- `pkg/recon/sources/envleak.go` - Exposed .env file detection
|
||||
- `pkg/recon/sources/envleak_test.go` - httptest-based tests for .env scanning
|
||||
- `pkg/recon/sources/swagger.go` - Swagger/OpenAPI doc API key extraction
|
||||
- `pkg/recon/sources/swagger_test.go` - httptest-based tests for Swagger scanning
|
||||
- `pkg/recon/sources/deploypreview.go` - Vercel/Netlify deploy preview scanning
|
||||
- `pkg/recon/sources/deploypreview_test.go` - httptest-based tests for deploy preview scanning
|
||||
- `pkg/recon/sources/register.go` - Extended RegisterAll to 45 sources
|
||||
- `pkg/recon/sources/register_test.go` - Updated test expectations to 45
|
||||
- `pkg/recon/sources/integration_test.go` - Updated integration test count to 45
|
||||
|
||||
## Decisions Made
|
||||
- Multi-path probing: each source probes multiple common URL paths per query rather than constructing real domain URLs (sources are lead generators)
|
||||
- Nil Limiters in sweep tests: rate limiter adds 3s per path probe making tests take 20+ seconds; skip in unit tests, test rate limiting separately
|
||||
- envKeyValuePattern uses (?im) multiline flag for proper line-anchored matching in .env file content
|
||||
|
||||
## Deviations from Plan
|
||||
|
||||
### Auto-fixed Issues
|
||||
|
||||
**1. [Rule 1 - Bug] Fixed multiline regex in EnvLeakSource**
|
||||
- **Found during:** Task 1 (EnvLeakSource tests)
|
||||
- **Issue:** envKeyValuePattern used ^ anchor without (?m) multiline flag, failing to match lines in multi-line .env content
|
||||
- **Fix:** Added (?m) flag to regex: `(?im)^[A-Z_]*(API[_]?KEY|SECRET|...)`
|
||||
- **Files modified:** pkg/recon/sources/envleak.go
|
||||
- **Verification:** TestEnvLeak_Sweep_ExtractsFindings passes
|
||||
- **Committed in:** b57bd5e (Task 1 commit)
|
||||
|
||||
**2. [Rule 1 - Bug] Removed unused imports in sourcemap.go**
|
||||
- **Found during:** Task 1 (compilation)
|
||||
- **Issue:** "fmt" and "strings" imported but unused
|
||||
- **Fix:** Removed unused imports
|
||||
- **Files modified:** pkg/recon/sources/sourcemap.go
|
||||
- **Committed in:** b57bd5e (Task 1 commit)
|
||||
|
||||
**3. [Rule 2 - Missing Critical] Extended RegisterAll and updated integration tests**
|
||||
- **Found during:** After Task 2 (wiring sources)
|
||||
- **Issue:** New sources needed registration in RegisterAll; existing tests hardcoded 40 source count
|
||||
- **Fix:** Added 5 sources to RegisterAll, updated register_test.go and integration_test.go
|
||||
- **Files modified:** pkg/recon/sources/register.go, register_test.go, integration_test.go
|
||||
- **Committed in:** 0a8be81
|
||||
|
||||
---
|
||||
|
||||
**Total deviations:** 3 auto-fixed (2 bugs, 1 missing critical)
|
||||
**Impact on plan:** All fixes necessary for correctness. No scope creep.
|
||||
|
||||
## Issues Encountered
|
||||
None beyond the auto-fixed deviations above.
|
||||
|
||||
## User Setup Required
|
||||
None - all five sources are credentialless.
|
||||
|
||||
## Known Stubs
|
||||
None - all sources are fully implemented with real scanning logic.
|
||||
|
||||
## Next Phase Readiness
|
||||
- 45 sources now registered in RegisterAll
|
||||
- Frontend leak scanning vectors covered: source maps, webpack bundles, .env files, Swagger docs, deploy previews
|
||||
- Ready for remaining Phase 14 plans (CI/CD log sources, web archive sources)
|
||||
|
||||
---
|
||||
*Phase: 14-osint_ci_cd_logs_web_archives_frontend_leaks*
|
||||
*Completed: 2026-04-06*
|
||||
548
RESEARCH_REPORT.md
Normal file
548
RESEARCH_REPORT.md
Normal file
@@ -0,0 +1,548 @@
|
||||
# API Key Scanner Market Research Report
|
||||
**Date: April 4, 2026**
|
||||
|
||||
---
|
||||
|
||||
## Table of Contents
|
||||
1. [Existing Open-Source API Key Scanners](#1-existing-open-source-api-key-scanners)
|
||||
2. [LLM-Specific API Key Tools](#2-llm-specific-api-key-tools)
|
||||
3. [Top LLM API Providers (100+)](#3-top-llm-api-providers)
|
||||
4. [API Key Patterns by Provider](#4-api-key-patterns-by-provider)
|
||||
5. [Key Validation Approaches](#5-key-validation-approaches)
|
||||
6. [Market Gaps & Opportunities](#6-market-gaps--opportunities)
|
||||
|
||||
---
|
||||
|
||||
## 1. Existing Open-Source API Key Scanners
|
||||
|
||||
### 1.1 TruffleHog
|
||||
- **GitHub:** https://github.com/trufflesecurity/trufflehog
|
||||
- **Stars:** ~25,500
|
||||
- **Language:** Go
|
||||
- **Detectors:** 800+ secret types
|
||||
- **Approach:** Detector-based (each detector is a small Go program for a specific credential type)
|
||||
- **Detection methods:**
|
||||
- Pattern matching via dedicated detectors
|
||||
- Active verification against live APIs
|
||||
- Permission/scope analysis (~20 credential types)
|
||||
- **AI/LLM detectors confirmed:** OpenAI, OpenAI Admin Key, Anthropic
|
||||
- **Scanning sources:** Git repos, GitHub orgs, S3 buckets, GCS, Docker images, Jenkins, Elasticsearch, Postman, Slack, local filesystems
|
||||
- **Key differentiator:** Verification — not just "this looks like a key" but "this is an active key with these permissions"
|
||||
- **Limitations:**
|
||||
- Heavy/slow compared to regex-only scanners
|
||||
- Not all 800+ detectors have verification
|
||||
- LLM provider coverage still incomplete (no confirmed Cohere, Mistral, Groq detectors)
|
||||
|
||||
### 1.2 Gitleaks
|
||||
- **GitHub:** https://github.com/gitleaks/gitleaks
|
||||
- **Stars:** ~25,800
|
||||
- **Language:** Go
|
||||
- **Rules:** 150+ regex patterns in `gitleaks.toml`
|
||||
- **Approach:** Regex pattern matching with optional entropy checks
|
||||
- **Detection methods:**
|
||||
- Regex patterns defined in TOML config
|
||||
- Keyword matching
|
||||
- Entropy thresholds
|
||||
- Allowlists for false positive reduction
|
||||
- **AI/LLM rules confirmed:**
|
||||
- `anthropic-admin-api-key`: `sk-ant-admin01-[a-zA-Z0-9_\-]{93}AA`
|
||||
- `anthropic-api-key`: `sk-ant-api03-[a-zA-Z0-9_\-]{93}AA`
|
||||
- `openai-api-key`: Updated to include `sk-proj-` and `sk-svcacct-` formats
|
||||
- `cohere-api-token`: Keyword-based detection
|
||||
- `huggingface-access-token`: `hf_[a-z]{34}`
|
||||
- `huggingface-organization-api-token`: `api_org_[a-z]{34}`
|
||||
- **Key differentiator:** Fast, simple, excellent as pre-commit hook
|
||||
- **Limitations:**
|
||||
- No active verification of detected keys
|
||||
- Regex-only means higher false positive rate for generic patterns
|
||||
- Limited LLM provider coverage beyond the 5 above
|
||||
- **Note:** Gitleaks creator launched "Betterleaks" in 2026 as a successor built for the agentic era
|
||||
|
||||
### 1.3 detect-secrets (Yelp)
|
||||
- **GitHub:** https://github.com/Yelp/detect-secrets
|
||||
- **Stars:** ~4,300
|
||||
- **Language:** Python
|
||||
- **Plugins:** 27 built-in detectors
|
||||
- **Approach:** Baseline methodology — tracks known secrets and flags new ones
|
||||
- **Detection methods:**
|
||||
- Regex-based plugins (structured secrets)
|
||||
- High entropy string detection (Base64, Hex)
|
||||
- Keyword detection (variable name matching)
|
||||
- Optional ML-based gibberish detector (v1.1+)
|
||||
- **AI/LLM plugins confirmed:**
|
||||
- `OpenAIDetector` plugin exists
|
||||
- No dedicated Anthropic, Cohere, Mistral, or Groq plugins
|
||||
- **Key differentiator:** Baseline approach — only flags NEW secrets, not historical ones; enterprise-friendly
|
||||
- **Limitations:**
|
||||
- Minimal LLM provider coverage
|
||||
- No active verification
|
||||
- Fewer patterns than TruffleHog or Gitleaks
|
||||
- Python-only (slower than Go/Rust alternatives)
|
||||
|
||||
### 1.4 Nosey Parker (Praetorian)
|
||||
- **GitHub:** https://github.com/praetorian-inc/noseyparker
|
||||
- **Stars:** ~2,300
|
||||
- **Language:** Rust
|
||||
- **Rules:** 188 high-precision regex rules
|
||||
- **Approach:** Hybrid regex + ML denoising
|
||||
- **Detection methods:**
|
||||
- 188 tested regex rules tuned for low false positives
|
||||
- ML model for false positive reduction (10-1000x improvement)
|
||||
- Deduplication/grouping of findings
|
||||
- **Performance:** GB/s scanning speeds, tested on 20TB+ datasets
|
||||
- **Key differentiator:** ML-enhanced denoising, extreme performance
|
||||
- **Status:** RETIRED — replaced by Titus (https://github.com/praetorian-inc/titus)
|
||||
- **Limitations:**
|
||||
- No specific LLM provider rules documented
|
||||
- No active verification
|
||||
- Project discontinued
|
||||
|
||||
### 1.5 GitGuardian
|
||||
- **Website:** https://www.gitguardian.com
|
||||
- **Type:** Commercial + free tier for public repos
|
||||
- **Detectors:** 450+ secret types
|
||||
- **Approach:** Regex + AI-powered false positive reduction
|
||||
- **Detection methods:**
|
||||
- Specific prefix-based detectors
|
||||
- Fine-tuned code-LLM for false positive filtering
|
||||
- Validity checking for supported detectors
|
||||
- **AI/LLM coverage:**
|
||||
- Groq API Key (prefixed, with validity check)
|
||||
- OpenAI, Anthropic, HuggingFace (confirmed)
|
||||
- AI-related leaked secrets up 81% YoY in 2025
|
||||
- 1,275,105 leaked AI service secrets detected in 2025
|
||||
- **Key differentiator:** AI-powered false positive reduction, massive scale (scans all public GitHub)
|
||||
- **Limitations:**
|
||||
- Commercial/proprietary for private repos
|
||||
- Regex patterns not publicly disclosed
|
||||
|
||||
### 1.6 GitHub Secret Scanning (Native)
|
||||
- **Type:** Built into GitHub
|
||||
- **Approach:** Provider-partnered pattern matching + Copilot AI
|
||||
- **AI/LLM patterns supported (with push protection and validity status):**
|
||||
|
||||
| Provider | Pattern | Push Protection | Validity Check |
|
||||
|----------|---------|:-:|:-:|
|
||||
| Anthropic | `anthropic_admin_api_key` | Yes | Yes |
|
||||
| Anthropic | `anthropic_api_key` | Yes | Yes |
|
||||
| Anthropic | `anthropic_session_id` | Yes | No |
|
||||
| Cohere | `cohere_api_key` | Yes | No |
|
||||
| DeepSeek | `deepseek_api_key` | No | Yes |
|
||||
| Google | `google_gemini_api_key` | No | No |
|
||||
| Groq | `groq_api_key` | Yes | Yes |
|
||||
| Hugging Face | `hf_org_api_key` | Yes | No |
|
||||
| Hugging Face | `hf_user_access_token` | Yes | Yes |
|
||||
| Mistral AI | `mistral_ai_api_key` | No | No |
|
||||
| OpenAI | `openai_api_key` | Yes | Yes |
|
||||
| Replicate | `replicate_api_token` | Yes | Yes |
|
||||
| xAI | `xai_api_key` | Yes | Yes |
|
||||
| Azure | `azure_openai_key` | Yes | No |
|
||||
|
||||
- **Recent developments (March 2026):**
|
||||
- Added 37 new secret detectors including Langchain
|
||||
- Extended scanning to AI coding agents via MCP
|
||||
- Copilot uses GPT-3.5-Turbo + GPT-4 for unstructured secret detection (94% FP reduction)
|
||||
- Base64-encoded secret detection with push protection
|
||||
|
||||
### 1.7 Other Notable Tools
|
||||
|
||||
| Tool | Stars | Language | Patterns | Key Feature |
|
||||
|------|-------|----------|----------|-------------|
|
||||
| **KeyHacks** (streaak) | 6,100 | Markdown/Shell | 100+ services | Validation curl commands for bug bounty |
|
||||
| **keyhacks.sh** (gwen001) | ~500 | Bash | 50+ | Automated version of KeyHacks |
|
||||
| **Secrets Patterns DB** (mazen160) | 1,400 | YAML/Regex | 1,600+ | Largest open-source regex DB, exports to TruffleHog/Gitleaks format |
|
||||
| **secret-regex-list** (h33tlit) | ~1,000 | Regex | 100+ | Regex patterns for scraping secrets |
|
||||
| **regextokens** (odomojuli) | ~300 | Regex | 50+ | OAuth/API token regex patterns |
|
||||
| **Betterleaks** | New (2026) | Go | — | Gitleaks successor for agentic era |
|
||||
|
||||
---
|
||||
|
||||
## 2. LLM-Specific API Key Tools
|
||||
|
||||
### 2.1 Dedicated LLM Key Validators
|
||||
|
||||
| Tool | URL | Providers | Approach |
|
||||
|------|-----|-----------|----------|
|
||||
| **TestMyAPIKey.com** | testmyapikey.com | OpenAI, Anthropic Claude, + 13 others | Client-side regex + live API validation |
|
||||
| **SecurityWall Checker** | securitywall.co/tools/api-key-checker | 455+ patterns, 350+ services (incl. OpenAI, Anthropic) | Client-side regex, generates curl commands |
|
||||
| **VibeFactory Scanner** | vibefactory.ai/api-key-security-scanner | 150+ types (incl. OpenAI) | Scans deployed websites for exposed keys |
|
||||
| **KeyLeak Detector** | github.com/Amal-David/keyleak-detector | Multiple | Headless browser + network interception |
|
||||
| **OpenAI Key Tester** | trevorfox.com/api-key-tester/openai | OpenAI, Anthropic | Direct API validation |
|
||||
| **Chatbot API Tester** | apikeytester.netlify.app | OpenAI, DeepSeek, OpenRouter | Endpoint validation |
|
||||
| **SecurityToolkits** | securitytoolkits.com/tools/apikey-validator | Multiple | API key/token checker |
|
||||
|
||||
### 2.2 LLM Gateways with Key Validation
|
||||
|
||||
These tools validate keys as part of their proxy/gateway functionality:
|
||||
|
||||
| Tool | Stars | Providers | Validation Approach |
|
||||
|------|-------|-----------|---------------------|
|
||||
| **LiteLLM** | ~18k | 107 providers | AuthenticationError mapping from all providers |
|
||||
| **OpenRouter** | — | 60+ providers, 500+ models | Unified API key, provider-level validation |
|
||||
| **Portkey AI** | ~5k | 30+ providers | AI gateway with key validation |
|
||||
| **LLM-API-Key-Proxy** | ~200 | OpenAI, Anthropic compatible | Self-hosted proxy with key validation |
|
||||
|
||||
### 2.3 Key Gap: No Comprehensive LLM-Focused Scanner
|
||||
|
||||
**Critical finding:** There is NO dedicated open-source tool that:
|
||||
1. Detects API keys from all major LLM providers (50+)
|
||||
2. Validates them against live APIs
|
||||
3. Reports provider, model access, rate limits, and spend
|
||||
4. Covers both legacy and new key formats
|
||||
|
||||
The closest tools are:
|
||||
- TruffleHog (broadest verification, but only ~3 confirmed LLM detectors)
|
||||
- GitHub Secret Scanning (14 AI-related patterns, but GitHub-only)
|
||||
- GitGuardian (broad AI coverage, but commercial)
|
||||
|
||||
---
|
||||
|
||||
## 3. Top LLM API Providers
|
||||
|
||||
### Tier 1: Major Cloud & Frontier Model Providers
|
||||
| # | Provider | Key Product | Notes |
|
||||
|---|----------|-------------|-------|
|
||||
| 1 | **OpenAI** | GPT-5, GPT-4o, o-series | Market leader |
|
||||
| 2 | **Anthropic** | Claude Opus 4, Sonnet, Haiku | Enterprise focus |
|
||||
| 3 | **Google (Gemini/Vertex AI)** | Gemini 2.5 Pro/Flash | 2M token context |
|
||||
| 4 | **AWS Bedrock** | Multi-model (Claude, Llama, etc.) | AWS ecosystem |
|
||||
| 5 | **Azure OpenAI** | GPT-4o, o-series | Enterprise SLA 99.9% |
|
||||
| 6 | **Google AI Studio** | Gemini API | Developer-friendly |
|
||||
| 7 | **xAI** | Grok 4.1 | 2M context, low cost |
|
||||
|
||||
### Tier 2: Specialized & Competitive Providers
|
||||
| # | Provider | Key Product | Notes |
|
||||
|---|----------|-------------|-------|
|
||||
| 8 | **Mistral AI** | Mistral Large, Codestral | European, open-weight |
|
||||
| 9 | **Cohere** | Command R+ | Enterprise RAG focus |
|
||||
| 10 | **DeepSeek** | DeepSeek R1, V3 | Ultra-low cost reasoning |
|
||||
| 11 | **Perplexity** | Sonar Pro | Search-augmented LLM |
|
||||
| 12 | **Together AI** | 200+ open-source models | Low latency inference |
|
||||
| 13 | **Groq** | LPU inference | Fastest inference speeds |
|
||||
| 14 | **Fireworks AI** | Open-source model hosting | Sub-100ms latency |
|
||||
| 15 | **Replicate** | Model hosting platform | Pay-per-use |
|
||||
| 16 | **Cerebras** | Wafer-scale inference | Ultra-fast inference |
|
||||
| 17 | **SambaNova** | Enterprise inference | Custom silicon |
|
||||
| 18 | **AI21** | Jamba models | Long context |
|
||||
| 19 | **Stability AI** | Stable Diffusion, text models | Image + text |
|
||||
| 20 | **NVIDIA NIM** | Optimized model serving | GPU-optimized |
|
||||
|
||||
### Tier 3: Infrastructure, Platform & Gateway Providers
|
||||
| # | Provider | Key Product | Notes |
|
||||
|---|----------|-------------|-------|
|
||||
| 21 | **Cloudflare Workers AI** | Edge inference | Edge computing |
|
||||
| 22 | **Vercel AI** | AI SDK, v0 | Frontend-focused |
|
||||
| 23 | **OpenRouter** | Multi-model gateway | 500+ models |
|
||||
| 24 | **HuggingFace** | Inference API, 300+ models | Open-source hub |
|
||||
| 25 | **DeepInfra** | Inference platform | Cost-effective |
|
||||
| 26 | **Novita AI** | 200+ production APIs | Multi-modal |
|
||||
| 27 | **Baseten** | Model serving | Custom deployments |
|
||||
| 28 | **Anyscale** | Ray-based inference | Scalable |
|
||||
| 29 | **Lambda AI** | GPU cloud + inference | |
|
||||
| 30 | **OctoAI** | Optimized inference | |
|
||||
| 31 | **Databricks** | DBRX, model serving | Data + AI |
|
||||
| 32 | **Snowflake** | Cortex AI | Data warehouse + AI |
|
||||
| 33 | **Oracle OCI** | OCI AI | Enterprise |
|
||||
| 34 | **SAP Generative AI Hub** | Enterprise AI | SAP ecosystem |
|
||||
| 35 | **IBM WatsonX** | Granite models | Enterprise |
|
||||
|
||||
### Tier 4: Chinese & Regional Providers
|
||||
| # | Provider | Key Product | Notes |
|
||||
|---|----------|-------------|-------|
|
||||
| 36 | **Alibaba (Qwen/Dashscope)** | Qwen 2.5/3 series | Top Chinese open-source |
|
||||
| 37 | **Baidu (Wenxin/ERNIE)** | ERNIE 4.0 | Chinese market leader |
|
||||
| 38 | **ByteDance (Doubao)** | Doubao/Kimi | TikTok parent |
|
||||
| 39 | **Zhipu AI** | GLM-4.5 | ChatGLM lineage |
|
||||
| 40 | **Baichuan** | Baichuan 4 | Domain-specific (law, finance) |
|
||||
| 41 | **Moonshot AI (Kimi)** | Kimi K1.5/K2 | 128K context |
|
||||
| 42 | **01.AI (Yi)** | Yi-Large, Yi-34B | Founded by Kai-Fu Lee |
|
||||
| 43 | **MiniMax** | MiniMax models | Chinese AI tiger |
|
||||
| 44 | **StepFun** | Step models | Chinese AI tiger |
|
||||
| 45 | **Tencent (Hunyuan)** | Hunyuan models | WeChat ecosystem |
|
||||
| 46 | **iFlyTek (Spark)** | Spark models | Voice/NLP specialist |
|
||||
| 47 | **SenseNova (SenseTime)** | SenseNova models | Vision + language |
|
||||
| 48 | **Volcano Engine (ByteDance)** | Cloud AI services | ByteDance cloud |
|
||||
| 49 | **Nebius AI** | Inference platform | Yandex spinoff |
|
||||
|
||||
### Tier 5: Emerging, Niche & Specialized Providers
|
||||
| # | Provider | Key Product | Notes |
|
||||
|---|----------|-------------|-------|
|
||||
| 50 | **Aleph Alpha** | Luminous models | EU-focused, compliance |
|
||||
| 51 | **Comet API** | ML experiment tracking | |
|
||||
| 52 | **Writer** | Palmyra models | Enterprise content |
|
||||
| 53 | **Reka AI** | Reka Core/Flash | Multimodal |
|
||||
| 54 | **Upstage** | Solar models | Korean provider |
|
||||
| 55 | **FriendliAI** | Inference optimization | |
|
||||
| 56 | **Forefront AI** | Model hosting | |
|
||||
| 57 | **GooseAI** | GPT-NeoX hosting | Low cost |
|
||||
| 58 | **NLP Cloud** | Model hosting | |
|
||||
| 59 | **Predibase** | Fine-tuning platform | LoRA specialist |
|
||||
| 60 | **Clarifai** | Vision + LLM | |
|
||||
| 61 | **AiLAYER** | AI platform | |
|
||||
| 62 | **AIMLAPI** | Multi-model API | |
|
||||
| 63 | **Corcel** | Decentralized inference | Bittensor-based |
|
||||
| 64 | **HyperBee AI** | AI platform | |
|
||||
| 65 | **Lamini** | Fine-tuning + inference | |
|
||||
| 66 | **Monster API** | GPU inference | |
|
||||
| 67 | **Neets.ai** | TTS + LLM | |
|
||||
| 68 | **Featherless AI** | Inference | |
|
||||
| 69 | **Hyperbolic** | Inference platform | |
|
||||
| 70 | **Inference.net** | Open-source inference | |
|
||||
| 71 | **Galadriel** | Decentralized AI | |
|
||||
| 72 | **PublicAI** | Community inference | |
|
||||
| 73 | **Bytez** | Model hosting | |
|
||||
| 74 | **Chutes** | Inference | |
|
||||
| 75 | **GMI Cloud** | GPU cloud + inference | |
|
||||
| 76 | **Nscale** | Inference platform | |
|
||||
| 77 | **Scaleway** | European cloud AI | |
|
||||
| 78 | **OVHCloud AI** | European cloud AI | |
|
||||
| 79 | **Heroku AI** | PaaS AI add-on | |
|
||||
| 80 | **Sarvam.ai** | Indian AI models | |
|
||||
|
||||
### Tier 6: Self-Hosted & Local Inference
|
||||
| # | Provider | Key Product | Notes |
|
||||
|---|----------|-------------|-------|
|
||||
| 81 | **Ollama** | Local LLM runner | No API key needed |
|
||||
| 82 | **LM Studio** | Desktop LLM | No API key needed |
|
||||
| 83 | **vLLM** | Inference engine | Self-hosted |
|
||||
| 84 | **Llamafile** | Single-file LLM | Self-hosted |
|
||||
| 85 | **Xinference** | Inference platform | Self-hosted |
|
||||
| 86 | **Triton Inference Server** | NVIDIA serving | Self-hosted |
|
||||
| 87 | **LlamaGate** | Gateway | Self-hosted |
|
||||
| 88 | **Docker Model Runner** | Container inference | Self-hosted |
|
||||
|
||||
### Tier 7: Aggregators, Gateways & Middleware
|
||||
| # | Provider | Key Product | Notes |
|
||||
|---|----------|-------------|-------|
|
||||
| 89 | **LiteLLM** | AI gateway (107 providers) | Open-source |
|
||||
| 90 | **Portkey** | AI gateway | Observability |
|
||||
| 91 | **Helicone** | LLM observability | Proxy-based |
|
||||
| 92 | **Bifrost** | AI gateway (Go) | Fastest gateway |
|
||||
| 93 | **Kong AI Gateway** | API management | Enterprise |
|
||||
| 94 | **Vercel AI Gateway** | Edge AI | |
|
||||
| 95 | **Cloudflare AI Gateway** | Edge AI | |
|
||||
| 96 | **Agenta** | LLM ops platform | |
|
||||
| 97 | **Straico** | Multi-model | |
|
||||
| 98 | **AI302** | Gateway | |
|
||||
| 99 | **AIHubMix** | Gateway | |
|
||||
| 100 | **Zenmux** | Gateway | |
|
||||
| 101 | **Poe** | Multi-model chat | Quora |
|
||||
| 102 | **Gitee AI** | Chinese GitHub AI | |
|
||||
| 103 | **GitHub Models** | GitHub-hosted inference | |
|
||||
| 104 | **GitHub Copilot** | Code completion | |
|
||||
| 105 | **ModelScope** | Chinese model hub | Alibaba |
|
||||
| 106 | **Voyage AI** | Embeddings | |
|
||||
| 107 | **Jina AI** | Embeddings + search | |
|
||||
| 108 | **Deepgram** | Speech-to-text | |
|
||||
| 109 | **ElevenLabs** | Text-to-speech | |
|
||||
| 110 | **Black Forest Labs** | Image generation (FLUX) | |
|
||||
| 111 | **Fal AI** | Image/video generation | |
|
||||
| 112 | **RunwayML** | Video generation | |
|
||||
| 113 | **Recraft** | Image generation | |
|
||||
| 114 | **DataRobot** | ML platform | |
|
||||
| 115 | **Weights & Biases** | ML ops + inference | |
|
||||
| 116 | **CompactifAI** | Model compression | |
|
||||
| 117 | **GradientAI** | Fine-tuning | |
|
||||
| 118 | **Topaz** | AI platform | |
|
||||
| 119 | **Synthetic** | Data generation | |
|
||||
| 120 | **Infiniai** | Inference | |
|
||||
| 121 | **Higress** | AI gateway | Alibaba |
|
||||
| 122 | **PPIO** | Inference | |
|
||||
| 123 | **Qiniu** | Chinese cloud AI | |
|
||||
| 124 | **NanoGPT** | Lightweight inference | |
|
||||
| 125 | **Morph** | AI platform | |
|
||||
| 126 | **Milvus** | Vector DB + AI | |
|
||||
| 127 | **XiaoMi MiMo** | Xiaomi AI | |
|
||||
| 128 | **Petals** | Distributed inference | |
|
||||
| 129 | **ZeroOne** | AI platform | |
|
||||
| 130 | **Lemonade** | AI platform | |
|
||||
| 131 | **Taichu** | Chinese AI | |
|
||||
| 132 | **Amazon Nova** | AWS native models | |
|
||||
|
||||
---
|
||||
|
||||
## 4. API Key Patterns by Provider
|
||||
|
||||
### 4.1 Confirmed Key Prefixes & Formats
|
||||
|
||||
| Provider | Prefix | Regex Pattern | Confidence |
|
||||
|----------|--------|---------------|------------|
|
||||
| **OpenAI (legacy)** | `sk-` | `sk-[a-zA-Z0-9]{48}` | High |
|
||||
| **OpenAI (project)** | `sk-proj-` | `sk-proj-[a-zA-Z0-9_-]{80,}` | High |
|
||||
| **OpenAI (service account)** | `sk-svcacct-` | `sk-svcacct-[a-zA-Z0-9_-]{80,}` | High |
|
||||
| **OpenAI (legacy user)** | `sk-None-` | `sk-None-[a-zA-Z0-9_-]{80,}` | High |
|
||||
| **Anthropic (API)** | `sk-ant-api03-` | `sk-ant-api03-[a-zA-Z0-9_\-]{93}AA` | High |
|
||||
| **Anthropic (Admin)** | `sk-ant-admin01-` | `sk-ant-admin01-[a-zA-Z0-9_\-]{93}AA` | High |
|
||||
| **Google AI / Gemini** | `AIza` | `AIza[0-9A-Za-z\-_]{35}` | High |
|
||||
| **HuggingFace (user)** | `hf_` | `hf_[a-zA-Z]{34}` | High |
|
||||
| **HuggingFace (org)** | `api_org_` | `api_org_[a-zA-Z]{34}` | High |
|
||||
| **Groq** | `gsk_` | `gsk_[a-zA-Z0-9]{48,}` | High |
|
||||
| **Replicate** | `r8_` | `r8_[a-zA-Z0-9]{40}` | High |
|
||||
| **Fireworks AI** | `fw_` | `fw_[a-zA-Z0-9_-]{40,}` | Medium |
|
||||
| **Perplexity** | `pplx-` | `pplx-[a-zA-Z0-9]{48}` | High |
|
||||
| **AWS (general)** | `AKIA` | `AKIA[0-9A-Z]{16}` | High |
|
||||
| **GitHub PAT** | `ghp_` | `ghp_[a-zA-Z0-9]{36}` | High |
|
||||
| **Stripe (secret)** | `sk_live_` | `sk_live_[0-9a-zA-Z]{24}` | High |
|
||||
|
||||
### 4.2 Providers with No Known Distinct Prefix
|
||||
|
||||
These providers use generic-looking API keys without distinguishing prefixes, making detection harder:
|
||||
|
||||
| Provider | Key Format | Detection Approach |
|
||||
|----------|-----------|-------------------|
|
||||
| **Mistral AI** | Generic alphanumeric | Keyword-based (`MISTRAL_API_KEY`) |
|
||||
| **Cohere** | Generic alphanumeric | Keyword-based (`COHERE_API_KEY`, `CO_API_KEY`) |
|
||||
| **Together AI** | Generic alphanumeric | Keyword-based |
|
||||
| **DeepSeek** | `sk-` prefix (same as OpenAI legacy) | Keyword context needed |
|
||||
| **Azure OpenAI** | 32-char hex | Keyword-based |
|
||||
| **Stability AI** | `sk-` prefix | Keyword context needed |
|
||||
| **AI21** | Generic alphanumeric | Keyword-based |
|
||||
| **Cerebras** | Generic alphanumeric | Keyword-based |
|
||||
| **SambaNova** | Generic alphanumeric | Keyword-based |
|
||||
|
||||
### 4.3 Detection Difficulty Tiers
|
||||
|
||||
**Easy (unique prefix):** OpenAI (sk-proj-, sk-svcacct-), Anthropic (sk-ant-), HuggingFace (hf_), Groq (gsk_), Replicate (r8_), Perplexity (pplx-), AWS (AKIA)
|
||||
|
||||
**Medium (shared or short prefix):** OpenAI legacy (sk-), DeepSeek (sk-), Stability (sk-), Fireworks (fw_), Google (AIza)
|
||||
|
||||
**Hard (no prefix, keyword-only):** Mistral, Cohere, Together AI, Azure OpenAI, AI21, Cerebras, most Chinese providers
|
||||
|
||||
---
|
||||
|
||||
## 5. Key Validation Approaches
|
||||
|
||||
### 5.1 Common Validation Endpoints
|
||||
|
||||
| Provider | Validation Method | Endpoint | Cost |
|
||||
|----------|-------------------|----------|------|
|
||||
| **OpenAI** | List models | `GET /v1/models` | Free (no tokens consumed) |
|
||||
| **Anthropic** | Send minimal message | `POST /v1/messages` (tiny prompt) | Minimal cost (~1 token) |
|
||||
| **Google Gemini** | List models | `GET /v1/models` | Free |
|
||||
| **Cohere** | Token check | `POST /v1/tokenize` or `/v1/generate` | Minimal |
|
||||
| **HuggingFace** | Whoami | `GET /api/whoami` | Free |
|
||||
| **Groq** | List models | `GET /v1/models` | Free |
|
||||
| **Replicate** | Get account | `GET /v1/account` | Free |
|
||||
| **Mistral** | List models | `GET /v1/models` | Free |
|
||||
| **AWS** | STS GetCallerIdentity | `POST sts.amazonaws.com` | Free |
|
||||
| **Azure OpenAI** | List deployments | `GET /openai/deployments` | Free |
|
||||
|
||||
### 5.2 Validation Strategy Patterns
|
||||
|
||||
1. **Passive detection (regex only):** Fastest, highest false positive rate. Used by Gitleaks, detect-secrets baseline mode.
|
||||
|
||||
2. **Passive + entropy:** Combines regex with entropy scoring. Reduces false positives for generic patterns. Used by detect-secrets with entropy plugins.
|
||||
|
||||
3. **Active verification (API call):** Makes lightweight API call to confirm key is live. Used by TruffleHog, GitHub secret scanning. Eliminates false positives but requires network access.
|
||||
|
||||
4. **Deep analysis (permission enumeration):** Beyond verification, enumerates what the key can access. Used by TruffleHog for ~20 credential types. Most actionable but slowest.
|
||||
|
||||
### 5.3 How Existing Tools Validate
|
||||
|
||||
| Tool | Passive | Entropy | Active Verification | Permission Analysis |
|
||||
|------|:-------:|:-------:|:-------------------:|:-------------------:|
|
||||
| TruffleHog | Yes | No | Yes (800+ detectors) | Yes (~20 types) |
|
||||
| Gitleaks | Yes | Optional | No | No |
|
||||
| detect-secrets | Yes | Yes | Limited | No |
|
||||
| Nosey Parker | Yes | ML-based | No | No |
|
||||
| GitGuardian | Yes | Yes | Yes (selected) | Limited |
|
||||
| GitHub Scanning | Yes | AI-based | Yes (selected) | No |
|
||||
| SecurityWall | Yes | No | Generates curl cmds | No |
|
||||
| KeyHacks | No | No | Manual curl cmds | Limited |
|
||||
|
||||
---
|
||||
|
||||
## 6. Market Gaps & Opportunities
|
||||
|
||||
### 6.1 Underserved Areas
|
||||
|
||||
1. **LLM-specific comprehensive scanner:** No tool covers all 50+ LLM API providers with both detection and validation.
|
||||
|
||||
2. **New key format coverage:** OpenAI's `sk-proj-` and `sk-svcacct-` formats are recent; many scanners only detect legacy `sk-` format. Gitleaks only added these in late 2025 via PR #1780.
|
||||
|
||||
3. **Chinese/regional provider detection:** Almost zero coverage for Qwen, Baichuan, Zhipu, Moonshot, Yi, ERNIE, Doubao API keys in any scanner.
|
||||
|
||||
4. **Key metadata extraction:** No tool extracts org, project, rate limits, or spend from detected LLM keys.
|
||||
|
||||
5. **Agentic AI context:** With AI agents increasingly using API keys, there's a growing need for scanners that understand multi-key configurations (e.g., an agent with OpenAI + Anthropic + Serp API keys).
|
||||
|
||||
6. **Vibe coding exposure:** VibeFactory's scanner addresses the problem of API keys exposed in frontend JavaScript by vibe-coded apps, but this is still nascent.
|
||||
|
||||
### 6.2 Scale of the Problem
|
||||
|
||||
- **28 million credentials leaked on GitHub in 2025** (Snyk)
|
||||
- **1,275,105 leaked AI service secrets in 2025** (GitGuardian), up 81% YoY
|
||||
- **8 of 10 fastest-growing leaked secret categories are AI-related** (GitGuardian)
|
||||
- Fastest growing: Brave Search API (+1,255%), Firecrawl (+796%), Supabase (+992%)
|
||||
- AI keys are found at **42.28 per million commits** for Groq alone (GitGuardian)
|
||||
|
||||
### 6.3 Competitive Landscape Summary
|
||||
|
||||
```
|
||||
Verification Depth
|
||||
|
|
||||
TruffleHog | ████████████████ (800+ detectors, deep analysis)
|
||||
GitGuardian | ████████████ (450+ detectors, commercial)
|
||||
GitHub | ██████████ (AI-powered, platform-locked)
|
||||
Gitleaks | ████ (150+ regex, no verification)
|
||||
detect-sec | ███ (27 plugins, baseline approach)
|
||||
NoseyParker | ██ (188 rules, ML denoising, retired)
|
||||
|
|
||||
+------ LLM Provider Coverage ------>
|
||||
|
||||
None of these tools provide >15 LLM provider detectors.
|
||||
The market opportunity is a scanner focused on 50-100+ LLM providers
|
||||
with active verification, permission analysis, and cost estimation.
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Sources
|
||||
|
||||
### Open-Source Scanner Tools
|
||||
- [TruffleHog - GitHub](https://github.com/trufflesecurity/trufflehog)
|
||||
- [TruffleHog Detectors](https://trufflesecurity.com/detectors)
|
||||
- [Gitleaks - GitHub](https://github.com/gitleaks/gitleaks)
|
||||
- [Gitleaks Config (gitleaks.toml)](https://github.com/gitleaks/gitleaks/blob/master/config/gitleaks.toml)
|
||||
- [detect-secrets - GitHub](https://github.com/Yelp/detect-secrets)
|
||||
- [Nosey Parker - GitHub](https://github.com/praetorian-inc/noseyparker)
|
||||
- [KeyHacks - GitHub](https://github.com/streaak/keyhacks)
|
||||
- [Secrets Patterns DB - GitHub](https://github.com/mazen160/secrets-patterns-db)
|
||||
- [regextokens - GitHub](https://github.com/odomojuli/regextokens)
|
||||
- [Betterleaks - Gitleaks Successor](https://www.aikido.dev/blog/betterleaks-gitleaks-successor)
|
||||
|
||||
### Comparison & Analysis
|
||||
- [TruffleHog vs Gitleaks Comparison (Jit)](https://www.jit.io/resources/appsec-tools/trufflehog-vs-gitleaks-a-detailed-comparison-of-secret-scanning-tools)
|
||||
- [Best Secret Scanning Tools 2025 (Aikido)](https://www.aikido.dev/blog/top-secret-scanning-tools)
|
||||
- [8 Best Secret Scanning Tools 2026 (AppSec Santa)](https://appsecsanta.com/sast-tools/secret-scanning-tools)
|
||||
- [Secret Scanning Tools 2026 (GitGuardian)](https://blog.gitguardian.com/secret-scanning-tools/)
|
||||
|
||||
### API Key Patterns & Validation
|
||||
- [OpenAI API Key Format Discussion](https://community.openai.com/t/regex-s-to-validate-api-key-and-org-id-format/44619)
|
||||
- [OpenAI sk-proj Key Format](https://community.openai.com/t/how-to-create-an-api-secret-key-with-prefix-sk-only-always-creates-sk-proj-keys/1263531)
|
||||
- [Gitleaks OpenAI Regex PR #1780](https://github.com/gitleaks/gitleaks/pull/1780)
|
||||
- [GitHub Leaked API Keys Patterns](https://gist.github.com/win3zz/0a1c70589fcbea64dba4588b93095855)
|
||||
- [GitGuardian Groq API Key Detector](https://docs.gitguardian.com/secrets-detection/secrets-detection-engine/detectors/specifics/groq_api_key)
|
||||
|
||||
### LLM Key Validation Tools
|
||||
- [TestMyAPIKey.com](https://www.testmyapikey.com/)
|
||||
- [SecurityWall API Key Checker](https://securitywall.co/tools/api-key-checker)
|
||||
- [VibeFactory API Key Scanner](https://vibefactory.ai/api-key-security-scanner)
|
||||
- [KeyLeak Detector - GitHub](https://github.com/Amal-David/keyleak-detector)
|
||||
|
||||
### LLM Provider Lists
|
||||
- [LiteLLM Providers (107)](https://docs.litellm.ai/docs/providers)
|
||||
- [Langbase Supported Providers](https://langbase.com/docs/supported-models-and-providers)
|
||||
- [LLM-Interface API Keys Doc](https://github.com/samestrin/llm-interface/blob/main/docs/api-keys.md)
|
||||
- [Artificial Analysis Provider Leaderboard](https://artificialanalysis.ai/leaderboards/providers)
|
||||
- [Top LLM API Providers 2026 (Future AGI)](https://futureagi.substack.com/p/top-11-llm-api-providers-in-2026)
|
||||
|
||||
### GitHub Secret Scanning
|
||||
- [GitHub Supported Secret Scanning Patterns](https://docs.github.com/en/code-security/secret-scanning/introduction/supported-secret-scanning-patterns)
|
||||
- [GitHub Adds 37 New Detectors (March 2026)](https://devops.com/github-adds-37-new-secret-detectors-in-march-extends-scanning-to-ai-coding-agents/)
|
||||
- [GitHub Secret Scanning Coverage Update](https://github.blog/changelog/2026-03-31-github-secret-scanning-nine-new-types-and-more/)
|
||||
|
||||
### Market Data
|
||||
- [State of Secrets Sprawl 2026 (GitGuardian/Hacker News)](https://thehackernews.com/2026/03/the-state-of-secrets-sprawl-2026-9.html)
|
||||
- [Why 28M Credentials Leaked on GitHub in 2025 (Snyk)](https://snyk.io/articles/state-of-secrets/)
|
||||
- [GitGuardian AI Security](https://www.gitguardian.com/agentic-ai-security)
|
||||
556
docs/superpowers/specs/2026-04-04-keyhunter-design.md
Normal file
556
docs/superpowers/specs/2026-04-04-keyhunter-design.md
Normal file
@@ -0,0 +1,556 @@
|
||||
# KeyHunter - Design Specification
|
||||
|
||||
## Overview
|
||||
|
||||
KeyHunter is a comprehensive, modular API key scanner built in Go, focused on detecting and validating API keys from 100+ LLM/AI providers. It combines native scanning capabilities with external tool integration (TruffleHog, Gitleaks), OSINT/recon modules, a web dashboard, and Telegram bot notifications.
|
||||
|
||||
## Architecture
|
||||
|
||||
**Approach:** Plugin-based architecture. Core scanner engine with providers defined as YAML files (compile-time embedded). Single binary distribution.
|
||||
|
||||
### Directory Structure
|
||||
|
||||
```
|
||||
keyhunter/
|
||||
├── cmd/keyhunter/ # CLI entrypoint (cobra)
|
||||
├── pkg/
|
||||
│ ├── engine/ # Core scanning engine
|
||||
│   │   ├── scanner.go        # Orchestrator — ingests input, runs providers
|
||||
│ │ ├── matcher.go # Regex + entropy matching
|
||||
│ │ └── verifier.go # Active key verification (--verify flag)
|
||||
│ ├── provider/ # Provider registry & loader
|
||||
│   │   ├── registry.go       # Loads and manages providers
|
||||
│   │   ├── types.go          # Provider interface definitions
|
||||
│   │   └── builtin/          # Compile-time embedded provider YAML files
|
||||
│ ├── input/ # Input source adapters
|
||||
│   │   ├── file.go           # File/directory scanning
|
||||
│   │   ├── git.go            # Git history/diff scanning
|
||||
│   │   ├── stdin.go          # Pipe/stdin support
|
||||
│ │ ├── url.go # URL fetch
|
||||
│   │   └── remote.go         # GitHub/GitLab API, paste sites
|
||||
│ ├── output/ # Output formatters
|
||||
│   │   ├── table.go          # Colored terminal table
|
||||
│ │ ├── json.go # JSON export
|
||||
│   │   ├── sarif.go          # SARIF (CI/CD compatible)
|
||||
│ │ └── csv.go # CSV export
|
||||
│ ├── adapter/ # External tool parsers
|
||||
│ │ ├── trufflehog.go # TruffleHog JSON output parser
|
||||
│ │ └── gitleaks.go # Gitleaks JSON output parser
|
||||
│ ├── recon/ # OSINT/Recon engine (80+ sources)
|
||||
│ │ ├── engine.go # Recon orchestrator
|
||||
│ │ ├── ratelimit.go # Rate limiting & politeness
|
||||
│ │ │
|
||||
│ │ │ # --- IoT & Internet Search Engines ---
|
||||
│ │ ├── shodan.go # Shodan API client
|
||||
│ │ ├── censys.go # Censys API client
|
||||
│ │ ├── zoomeye.go # ZoomEye (Chinese IoT scanner)
|
||||
│ │ ├── fofa.go # FOFA (Chinese IoT scanner)
|
||||
│ │ ├── netlas.go # Netlas.io (HTTP body search)
|
||||
│ │ ├── binaryedge.go # BinaryEdge scanner
|
||||
│ │ │
|
||||
│ │ │ # --- Code Hosting & Snippets ---
|
||||
│ │ ├── github.go # GitHub code search / dorks
|
||||
│ │ ├── gitlab.go # GitLab search
|
||||
│ │ ├── gist.go # GitHub Gist search
|
||||
│ │ ├── bitbucket.go # Bitbucket code search
|
||||
│ │ ├── codeberg.go # Codeberg/Gitea search
|
||||
│ │ ├── gitea.go # Self-hosted Gitea instances
|
||||
│ │ ├── replit.go # Replit public repls
|
||||
│ │ ├── codesandbox.go # CodeSandbox projects
|
||||
│ │ ├── stackblitz.go # StackBlitz projects
|
||||
│ │ ├── codepen.go # CodePen pens
|
||||
│ │ ├── jsfiddle.go # JSFiddle snippets
|
||||
│ │ ├── glitch.go # Glitch public projects
|
||||
│ │ ├── observable.go # Observable notebooks
|
||||
│ │ ├── huggingface.go # HuggingFace Spaces/repos
|
||||
│ │ ├── kaggle.go # Kaggle notebooks/datasets
|
||||
│ │ ├── jupyter.go # nbviewer / Jupyter notebooks
|
||||
│ │ ├── gitpod.go # Gitpod workspace snapshots
|
||||
│ │ │
|
||||
│ │ │ # --- Search Engine Dorking ---
|
||||
│ │ ├── google.go # Google Custom Search / SerpAPI dorking
|
||||
│ │ ├── bing.go # Bing Web Search API dorking
|
||||
│ │ ├── duckduckgo.go # DuckDuckGo search
|
||||
│ │ ├── yandex.go # Yandex XML Search
|
||||
│ │ ├── brave.go # Brave Search API
|
||||
│ │ │
|
||||
│ │ │ # --- Paste Sites ---
|
||||
│ │ ├── paste.go # Multi-paste aggregator (pastebin, dpaste, paste.ee, rentry, hastebin, ix.io, etc.)
|
||||
│ │ │
|
||||
│ │ │ # --- Package Registries ---
|
||||
│ │ ├── npm.go # npm registry scanning
|
||||
│ │ ├── pypi.go # PyPI package scanning
|
||||
│ │ ├── rubygems.go # RubyGems scanning
|
||||
│ │ ├── crates.go # crates.io (Rust)
|
||||
│ │ ├── maven.go # Maven Central (Java)
|
||||
│ │ ├── nuget.go # NuGet (.NET)
|
||||
│ │ ├── packagist.go # Packagist (PHP)
|
||||
│ │ ├── goproxy.go # Go module proxy
|
||||
│ │ │
|
||||
│ │ │ # --- Container & Infra ---
|
||||
│ │ ├── docker.go # Docker Hub image/layer scanning
|
||||
│ │ ├── kubernetes.go # Exposed K8s dashboards & configs
|
||||
│ │ ├── terraform.go # Terraform state files & registry
|
||||
│ │ ├── helm.go # Artifact Hub / Helm charts
|
||||
│ │ ├── ansible.go # Ansible Galaxy collections
|
||||
│ │ │
|
||||
│ │ │ # --- Cloud Storage ---
|
||||
│ │ ├── s3.go # AWS S3 bucket enumeration
|
||||
│ │ ├── gcs.go # Google Cloud Storage buckets
|
||||
│ │ ├── azureblob.go # Azure Blob Storage
|
||||
│ │ ├── spaces.go # DigitalOcean Spaces
|
||||
│ │ ├── backblaze.go # Backblaze B2
|
||||
│ │ ├── minio.go # Self-hosted MinIO instances
|
||||
│ │ ├── grayhat.go # GrayHatWarfare (bucket search engine)
|
||||
│ │ │
|
||||
│ │ │ # --- CI/CD Log Leaks ---
|
||||
│ │ ├── travisci.go # Travis CI public build logs
|
||||
│ │ ├── circleci.go # CircleCI build logs
|
||||
│ │ ├── ghactions.go # GitHub Actions workflow logs
|
||||
│ │ ├── jenkins.go # Exposed Jenkins instances
|
||||
│ │ ├── gitlabci.go # GitLab CI/CD pipeline logs
|
||||
│ │ │
|
||||
│ │ │ # --- Web Archives ---
|
||||
│ │ ├── wayback.go # Wayback Machine CDX API
|
||||
│ │ ├── commoncrawl.go # CommonCrawl index & WARC
|
||||
│ │ │
|
||||
│ │ │ # --- Forums & Documentation ---
|
||||
│ │ ├── stackoverflow.go # Stack Overflow / Stack Exchange API
|
||||
│ │ ├── reddit.go # Reddit search
|
||||
│ │ ├── hackernews.go # HN Algolia API
|
||||
│ │ ├── devto.go # dev.to articles
|
||||
│ │ ├── medium.go # Medium articles
|
||||
│ │ ├── telegram_recon.go # Telegram public channels
|
||||
│ │ ├── discord.go # Discord indexed content
|
||||
│ │ │
|
||||
│ │ │ # --- Collaboration Tools ---
|
||||
│ │ ├── notion.go # Notion public pages
|
||||
│ │ ├── confluence.go # Confluence public spaces
|
||||
│ │ ├── trello.go # Trello public boards
|
||||
│ │ ├── googledocs.go # Google Docs/Sheets public
|
||||
│ │ │
|
||||
│ │ │ # --- Frontend & JS Leaks ---
|
||||
│ │ ├── sourcemaps.go # JS source map extraction
|
||||
│ │ ├── webpack.go # Webpack/Vite bundle scanning
|
||||
│ │ ├── dotenv_web.go # Exposed .env files on web servers
|
||||
│ │ ├── swagger.go # Exposed Swagger/OpenAPI docs
|
||||
│ │ ├── deploys.go # Vercel/Netlify preview deployments
|
||||
│ │ │
|
||||
│ │ │ # --- Log Aggregators ---
|
||||
│ │ ├── elasticsearch.go # Exposed Elasticsearch/Kibana
|
||||
│ │ ├── grafana.go # Exposed Grafana dashboards
|
||||
│ │ ├── sentry.go # Exposed Sentry instances
|
||||
│ │ │
|
||||
│ │ │ # --- Threat Intelligence ---
|
||||
│ │ ├── virustotal.go # VirusTotal file/URL search
|
||||
│ │ ├── intelx.go # Intelligence X aggregated search
|
||||
│ │ ├── urlhaus.go # URLhaus abuse.ch
|
||||
│ │ │
|
||||
│ │ │ # --- Mobile Apps ---
|
||||
│ │ ├── apk.go # APK download & decompile scanning
|
||||
│ │ │
|
||||
│ │ │ # --- DNS/Subdomain ---
|
||||
│ │ ├── crtsh.go # Certificate Transparency (crt.sh)
|
||||
│ │ ├── subdomain.go # Subdomain config endpoint probing
|
||||
│ │ │
|
||||
│ │ │ # --- API Marketplaces ---
|
||||
│ │ ├── postman.go # Postman public collections/workspaces
|
||||
│ │ ├── swaggerhub.go # SwaggerHub published APIs
|
||||
│ │ └── rapidapi.go # RapidAPI public endpoints
|
||||
│ │
|
||||
│ ├── dorks/ # Dork management
|
||||
│ │ ├── loader.go # YAML dork loader
|
||||
│ │ ├── runner.go # Dork execution engine
|
||||
│   │   └── builtin/          # Embedded dork YAML files
|
||||
│   ├── notify/               # Notification modules
|
||||
│ │ ├── telegram.go # Telegram bot
|
||||
│ │ ├── webhook.go # Generic webhook
|
||||
│ │ └── slack.go # Slack
|
||||
│ └── web/ # Web dashboard
|
||||
│ ├── server.go # Embedded HTTP server
|
||||
│ ├── api.go # REST API
|
||||
│ └── static/ # Frontend assets (htmx + tailwind)
|
||||
├── providers/                # Provider YAML definitions (embedded at compile time)
|
||||
│ ├── openai.yaml
|
||||
│ ├── anthropic.yaml
|
||||
│   └── ... (108 providers)
|
||||
├── dorks/                    # Dork YAML definitions (embedded at compile time)
|
||||
│ ├── github.yaml # GitHub code search dorks
|
||||
│ ├── gitlab.yaml # GitLab search dorks
|
||||
│ ├── shodan.yaml # Shodan IoT dorks
|
||||
│ ├── censys.yaml # Censys dorks
|
||||
│ ├── zoomeye.yaml # ZoomEye dorks
|
||||
│ ├── fofa.yaml # FOFA dorks
|
||||
│ ├── google.yaml # Google dorking queries
|
||||
│ ├── bing.yaml # Bing dorking queries
|
||||
│ └── generic.yaml # Multi-source keyword dorks
|
||||
├── configs/                  # Example config files
|
||||
└── docs/
|
||||
```
|
||||
|
||||
### Data Flow
|
||||
|
||||
```
|
||||
Input Source -> Scanner Engine -> Provider Matcher -> (optional) Verifier -> Output Formatter + Notifier
|
||||
-> SQLite DB (persist)
|
||||
-> Web Dashboard (serve)
|
||||
```
|
||||
|
||||
## Provider YAML Schema
|
||||
|
||||
```yaml
|
||||
id: string # Unique provider ID
|
||||
name: string # Display name
|
||||
category: enum # frontier | mid-tier | emerging | chinese | infrastructure | gateway | self-hosted
|
||||
website: string # API base URL
|
||||
confidence: enum # high | medium | low
|
||||
|
||||
patterns:
|
||||
- id: string # Unique pattern ID
|
||||
name: string # Human-readable name
|
||||
regex: string # Detection regex
|
||||
confidence: enum # high | medium | low
|
||||
description: string # Pattern description
|
||||
|
||||
keywords: []string # Pre-filtering keywords (performance optimization)
|
||||
|
||||
verify:
|
||||
enabled: bool
|
||||
method: string # HTTP method
|
||||
url: string # Verification endpoint
|
||||
headers: map # Headers with {{key}} template
|
||||
success_codes: []int
|
||||
failure_codes: []int
|
||||
extract: # Additional info extraction on success
|
||||
- field: string
|
||||
path: string # JSON path
|
||||
|
||||
metadata:
|
||||
docs: string # API docs URL
|
||||
key_url: string # Key management URL
|
||||
env_vars: []string # Common environment variable names
|
||||
revoke_url: string # Key revocation URL
|
||||
```
|
||||
|
||||
## CLI Command Structure
|
||||
|
||||
### Core Commands
|
||||
|
||||
```bash
|
||||
# Scanning
|
||||
keyhunter scan path <dir>
|
||||
keyhunter scan file <file>
|
||||
keyhunter scan git <repo> [--since=<duration>]
|
||||
keyhunter scan stdin
|
||||
keyhunter scan url <url>
|
||||
keyhunter scan clipboard
|
||||
|
||||
# Verification
|
||||
keyhunter verify <key>
|
||||
keyhunter verify --file <keyfile>
|
||||
|
||||
# External Tool Import
|
||||
keyhunter import trufflehog <json>
|
||||
keyhunter import gitleaks <json>
|
||||
keyhunter import generic --format=csv <file>
|
||||
|
||||
# OSINT/Recon — IoT & Internet Scanners
|
||||
keyhunter recon shodan [--query|--dork]
|
||||
keyhunter recon censys [--query]
|
||||
keyhunter recon zoomeye [--query]
|
||||
keyhunter recon fofa [--query]
|
||||
keyhunter recon netlas [--query]
|
||||
keyhunter recon binaryedge [--query]
|
||||
|
||||
# OSINT/Recon — Code Hosting & Snippets
|
||||
keyhunter recon github [--dork=auto|custom]
|
||||
keyhunter recon gitlab [--dork=auto|custom]
|
||||
keyhunter recon gist [--query]
|
||||
keyhunter recon bitbucket [--query|--workspace]
|
||||
keyhunter recon codeberg [--query]
|
||||
keyhunter recon gitea [--instances-from=shodan|file]
|
||||
keyhunter recon replit [--query]
|
||||
keyhunter recon codesandbox [--query]
|
||||
keyhunter recon stackblitz [--query]
|
||||
keyhunter recon codepen [--query]
|
||||
keyhunter recon jsfiddle [--query]
|
||||
keyhunter recon glitch [--query]
|
||||
keyhunter recon huggingface [--query|--spaces|--repos]
|
||||
keyhunter recon kaggle [--query|--notebooks]
|
||||
keyhunter recon jupyter [--query]
|
||||
keyhunter recon observable [--query]
|
||||
|
||||
# OSINT/Recon — Search Engine Dorking
|
||||
keyhunter recon google [--dork=auto|custom]
|
||||
keyhunter recon bing [--dork=auto|custom]
|
||||
keyhunter recon duckduckgo [--query]
|
||||
keyhunter recon yandex [--query]
|
||||
keyhunter recon brave [--query]
|
||||
|
||||
# OSINT/Recon — Paste Sites
|
||||
keyhunter recon paste [--sources=pastebin,dpaste,paste.ee,rentry,hastebin,ix.io,all]
|
||||
|
||||
# OSINT/Recon — Package Registries
|
||||
keyhunter recon npm [--query|--recent]
|
||||
keyhunter recon pypi [--query|--recent]
|
||||
keyhunter recon rubygems [--query]
|
||||
keyhunter recon crates [--query]
|
||||
keyhunter recon maven [--query]
|
||||
keyhunter recon nuget [--query]
|
||||
keyhunter recon packagist [--query]
|
||||
keyhunter recon goproxy [--query]
|
||||
|
||||
# OSINT/Recon — Container & Infrastructure
|
||||
keyhunter recon docker [--query|--image|--layers]
|
||||
keyhunter recon kubernetes [--shodan|--github]
|
||||
keyhunter recon terraform [--github|--registry]
|
||||
keyhunter recon helm [--query]
|
||||
keyhunter recon ansible [--query]
|
||||
|
||||
# OSINT/Recon — Cloud Storage
|
||||
keyhunter recon s3 [--wordlist|--domain]
|
||||
keyhunter recon gcs [--wordlist|--domain]
|
||||
keyhunter recon azure [--wordlist|--domain]
|
||||
keyhunter recon spaces [--wordlist]
|
||||
keyhunter recon minio [--shodan]
|
||||
keyhunter recon grayhat [--query] # GrayHatWarfare bucket search
|
||||
|
||||
# OSINT/Recon — CI/CD Logs
|
||||
keyhunter recon travis [--org|--repo]
|
||||
keyhunter recon circleci [--org|--repo]
|
||||
keyhunter recon ghactions [--org|--repo]
|
||||
keyhunter recon jenkins [--shodan|--url]
|
||||
keyhunter recon gitlabci [--project]
|
||||
|
||||
# OSINT/Recon — Web Archives
|
||||
keyhunter recon wayback [--domain|--url]
|
||||
keyhunter recon commoncrawl [--domain|--pattern]
|
||||
|
||||
# OSINT/Recon — Forums & Documentation
|
||||
keyhunter recon stackoverflow [--query]
|
||||
keyhunter recon reddit [--query|--subreddit]
|
||||
keyhunter recon hackernews [--query]
|
||||
keyhunter recon devto [--query|--tag]
|
||||
keyhunter recon medium [--query]
|
||||
keyhunter recon telegram-groups [--channel|--query]
|
||||
|
||||
# OSINT/Recon — Collaboration Tools
|
||||
keyhunter recon notion [--query] # Google dorking
|
||||
keyhunter recon confluence [--shodan|--url]
|
||||
keyhunter recon trello [--query]
|
||||
keyhunter recon googledocs [--query] # Google dorking
|
||||
|
||||
# OSINT/Recon — Frontend & JS Leaks
|
||||
keyhunter recon sourcemaps [--domain|--url]
|
||||
keyhunter recon webpack [--domain|--url]
|
||||
keyhunter recon dotenv [--domain-list|--url] # Exposed .env files
|
||||
keyhunter recon swagger [--shodan|--domain]
|
||||
keyhunter recon deploys [--domain] # Vercel/Netlify previews
|
||||
|
||||
# OSINT/Recon — Log Aggregators
|
||||
keyhunter recon elasticsearch [--shodan|--url]
|
||||
keyhunter recon grafana [--shodan|--url]
|
||||
keyhunter recon sentry [--shodan|--url]
|
||||
|
||||
# OSINT/Recon — Threat Intelligence
|
||||
keyhunter recon virustotal [--query]
|
||||
keyhunter recon intelx [--query]
|
||||
keyhunter recon urlhaus [--query]
|
||||
|
||||
# OSINT/Recon — Mobile Apps
|
||||
keyhunter recon apk [--package|--query|--file]
|
||||
|
||||
# OSINT/Recon — DNS/Subdomain
|
||||
keyhunter recon crtsh [--domain]
|
||||
keyhunter recon subdomain [--domain] [--probe-configs]
|
||||
|
||||
# OSINT/Recon — API Marketplaces
|
||||
keyhunter recon postman [--query|--workspace]
|
||||
keyhunter recon swaggerhub [--query]
|
||||
|
||||
# OSINT/Recon — Full Sweep
|
||||
keyhunter recon full [--providers] [--categories=all|code|cloud|forums|cicd|...]
|
||||
|
||||
# Dork Management
|
||||
keyhunter dorks list [--source]
|
||||
keyhunter dorks add <source> <query>
|
||||
keyhunter dorks run <source> [--category]
|
||||
keyhunter dorks export
|
||||
|
||||
# Key Management (full key access)
|
||||
keyhunter keys list [--unmask] [--provider=X] [--status=active|revoked]
|
||||
keyhunter keys show <id>
|
||||
keyhunter keys export --format=json|csv
|
||||
keyhunter keys copy <id>
|
||||
keyhunter keys verify <id>
|
||||
keyhunter keys delete <id>
|
||||
|
||||
# Provider Management
|
||||
keyhunter providers list [--category]
|
||||
keyhunter providers info <id>
|
||||
keyhunter providers stats
|
||||
|
||||
# Web Dashboard & Telegram
|
||||
keyhunter serve [--port] [--telegram]
|
||||
|
||||
# Scheduled Scanning
|
||||
keyhunter schedule add --name --cron --command --notify
|
||||
keyhunter schedule list
|
||||
keyhunter schedule remove <name>
|
||||
|
||||
# Config & Hooks
|
||||
keyhunter config init
|
||||
keyhunter config set <key> <value>
|
||||
keyhunter hook install
|
||||
keyhunter hook uninstall
|
||||
```
|
||||
|
||||
### Scan Flags
|
||||
|
||||
```
|
||||
--providers=<list> Filter by provider IDs
|
||||
--category=<cat> Filter by provider category
|
||||
--confidence=<level> Minimum confidence level
|
||||
--exclude=<patterns> Exclude file patterns
|
||||
--verify Enable active key verification
|
||||
--verify-timeout=<dur> Verification timeout (default: 10s)
|
||||
--workers=<n> Parallel workers (default: CPU count)
|
||||
--output=<format> Output format: table|json|sarif|csv
|
||||
--unmask Show full API keys without masking (default: masked)
|
||||
--notify=<channel> Send results to: telegram|webhook|slack
|
||||
--stealth Stealth mode: UA rotation, increased delays
|
||||
--respect-robots Respect robots.txt (default: true)
|
||||
```
|
||||
|
||||
### Exit Codes
|
||||
|
||||
- `0` — Clean, no keys found
|
||||
- `1` — Keys found
|
||||
- `2` — Error
|
||||
|
||||
## Dork YAML Schema
|
||||
|
||||
```yaml
|
||||
source: string # github | gitlab | shodan | censys
|
||||
dorks:
|
||||
- id: string
|
||||
query: string # Search query
|
||||
description: string
|
||||
providers: []string # Optional: related provider IDs
|
||||
```
|
||||
|
||||
Built-in dork categories: GitHub (code search, filename, language), GitLab (snippets, projects), Shodan (exposed proxies, dashboards), Censys (HTTP body search).
|
||||
|
||||
## Web Dashboard
|
||||
|
||||
**Stack:** Go embed + htmx + Tailwind CSS (zero JS framework dependency)
|
||||
|
||||
**Pages:**
|
||||
- `/` — Dashboard overview with summary statistics
|
||||
- `/scans` — Scan history list
|
||||
- `/scans/:id` — Scan detail with found keys
|
||||
- `/keys` — All found keys (filterable table)
|
||||
- `/keys/:id` — Key detail (provider, confidence, verify status)
|
||||
- `/recon` — OSINT scan launcher and results
|
||||
- `/providers` — Provider list and statistics
|
||||
- `/dorks` — Dork management
|
||||
- `/settings` — Configuration (tokens, API keys)
|
||||
- `/api/v1/*` — REST API for programmatic access
|
||||
|
||||
**Storage:** SQLite (embedded, AES-256 encrypted)
|
||||
|
||||
## Telegram Bot
|
||||
|
||||
**Commands:**
|
||||
- `/scan <url/path>` — Remote scan trigger
|
||||
- `/verify <key>` — Key verification
|
||||
- `/recon github <dork>` — GitHub dork execution
|
||||
- `/status` — Active scan status
|
||||
- `/stats` — General statistics
|
||||
- `/subscribe` — Auto-notification on new key findings
|
||||
- `/unsubscribe` — Disable notifications
|
||||
- `/providers` — Provider list
|
||||
- `/help` — Help
|
||||
|
||||
**Auto-notifications:** New key found, recon complete, scheduled scan results, verify results.
|
||||
|
||||
## LLM Provider Coverage (108 Providers)
|
||||
|
||||
### Tier 1 — Frontier (12)
|
||||
OpenAI, Anthropic, Google AI (Gemini), Google Vertex AI, AWS Bedrock, Azure OpenAI, Meta AI (Llama API), xAI (Grok), Cohere, Mistral AI, Inflection AI, AI21 Labs
|
||||
|
||||
### Tier 2 — Inference Platforms (14)
|
||||
Together AI, Fireworks AI, Groq, Replicate, Anyscale, DeepInfra, Lepton AI, Modal, Baseten, Cerebrium, NovitaAI, Sambanova, OctoAI, Friendli AI
|
||||
|
||||
### Tier 3 — Specialized/Vertical (12)
|
||||
Perplexity, You.com, Voyage AI, Jina AI, Unstructured, AssemblyAI, Deepgram, ElevenLabs, Stability AI, Runway ML, Midjourney, HuggingFace
|
||||
|
||||
### Tier 4 — Chinese/Regional (16)
|
||||
DeepSeek, Baichuan, Zhipu AI (GLM), Moonshot AI (Kimi), Yi (01.AI), Qwen (Alibaba Cloud), Baidu (ERNIE/Wenxin), ByteDance (Doubao), SenseTime, iFlytek (Spark), MiniMax, Stepfun, 360 AI, Kuaishou (Kling), Tencent Hunyuan, SiliconFlow
|
||||
|
||||
### Tier 5 — Infrastructure/Gateway (11)
|
||||
Cloudflare AI, Vercel AI, LiteLLM, Portkey, Helicone, OpenRouter, Martian, AI Gateway (Kong), BricksAI, Aether, Not Diamond
|
||||
|
||||
### Tier 6 — Emerging/Niche (15)
|
||||
Reka AI, Aleph Alpha, Writer, Jasper AI, Typeface, Comet ML, Weights & Biases, LangSmith (LangChain), Pinecone, Weaviate, Qdrant, Chroma, Milvus, Neon AI, Lamini
|
||||
|
||||
### Tier 7 — Code & Dev Tools (10)
|
||||
GitHub Copilot, Cursor, Tabnine, Codeium/Windsurf, Sourcegraph Cody, Amazon CodeWhisperer, Replit AI, Codestral (Mistral), IBM watsonx.ai, Oracle AI
|
||||
|
||||
### Tier 8 — Self-Hosted/Open Infra (10)
|
||||
Ollama, vLLM, LocalAI, LM Studio, llama.cpp, GPT4All, text-generation-webui, TensorRT-LLM, Triton Inference Server, Jan AI
|
||||
|
||||
### Tier 9 — Enterprise/Legacy (8)
|
||||
Salesforce Einstein, ServiceNow AI, SAP AI Core, Palantir AIP, Databricks (DBRX), Snowflake Cortex, Oracle Generative AI, HPE GreenLake AI
|
||||
|
||||
## Performance
|
||||
|
||||
- Worker pool: parallel scanning (default: CPU count, configurable via `--workers=N`)
|
||||
- Keyword pre-filtering before regex (10x speedup on large files)
|
||||
- `mmap` for large file reading
|
||||
- Delta-based git scanning (only changed files between commits)
|
||||
- Source-based rate limiting in recon module
|
||||
|
||||
## Key Visibility & Access
|
||||
|
||||
Full (unmasked) API keys are accessible through multiple channels:
|
||||
|
||||
1. **CLI `--unmask` flag** — `keyhunter scan path . --unmask` shows full keys in terminal table
|
||||
2. **JSON/CSV/SARIF export** — Always contains full keys: `keyhunter scan path . -o json`
|
||||
3. **`keyhunter keys` command** — Dedicated key management:
|
||||
- `keyhunter keys list` — all found keys (masked by default)
|
||||
- `keyhunter keys list --unmask` — all found keys (full)
|
||||
- `keyhunter keys show <id>` — single key full detail (always unmasked)
|
||||
- `keyhunter keys export --format=json` — export all keys with full values
|
||||
- `keyhunter keys copy <id>` — copy full key to clipboard
|
||||
- `keyhunter keys verify <id>` — verify and show full detail
|
||||
4. **Web Dashboard** — `/keys/:id` detail page with "Reveal Key" toggle button (auth required)
|
||||
5. **Telegram Bot** — `/key <id>` returns full key detail in private chat
|
||||
6. **SQLite DB** — Full keys always stored (encrypted), queryable via API
|
||||
|
||||
Default behavior: masked in terminal for shoulder-surfing protection.
|
||||
When you need the real key (to test, verify, or report): `--unmask`, JSON export, or `keys show`.
|
||||
|
||||
## Security
|
||||
|
||||
- Key masking in terminal output by default (first 8 + last 4 chars, middle `***`)
|
||||
- `--unmask` flag to reveal full keys when needed
|
||||
- SQLite database AES-256 encrypted (full keys stored encrypted)
|
||||
- Telegram/Shodan tokens encrypted in config
|
||||
- No key values written to logs during `--verify`
|
||||
- Optional basic auth / token auth for web dashboard
|
||||
|
||||
## Rate Limiting & Ethics
|
||||
|
||||
- GitHub API: 30 req/min (auth), 10 req/min (unauth)
|
||||
- Shodan/Censys: respect API plan limits
|
||||
- Paste sites: 1 req/2sec politeness delay
|
||||
- `--stealth` flag: UA rotation, increased spacing
|
||||
- `--respect-robots`: robots.txt compliance (default: on)
|
||||
|
||||
## Error Handling
|
||||
|
||||
- Verify timeout: 10s default, configurable
|
||||
- Network errors: 3 retries with exponential backoff
|
||||
- Partial results: failed sources don't block others
|
||||
- Graceful degradation on all external dependencies
|
||||
107
pkg/recon/sources/deploypreview.go
Normal file
107
pkg/recon/sources/deploypreview.go
Normal file
@@ -0,0 +1,107 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"io"
|
||||
"net/http"
|
||||
"regexp"
|
||||
"time"
|
||||
|
||||
"golang.org/x/time/rate"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
// DeployPreviewSource scans Vercel and Netlify deploy preview URLs for leaked
// API keys. Deploy previews frequently use different (less restrictive)
// environment variables than production, and their URLs are often guessable
// from PR numbers or commit hashes.
type DeployPreviewSource struct {
	// BaseURL is the root of the preview deployment to probe; Sweep is a
	// no-op when it is empty.
	BaseURL string
	// Registry supplies the provider catalog used by BuildQueries.
	Registry *providers.Registry
	// Limiters, when non-nil, throttles outbound requests per source name.
	Limiters *recon.LimiterRegistry
	// Client performs HTTP requests; a default client is created when nil.
	Client *Client
}

// Compile-time check that DeployPreviewSource satisfies recon.ReconSource.
var _ recon.ReconSource = (*DeployPreviewSource)(nil)
|
||||
|
||||
func (s *DeployPreviewSource) Name() string { return "deploypreview" }
|
||||
func (s *DeployPreviewSource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) }
|
||||
func (s *DeployPreviewSource) Burst() int { return 2 }
|
||||
func (s *DeployPreviewSource) RespectsRobots() bool { return true }
|
||||
func (s *DeployPreviewSource) Enabled(_ recon.Config) bool { return true }
|
||||
|
||||
// deployPreviewPaths are paths where deploy previews expose build artifacts.
// "/" catches inline __NEXT_DATA__ blocks; the remaining paths cover common
// Next.js and CRA bundle locations.
var deployPreviewPaths = []string{
	"/",
	"/_next/data/",
	"/static/js/main.js",
	"/__nextjs_original-stack-frame",
}

// nextDataPattern matches __NEXT_DATA__ script blocks and inline env vars.
// It pairs a frontend env-var prefix (NEXT_PUBLIC_, REACT_APP_, VITE_) or the
// __NEXT_DATA__ marker with an optional key/secret/token suffix and a quoted
// value of 8+ characters.
var nextDataPattern = regexp.MustCompile(`(?i)(__NEXT_DATA__|NEXT_PUBLIC_|REACT_APP_|VITE_)[A-Z_]*(API[_]?KEY|SECRET|TOKEN)?['":\s]*[=:,]\s*['"]([a-zA-Z0-9_\-]{8,})['"]`)
|
||||
|
||||
func (s *DeployPreviewSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
|
||||
base := s.BaseURL
|
||||
if base == "" {
|
||||
return nil
|
||||
}
|
||||
client := s.Client
|
||||
if client == nil {
|
||||
client = NewClient()
|
||||
}
|
||||
|
||||
queries := BuildQueries(s.Registry, "deploypreview")
|
||||
if len(queries) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
for _, q := range queries {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, path := range deployPreviewPaths {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if s.Limiters != nil {
|
||||
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
probeURL := base + path
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, probeURL, nil)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
resp, err := client.Do(ctx, req)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(io.LimitReader(resp.Body, 512*1024))
|
||||
_ = resp.Body.Close()
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if nextDataPattern.Match(body) {
|
||||
out <- recon.Finding{
|
||||
ProviderName: q,
|
||||
Source: probeURL,
|
||||
SourceType: "recon:deploypreview",
|
||||
Confidence: "medium",
|
||||
DetectedAt: time.Now(),
|
||||
}
|
||||
break // one finding per query is sufficient
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
158
pkg/recon/sources/deploypreview_test.go
Normal file
158
pkg/recon/sources/deploypreview_test.go
Normal file
@@ -0,0 +1,158 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
// deployPreviewTestRegistry returns a minimal provider registry with a single
// OpenAI-style provider so BuildQueries yields exactly one query.
func deployPreviewTestRegistry() *providers.Registry {
	return providers.NewRegistryFromProviders([]providers.Provider{
		{Name: "openai", Keywords: []string{"sk-proj-"}},
	})
}

// deployPreviewFixtureHTML is a Next.js-style page whose __NEXT_DATA__ block
// embeds a NEXT_PUBLIC_API_KEY value that nextDataPattern must match.
const deployPreviewFixtureHTML = `<!DOCTYPE html>
<html>
<head><title>My App</title></head>
<body>
<div id="__next"></div>
<script id="__NEXT_DATA__" type="application/json">
{
"props": {
"pageProps": {
"config": {
"NEXT_PUBLIC_API_KEY": "sk-proj-abc123def456ghi789jkl"
}
}
}
}
</script>
</body>
</html>`

// deployPreviewCleanHTML contains no env-var style secrets and must produce
// zero findings.
const deployPreviewCleanHTML = `<!DOCTYPE html>
<html>
<head><title>My App</title></head>
<body>
<div id="root">Hello World</div>
</body>
</html>`
|
||||
|
||||
func TestDeployPreview_Sweep_ExtractsFindings(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
_, _ = w.Write([]byte(deployPreviewFixtureHTML))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
src := &DeployPreviewSource{
|
||||
BaseURL: srv.URL,
|
||||
Registry: deployPreviewTestRegistry(),
|
||||
Client: NewClient(),
|
||||
}
|
||||
|
||||
out := make(chan recon.Finding, 64)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
if err := src.Sweep(ctx, "", out); err != nil {
|
||||
t.Fatalf("Sweep err: %v", err)
|
||||
}
|
||||
close(out)
|
||||
|
||||
var findings []recon.Finding
|
||||
for f := range out {
|
||||
findings = append(findings, f)
|
||||
}
|
||||
if len(findings) == 0 {
|
||||
t.Fatal("expected at least one finding")
|
||||
}
|
||||
for _, f := range findings {
|
||||
if f.SourceType != "recon:deploypreview" {
|
||||
t.Errorf("unexpected SourceType: %s", f.SourceType)
|
||||
}
|
||||
if f.Confidence != "medium" {
|
||||
t.Errorf("unexpected Confidence: %s", f.Confidence)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeployPreview_Sweep_NoFindings_OnCleanPage(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html")
|
||||
_, _ = w.Write([]byte(deployPreviewCleanHTML))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
src := &DeployPreviewSource{
|
||||
BaseURL: srv.URL,
|
||||
Registry: deployPreviewTestRegistry(),
|
||||
Client: NewClient(),
|
||||
}
|
||||
|
||||
out := make(chan recon.Finding, 64)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
if err := src.Sweep(ctx, "", out); err != nil {
|
||||
t.Fatalf("Sweep err: %v", err)
|
||||
}
|
||||
close(out)
|
||||
|
||||
var count int
|
||||
for range out {
|
||||
count++
|
||||
}
|
||||
if count != 0 {
|
||||
t.Errorf("expected 0 findings, got %d", count)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeployPreview_Sweep_CtxCancelled(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
time.Sleep(500 * time.Millisecond)
|
||||
_, _ = w.Write([]byte(deployPreviewFixtureHTML))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
src := &DeployPreviewSource{
|
||||
BaseURL: srv.URL,
|
||||
Registry: deployPreviewTestRegistry(),
|
||||
Limiters: recon.NewLimiterRegistry(),
|
||||
Client: NewClient(),
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel()
|
||||
|
||||
out := make(chan recon.Finding, 4)
|
||||
if err := src.Sweep(ctx, "", out); err == nil {
|
||||
t.Fatal("expected ctx error")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeployPreview_EnabledAlwaysTrue(t *testing.T) {
|
||||
s := &DeployPreviewSource{}
|
||||
if !s.Enabled(recon.Config{}) {
|
||||
t.Fatal("expected Enabled=true")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeployPreview_NameAndRate(t *testing.T) {
|
||||
s := &DeployPreviewSource{}
|
||||
if s.Name() != "deploypreview" {
|
||||
t.Errorf("unexpected name: %s", s.Name())
|
||||
}
|
||||
if s.Burst() != 2 {
|
||||
t.Errorf("burst: %d", s.Burst())
|
||||
}
|
||||
if !s.RespectsRobots() {
|
||||
t.Error("expected RespectsRobots=true")
|
||||
}
|
||||
}
|
||||
111
pkg/recon/sources/envleak.go
Normal file
111
pkg/recon/sources/envleak.go
Normal file
@@ -0,0 +1,111 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"regexp"
|
||||
"time"
|
||||
|
||||
"golang.org/x/time/rate"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
// EnvLeakSource probes for publicly accessible .env files on web servers.
// Many web frameworks (Laravel, Rails, Node/Express, Django) use .env files
// for configuration. Misconfigured servers frequently serve these files
// directly, exposing API keys and database credentials.
type EnvLeakSource struct {
	// BaseURL is the web server root to probe; Sweep is a no-op when empty.
	BaseURL string
	// Registry supplies the provider catalog used by BuildQueries.
	Registry *providers.Registry
	// Limiters, when non-nil, throttles outbound requests per source name.
	Limiters *recon.LimiterRegistry
	// Client performs HTTP requests; a default client is created when nil.
	Client *Client
}

// Compile-time check that EnvLeakSource satisfies recon.ReconSource.
var _ recon.ReconSource = (*EnvLeakSource)(nil)
|
||||
|
||||
func (s *EnvLeakSource) Name() string { return "dotenv" }
|
||||
func (s *EnvLeakSource) RateLimit() rate.Limit { return rate.Every(2 * time.Second) }
|
||||
func (s *EnvLeakSource) Burst() int { return 2 }
|
||||
func (s *EnvLeakSource) RespectsRobots() bool { return true }
|
||||
func (s *EnvLeakSource) Enabled(_ recon.Config) bool { return true }
|
||||
|
||||
// envKeyValuePattern matches KEY=VALUE lines typical of .env files. It is
// multiline and case-insensitive, anchored at line start, and requires a
// sensitive-looking key fragment (API_KEY, SECRET, TOKEN, PASSWORD,
// CREDENTIAL/CREDENTIALS) with a non-empty value.
var envKeyValuePattern = regexp.MustCompile(`(?im)^[A-Z_]*(API[_]?KEY|SECRET|TOKEN|PASSWORD|CREDENTIALS?)[A-Z_]*\s*=\s*\S+`)

// envFilePaths are common locations for exposed .env files, including
// framework-specific variants and typical subdirectory placements.
var envFilePaths = []string{
	"/.env",
	"/.env.local",
	"/.env.production",
	"/.env.development",
	"/.env.backup",
	"/.env.example",
	"/app/.env",
	"/api/.env",
}
|
||||
|
||||
func (s *EnvLeakSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
|
||||
base := s.BaseURL
|
||||
if base == "" {
|
||||
return nil
|
||||
}
|
||||
client := s.Client
|
||||
if client == nil {
|
||||
client = NewClient()
|
||||
}
|
||||
|
||||
queries := BuildQueries(s.Registry, "dotenv")
|
||||
if len(queries) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
for _, q := range queries {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, path := range envFilePaths {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if s.Limiters != nil {
|
||||
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
probeURL := fmt.Sprintf("%s%s", base, path)
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, probeURL, nil)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
resp, err := client.Do(ctx, req)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(io.LimitReader(resp.Body, 64*1024)) // 64KB max
|
||||
_ = resp.Body.Close()
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if envKeyValuePattern.Match(body) {
|
||||
out <- recon.Finding{
|
||||
ProviderName: q,
|
||||
Source: probeURL,
|
||||
SourceType: "recon:dotenv",
|
||||
Confidence: "high",
|
||||
DetectedAt: time.Now(),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
145
pkg/recon/sources/envleak_test.go
Normal file
145
pkg/recon/sources/envleak_test.go
Normal file
@@ -0,0 +1,145 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
// envLeakTestRegistry returns a minimal provider registry with a single
// OpenAI-style provider so BuildQueries yields exactly one query.
func envLeakTestRegistry() *providers.Registry {
	return providers.NewRegistryFromProviders([]providers.Provider{
		{Name: "openai", Keywords: []string{"sk-proj-"}},
	})
}

// envLeakFixture is a leaked .env file containing API_KEY and SECRET lines
// that envKeyValuePattern must match.
const envLeakFixture = `# Application config
APP_NAME=myapp
DATABASE_URL=postgres://user:pass@localhost/db
OPENAI_API_KEY=sk-proj-abc123def456ghi789
AWS_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
DEBUG=false
`

// envLeakCleanFixture is a harmless .env file with no sensitive keys; it must
// produce zero findings.
const envLeakCleanFixture = `# Nothing sensitive here
APP_NAME=myapp
DEBUG=false
LOG_LEVEL=info
`
|
||||
|
||||
func TestEnvLeak_Sweep_ExtractsFindings(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/plain")
|
||||
_, _ = w.Write([]byte(envLeakFixture))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
src := &EnvLeakSource{
|
||||
BaseURL: srv.URL,
|
||||
Registry: envLeakTestRegistry(),
|
||||
Client: NewClient(),
|
||||
}
|
||||
|
||||
out := make(chan recon.Finding, 64)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
if err := src.Sweep(ctx, "", out); err != nil {
|
||||
t.Fatalf("Sweep err: %v", err)
|
||||
}
|
||||
close(out)
|
||||
|
||||
var findings []recon.Finding
|
||||
for f := range out {
|
||||
findings = append(findings, f)
|
||||
}
|
||||
if len(findings) == 0 {
|
||||
t.Fatal("expected at least one finding")
|
||||
}
|
||||
for _, f := range findings {
|
||||
if f.SourceType != "recon:dotenv" {
|
||||
t.Errorf("unexpected SourceType: %s", f.SourceType)
|
||||
}
|
||||
if f.Confidence != "high" {
|
||||
t.Errorf("unexpected Confidence: %s", f.Confidence)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnvLeak_Sweep_NoFindings_OnCleanFile(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/plain")
|
||||
_, _ = w.Write([]byte(envLeakCleanFixture))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
src := &EnvLeakSource{
|
||||
BaseURL: srv.URL,
|
||||
Registry: envLeakTestRegistry(),
|
||||
Client: NewClient(),
|
||||
}
|
||||
|
||||
out := make(chan recon.Finding, 64)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
if err := src.Sweep(ctx, "", out); err != nil {
|
||||
t.Fatalf("Sweep err: %v", err)
|
||||
}
|
||||
close(out)
|
||||
|
||||
var count int
|
||||
for range out {
|
||||
count++
|
||||
}
|
||||
if count != 0 {
|
||||
t.Errorf("expected 0 findings, got %d", count)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnvLeak_Sweep_CtxCancelled(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
time.Sleep(500 * time.Millisecond)
|
||||
_, _ = w.Write([]byte(envLeakFixture))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
src := &EnvLeakSource{
|
||||
BaseURL: srv.URL,
|
||||
Registry: envLeakTestRegistry(),
|
||||
Limiters: recon.NewLimiterRegistry(),
|
||||
Client: NewClient(),
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel()
|
||||
|
||||
out := make(chan recon.Finding, 4)
|
||||
if err := src.Sweep(ctx, "", out); err == nil {
|
||||
t.Fatal("expected ctx error")
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnvLeak_EnabledAlwaysTrue(t *testing.T) {
|
||||
s := &EnvLeakSource{}
|
||||
if !s.Enabled(recon.Config{}) {
|
||||
t.Fatal("expected Enabled=true")
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnvLeak_NameAndRate(t *testing.T) {
|
||||
s := &EnvLeakSource{}
|
||||
if s.Name() != "dotenv" {
|
||||
t.Errorf("unexpected name: %s", s.Name())
|
||||
}
|
||||
if s.Burst() != 2 {
|
||||
t.Errorf("burst: %d", s.Burst())
|
||||
}
|
||||
if !s.RespectsRobots() {
|
||||
t.Error("expected RespectsRobots=true")
|
||||
}
|
||||
}
|
||||
@@ -550,16 +550,9 @@ func TestIntegration_AllSources_SweepAll(t *testing.T) {
|
||||
// helm
|
||||
eng.Register(&HelmSource{BaseURL: srv.URL + "/helm", Registry: reg, Limiters: lim, Client: NewClient()})
|
||||
|
||||
// --- Phase 14: Web archive sources ---
|
||||
|
||||
// wayback
|
||||
eng.Register(&WaybackMachineSource{BaseURL: srv.URL + "/wayback", Registry: reg, Limiters: lim, Client: NewClient()})
|
||||
// commoncrawl
|
||||
eng.Register(&CommonCrawlSource{BaseURL: srv.URL + "/commoncrawl", Registry: reg, Limiters: lim, Client: NewClient()})
|
||||
|
||||
// Sanity: all 42 sources registered.
|
||||
if n := len(eng.List()); n != 42 {
|
||||
t.Fatalf("expected 42 sources on engine, got %d: %v", n, eng.List())
|
||||
// Sanity: all 40 sources registered.
|
||||
if n := len(eng.List()); n != 40 {
|
||||
t.Fatalf("expected 40 sources on engine, got %d: %v", n, eng.List())
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
|
||||
@@ -648,8 +641,8 @@ func TestRegisterAll_Phase12(t *testing.T) {
|
||||
})
|
||||
|
||||
names := eng.List()
|
||||
if n := len(names); n != 42 {
|
||||
t.Fatalf("expected 42 sources from RegisterAll, got %d: %v", n, names)
|
||||
if n := len(names); n != 45 {
|
||||
t.Fatalf("expected 45 sources from RegisterAll, got %d: %v", n, names)
|
||||
}
|
||||
|
||||
// Build lookup for source access.
|
||||
|
||||
@@ -57,8 +57,8 @@ type SourcesConfig struct {
|
||||
|
||||
// RegisterAll registers every Phase 10 code-hosting, Phase 11 search engine /
|
||||
// paste site, Phase 12 IoT scanner / cloud storage, Phase 13 package
|
||||
// registry / container / IaC, and Phase 14 web archive source on engine
|
||||
// (42 sources total).
|
||||
// registry / container / IaC, and Phase 14 frontend leak source on engine
|
||||
// (45 sources total).
|
||||
//
|
||||
// All sources are registered unconditionally so that cmd/recon.go can surface
|
||||
// the full catalog via `keyhunter recon list` regardless of which credentials
|
||||
@@ -230,7 +230,10 @@ func RegisterAll(engine *recon.Engine, cfg SourcesConfig) {
|
||||
engine.Register(&TerraformSource{Registry: reg, Limiters: lim})
|
||||
engine.Register(&HelmSource{Registry: reg, Limiters: lim})
|
||||
|
||||
// Phase 14: Web archive sources (credentialless).
|
||||
engine.Register(&WaybackMachineSource{Registry: reg, Limiters: lim})
|
||||
engine.Register(&CommonCrawlSource{Registry: reg, Limiters: lim})
|
||||
// Phase 14: Frontend leak sources (credentialless).
|
||||
engine.Register(&SourceMapSource{Registry: reg, Limiters: lim})
|
||||
engine.Register(&WebpackSource{Registry: reg, Limiters: lim})
|
||||
engine.Register(&EnvLeakSource{Registry: reg, Limiters: lim})
|
||||
engine.Register(&SwaggerSource{Registry: reg, Limiters: lim})
|
||||
engine.Register(&DeployPreviewSource{Registry: reg, Limiters: lim})
|
||||
}
|
||||
|
||||
@@ -16,9 +16,9 @@ func registerTestRegistry() *providers.Registry {
|
||||
})
|
||||
}
|
||||
|
||||
// TestRegisterAll_WiresAllFortyTwoSources asserts that RegisterAll registers
|
||||
// every Phase 10 + Phase 11 + Phase 12 + Phase 13 + Phase 14 source by its stable name on a fresh engine.
|
||||
func TestRegisterAll_WiresAllFortyTwoSources(t *testing.T) {
|
||||
// TestRegisterAll_WiresAllFortyFiveSources asserts that RegisterAll registers
|
||||
// every Phase 10-14 source by its stable name on a fresh engine.
|
||||
func TestRegisterAll_WiresAllFortyFiveSources(t *testing.T) {
|
||||
eng := recon.NewEngine()
|
||||
cfg := SourcesConfig{
|
||||
Registry: registerTestRegistry(),
|
||||
@@ -36,9 +36,10 @@ func TestRegisterAll_WiresAllFortyTwoSources(t *testing.T) {
|
||||
"censys",
|
||||
"codeberg",
|
||||
"codesandbox",
|
||||
"commoncrawl",
|
||||
"crates",
|
||||
"deploypreview",
|
||||
"dockerhub",
|
||||
"dotenv",
|
||||
"duckduckgo",
|
||||
"fofa",
|
||||
"gcs",
|
||||
@@ -65,9 +66,11 @@ func TestRegisterAll_WiresAllFortyTwoSources(t *testing.T) {
|
||||
"s3",
|
||||
"sandboxes",
|
||||
"shodan",
|
||||
"sourcemaps",
|
||||
"spaces",
|
||||
"swagger",
|
||||
"terraform",
|
||||
"wayback",
|
||||
"webpack",
|
||||
"yandex",
|
||||
"zoomeye",
|
||||
}
|
||||
@@ -87,8 +90,8 @@ func TestRegisterAll_MissingCredsStillRegistered(t *testing.T) {
|
||||
Limiters: recon.NewLimiterRegistry(),
|
||||
})
|
||||
|
||||
if n := len(eng.List()); n != 42 {
|
||||
t.Fatalf("expected 42 sources registered, got %d: %v", n, eng.List())
|
||||
if n := len(eng.List()); n != 45 {
|
||||
t.Fatalf("expected 45 sources registered, got %d: %v", n, eng.List())
|
||||
}
|
||||
|
||||
// SweepAll with an empty config should filter out cred-gated sources
|
||||
|
||||
123
pkg/recon/sources/sourcemap.go
Normal file
123
pkg/recon/sources/sourcemap.go
Normal file
@@ -0,0 +1,123 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"regexp"
|
||||
"time"
|
||||
|
||||
"golang.org/x/time/rate"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
// SourceMapSource probes for publicly accessible JavaScript source maps (.map
// files) that contain original source code. Developers frequently ship source
// maps to production, exposing server-side secrets embedded during bundling.
type SourceMapSource struct {
	// BaseURL is the site root to probe; Sweep emits nothing when empty.
	BaseURL string
	// Registry supplies the provider catalog used by BuildQueries.
	Registry *providers.Registry
	// Limiters, when non-nil, throttles outbound requests per source name.
	Limiters *recon.LimiterRegistry
	// Client performs HTTP requests; a default client is created when nil.
	Client *Client
}

// Compile-time check that SourceMapSource satisfies recon.ReconSource.
var _ recon.ReconSource = (*SourceMapSource)(nil)
|
||||
|
||||
func (s *SourceMapSource) Name() string { return "sourcemaps" }
|
||||
func (s *SourceMapSource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) }
|
||||
func (s *SourceMapSource) Burst() int { return 2 }
|
||||
func (s *SourceMapSource) RespectsRobots() bool { return true }
|
||||
func (s *SourceMapSource) Enabled(_ recon.Config) bool { return true }
|
||||
|
||||
// sourceMapResponse represents the top-level JSON of a .map file. Only the
// fields relevant to secret scanning are decoded.
type sourceMapResponse struct {
	// Sources lists the original file paths referenced by the map.
	Sources []string `json:"sources"`
	// SourcesContent holds the embedded original source text, scanned for
	// API key patterns.
	SourcesContent []string `json:"sourcesContent"`
}

// apiKeyPattern matches common API key patterns in source content: a
// sensitive-looking identifier (api_key, secret, token, password, credential,
// auth) assigned a quoted value of 16+ characters.
var apiKeyPattern = regexp.MustCompile(`(?i)(api[_-]?key|secret|token|password|credential|auth)['":\s]*[=:]\s*['"]([a-zA-Z0-9_\-]{16,})['"]`)

// sourceMapPaths are common locations where source maps are served, covering
// CRA, Vite, Webpack, and Next.js bundle layouts.
var sourceMapPaths = []string{
	"/static/js/main.js.map",
	"/static/js/bundle.js.map",
	"/assets/index.js.map",
	"/dist/bundle.js.map",
	"/main.js.map",
	"/app.js.map",
	"/_next/static/chunks/main.js.map",
}
|
||||
|
||||
func (s *SourceMapSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
|
||||
base := s.BaseURL
|
||||
client := s.Client
|
||||
if client == nil {
|
||||
client = NewClient()
|
||||
}
|
||||
|
||||
queries := BuildQueries(s.Registry, "sourcemaps")
|
||||
if len(queries) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
for _, q := range queries {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Each query is used as a domain/URL hint; probe common map paths.
|
||||
for _, path := range sourceMapPaths {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if s.Limiters != nil {
|
||||
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
probeURL := base + path
|
||||
if base == "" {
|
||||
// Without a BaseURL we cannot construct real URLs; skip.
|
||||
continue
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, probeURL, nil)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
req.Header.Set("Accept", "application/json")
|
||||
|
||||
resp, err := client.Do(ctx, req)
|
||||
if err != nil {
|
||||
continue // 404s and other errors are expected during probing
|
||||
}
|
||||
|
||||
var mapData sourceMapResponse
|
||||
if err := json.NewDecoder(resp.Body).Decode(&mapData); err != nil {
|
||||
_ = resp.Body.Close()
|
||||
continue
|
||||
}
|
||||
_ = resp.Body.Close()
|
||||
|
||||
// Scan sourcesContent for API key patterns.
|
||||
for _, content := range mapData.SourcesContent {
|
||||
if apiKeyPattern.MatchString(content) {
|
||||
out <- recon.Finding{
|
||||
ProviderName: q,
|
||||
Source: probeURL,
|
||||
SourceType: "recon:sourcemaps",
|
||||
Confidence: "medium",
|
||||
DetectedAt: time.Now(),
|
||||
}
|
||||
break // one finding per map file is sufficient
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
143
pkg/recon/sources/sourcemap_test.go
Normal file
143
pkg/recon/sources/sourcemap_test.go
Normal file
@@ -0,0 +1,143 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
// sourceMapTestRegistry returns a minimal provider registry with a single
// OpenAI-style provider so BuildQueries yields exactly one query.
func sourceMapTestRegistry() *providers.Registry {
	return providers.NewRegistryFromProviders([]providers.Provider{
		{Name: "openai", Keywords: []string{"sk-proj-"}},
	})
}

// sourceMapFixtureJSON is a .map file whose sourcesContent embeds an apiKey
// assignment that apiKeyPattern must match.
const sourceMapFixtureJSON = `{
"version": 3,
"sources": ["src/api/client.ts"],
"sourcesContent": ["const apiKey = \"sk-proj-abc123def456ghi789\";\nfetch('/api', {headers: {'Authorization': apiKey}});"]
}`

// sourceMapEmptyFixtureJSON is a .map file with harmless content; it must
// produce zero findings.
const sourceMapEmptyFixtureJSON = `{
"version": 3,
"sources": ["src/index.ts"],
"sourcesContent": ["console.log('hello world');"]
}`
|
||||
|
||||
func TestSourceMap_Sweep_ExtractsFindings(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_, _ = w.Write([]byte(sourceMapFixtureJSON))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
src := &SourceMapSource{
|
||||
BaseURL: srv.URL,
|
||||
Registry: sourceMapTestRegistry(),
|
||||
Client: NewClient(),
|
||||
}
|
||||
|
||||
out := make(chan recon.Finding, 64)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
if err := src.Sweep(ctx, "", out); err != nil {
|
||||
t.Fatalf("Sweep err: %v", err)
|
||||
}
|
||||
close(out)
|
||||
|
||||
var findings []recon.Finding
|
||||
for f := range out {
|
||||
findings = append(findings, f)
|
||||
}
|
||||
if len(findings) == 0 {
|
||||
t.Fatal("expected at least one finding")
|
||||
}
|
||||
for _, f := range findings {
|
||||
if f.SourceType != "recon:sourcemaps" {
|
||||
t.Errorf("unexpected SourceType: %s", f.SourceType)
|
||||
}
|
||||
if f.Confidence != "medium" {
|
||||
t.Errorf("unexpected Confidence: %s", f.Confidence)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestSourceMap_Sweep_NoFindings_OnCleanContent(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_, _ = w.Write([]byte(sourceMapEmptyFixtureJSON))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
src := &SourceMapSource{
|
||||
BaseURL: srv.URL,
|
||||
Registry: sourceMapTestRegistry(),
|
||||
Client: NewClient(),
|
||||
}
|
||||
|
||||
out := make(chan recon.Finding, 64)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
if err := src.Sweep(ctx, "", out); err != nil {
|
||||
t.Fatalf("Sweep err: %v", err)
|
||||
}
|
||||
close(out)
|
||||
|
||||
var count int
|
||||
for range out {
|
||||
count++
|
||||
}
|
||||
if count != 0 {
|
||||
t.Errorf("expected 0 findings, got %d", count)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSourceMap_Sweep_CtxCancelled(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
time.Sleep(500 * time.Millisecond)
|
||||
_, _ = w.Write([]byte(sourceMapFixtureJSON))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
src := &SourceMapSource{
|
||||
BaseURL: srv.URL,
|
||||
Registry: sourceMapTestRegistry(),
|
||||
Limiters: recon.NewLimiterRegistry(),
|
||||
Client: NewClient(),
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel()
|
||||
|
||||
out := make(chan recon.Finding, 4)
|
||||
if err := src.Sweep(ctx, "", out); err == nil {
|
||||
t.Fatal("expected ctx error")
|
||||
}
|
||||
}
|
||||
|
||||
func TestSourceMap_EnabledAlwaysTrue(t *testing.T) {
|
||||
s := &SourceMapSource{}
|
||||
if !s.Enabled(recon.Config{}) {
|
||||
t.Fatal("expected Enabled=true")
|
||||
}
|
||||
}
|
||||
|
||||
func TestSourceMap_NameAndRate(t *testing.T) {
|
||||
s := &SourceMapSource{}
|
||||
if s.Name() != "sourcemaps" {
|
||||
t.Errorf("unexpected name: %s", s.Name())
|
||||
}
|
||||
if s.Burst() != 2 {
|
||||
t.Errorf("burst: %d", s.Burst())
|
||||
}
|
||||
if !s.RespectsRobots() {
|
||||
t.Error("expected RespectsRobots=true")
|
||||
}
|
||||
}
|
||||
118
pkg/recon/sources/swagger.go
Normal file
118
pkg/recon/sources/swagger.go
Normal file
@@ -0,0 +1,118 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"regexp"
|
||||
"time"
|
||||
|
||||
"golang.org/x/time/rate"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
// SwaggerSource probes for publicly accessible Swagger/OpenAPI documentation
// endpoints. Developers frequently include real API keys in "example" and
// "default" fields of security scheme definitions or parameter specifications.
type SwaggerSource struct {
	// BaseURL is the API root to probe; Sweep is a no-op when empty.
	BaseURL string
	// Registry supplies the provider catalog used by BuildQueries.
	Registry *providers.Registry
	// Limiters, when non-nil, throttles outbound requests per source name.
	Limiters *recon.LimiterRegistry
	// Client performs HTTP requests; a default client is created when nil.
	Client *Client
}

// Compile-time check that SwaggerSource satisfies recon.ReconSource.
var _ recon.ReconSource = (*SwaggerSource)(nil)
|
||||
|
||||
func (s *SwaggerSource) Name() string { return "swagger" }
|
||||
func (s *SwaggerSource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) }
|
||||
func (s *SwaggerSource) Burst() int { return 2 }
|
||||
func (s *SwaggerSource) RespectsRobots() bool { return true }
|
||||
func (s *SwaggerSource) Enabled(_ recon.Config) bool { return true }
|
||||
|
||||
// swaggerDocPaths are common locations for Swagger/OpenAPI documentation,
// covering Swagger 2.0, OpenAPI 3, springfox, and .NET layouts.
var swaggerDocPaths = []string{
	"/swagger.json",
	"/openapi.json",
	"/api-docs",
	"/v2/api-docs",
	"/swagger/v1/swagger.json",
	"/docs/openapi.json",
}

// swaggerKeyPattern matches potential API keys in example/default fields of
// Swagger JSON. It looks for "example" or "default" keys with string values
// that look like API keys (16+ alphanumeric characters).
var swaggerKeyPattern = regexp.MustCompile(`"(?:example|default)"\s*:\s*"([a-zA-Z0-9_\-]{16,})"`)
|
||||
|
||||
func (s *SwaggerSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
|
||||
base := s.BaseURL
|
||||
if base == "" {
|
||||
return nil
|
||||
}
|
||||
client := s.Client
|
||||
if client == nil {
|
||||
client = NewClient()
|
||||
}
|
||||
|
||||
queries := BuildQueries(s.Registry, "swagger")
|
||||
if len(queries) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
for _, q := range queries {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, path := range swaggerDocPaths {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if s.Limiters != nil {
|
||||
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
probeURL := base + path
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, probeURL, nil)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
req.Header.Set("Accept", "application/json")
|
||||
|
||||
resp, err := client.Do(ctx, req)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
// Try to parse as JSON to verify it's a valid Swagger doc.
|
||||
var doc map[string]interface{}
|
||||
if err := json.NewDecoder(resp.Body).Decode(&doc); err != nil {
|
||||
_ = resp.Body.Close()
|
||||
continue
|
||||
}
|
||||
_ = resp.Body.Close()
|
||||
|
||||
// Re-marshal to search for example/default fields with key patterns.
|
||||
raw, err := json.Marshal(doc)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if swaggerKeyPattern.Match(raw) {
|
||||
out <- recon.Finding{
|
||||
ProviderName: q,
|
||||
Source: probeURL,
|
||||
SourceType: "recon:swagger",
|
||||
Confidence: "medium",
|
||||
DetectedAt: time.Now(),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
179
pkg/recon/sources/swagger_test.go
Normal file
179
pkg/recon/sources/swagger_test.go
Normal file
@@ -0,0 +1,179 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
func swaggerTestRegistry() *providers.Registry {
|
||||
return providers.NewRegistryFromProviders([]providers.Provider{
|
||||
{Name: "openai", Keywords: []string{"sk-proj-"}},
|
||||
})
|
||||
}
|
||||
|
||||
// swaggerFixtureJSON is an OpenAPI 3 document whose "example" field carries a
// string shaped like an API key; sweeps against it should produce findings.
const swaggerFixtureJSON = `{
  "openapi": "3.0.0",
  "info": {"title": "My API", "version": "1.0"},
  "paths": {
    "/api/data": {
      "get": {
        "parameters": [
          {
            "name": "X-API-Key",
            "in": "header",
            "schema": {"type": "string"},
            "example": "sk-proj-abc123def456ghi789jkl"
          }
        ]
      }
    }
  },
  "components": {
    "securitySchemes": {
      "apiKey": {
        "type": "apiKey",
        "in": "header",
        "name": "Authorization",
        "default": "Bearer sk-live-xxxxxxxxxxxxxxxxxxxx"
      }
    }
  }
}`
|
||||
|
||||
// swaggerCleanFixtureJSON is an OpenAPI 3 document with no key-like
// example/default values; sweeps against it should produce no findings.
const swaggerCleanFixtureJSON = `{
  "openapi": "3.0.0",
  "info": {"title": "My API", "version": "1.0"},
  "paths": {
    "/api/data": {
      "get": {
        "parameters": [
          {
            "name": "limit",
            "in": "query",
            "schema": {"type": "integer"},
            "example": 10
          }
        ]
      }
    }
  }
}`
|
||||
|
||||
func TestSwagger_Sweep_ExtractsFindings(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_, _ = w.Write([]byte(swaggerFixtureJSON))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
src := &SwaggerSource{
|
||||
BaseURL: srv.URL,
|
||||
Registry: swaggerTestRegistry(),
|
||||
Client: NewClient(),
|
||||
}
|
||||
|
||||
out := make(chan recon.Finding, 64)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
if err := src.Sweep(ctx, "", out); err != nil {
|
||||
t.Fatalf("Sweep err: %v", err)
|
||||
}
|
||||
close(out)
|
||||
|
||||
var findings []recon.Finding
|
||||
for f := range out {
|
||||
findings = append(findings, f)
|
||||
}
|
||||
if len(findings) == 0 {
|
||||
t.Fatal("expected at least one finding")
|
||||
}
|
||||
for _, f := range findings {
|
||||
if f.SourceType != "recon:swagger" {
|
||||
t.Errorf("unexpected SourceType: %s", f.SourceType)
|
||||
}
|
||||
if f.Confidence != "medium" {
|
||||
t.Errorf("unexpected Confidence: %s", f.Confidence)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestSwagger_Sweep_NoFindings_OnCleanDoc(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_, _ = w.Write([]byte(swaggerCleanFixtureJSON))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
src := &SwaggerSource{
|
||||
BaseURL: srv.URL,
|
||||
Registry: swaggerTestRegistry(),
|
||||
Client: NewClient(),
|
||||
}
|
||||
|
||||
out := make(chan recon.Finding, 64)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
if err := src.Sweep(ctx, "", out); err != nil {
|
||||
t.Fatalf("Sweep err: %v", err)
|
||||
}
|
||||
close(out)
|
||||
|
||||
var count int
|
||||
for range out {
|
||||
count++
|
||||
}
|
||||
if count != 0 {
|
||||
t.Errorf("expected 0 findings, got %d", count)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSwagger_Sweep_CtxCancelled(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
time.Sleep(500 * time.Millisecond)
|
||||
_, _ = w.Write([]byte(swaggerFixtureJSON))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
src := &SwaggerSource{
|
||||
BaseURL: srv.URL,
|
||||
Registry: swaggerTestRegistry(),
|
||||
Limiters: recon.NewLimiterRegistry(),
|
||||
Client: NewClient(),
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel()
|
||||
|
||||
out := make(chan recon.Finding, 4)
|
||||
if err := src.Sweep(ctx, "", out); err == nil {
|
||||
t.Fatal("expected ctx error")
|
||||
}
|
||||
}
|
||||
|
||||
func TestSwagger_EnabledAlwaysTrue(t *testing.T) {
|
||||
s := &SwaggerSource{}
|
||||
if !s.Enabled(recon.Config{}) {
|
||||
t.Fatal("expected Enabled=true")
|
||||
}
|
||||
}
|
||||
|
||||
func TestSwagger_NameAndRate(t *testing.T) {
|
||||
s := &SwaggerSource{}
|
||||
if s.Name() != "swagger" {
|
||||
t.Errorf("unexpected name: %s", s.Name())
|
||||
}
|
||||
if s.Burst() != 2 {
|
||||
t.Errorf("burst: %d", s.Burst())
|
||||
}
|
||||
if !s.RespectsRobots() {
|
||||
t.Error("expected RespectsRobots=true")
|
||||
}
|
||||
}
|
||||
109
pkg/recon/sources/webpack.go
Normal file
109
pkg/recon/sources/webpack.go
Normal file
@@ -0,0 +1,109 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"regexp"
|
||||
"time"
|
||||
|
||||
"golang.org/x/time/rate"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
// WebpackSource probes for Webpack/Vite build artifacts that contain inlined
// environment variables. Bundlers like Webpack and Vite inline process.env.*
// values at build time, frequently shipping API keys to production bundles.
type WebpackSource struct {
	BaseURL  string                 // root URL to probe; empty disables the sweep
	Registry *providers.Registry    // provider registry used to build sweep queries
	Limiters *recon.LimiterRegistry // optional per-source rate limiters; nil skips limiting
	Client   *Client                // HTTP client; nil falls back to NewClient()
}

// Compile-time check that WebpackSource satisfies recon.ReconSource.
var _ recon.ReconSource = (*WebpackSource)(nil)
||||
|
||||
// Name returns the source identifier; Sweep passes it to the limiter registry.
func (s *WebpackSource) Name() string { return "webpack" }

// RateLimit allows one request every three seconds.
func (s *WebpackSource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) }

// Burst is the limiter burst size used by Sweep's rate limiting.
func (s *WebpackSource) Burst() int { return 2 }

// RespectsRobots signals that robots.txt should be honored for this source
// (enforcement happens in the caller, not here).
func (s *WebpackSource) RespectsRobots() bool { return true }

// Enabled reports whether the source is active; it is always on and ignores
// the recon configuration.
func (s *WebpackSource) Enabled(_ recon.Config) bool { return true }
|
||||
|
||||
// envVarPattern matches inlined environment variable patterns from bundlers:
// a framework prefix (NEXT_PUBLIC_, REACT_APP_, VITE_, VUE_APP_, NUXT_,
// GATSBY_) followed by a secret-ish suffix (API_KEY / SECRET / TOKEN /
// PASSWORD) assigned a quoted value of 8+ [a-zA-Z0-9_-] characters. The (?i)
// flag makes the whole match case-insensitive.
var envVarPattern = regexp.MustCompile(`(?i)(NEXT_PUBLIC_|REACT_APP_|VITE_|VUE_APP_|NUXT_|GATSBY_)[A-Z_]*(API[_]?KEY|SECRET|TOKEN|PASSWORD)['":\s]*[=:,]\s*['"]([a-zA-Z0-9_\-]{8,})['"]`)
|
||||
|
||||
// webpackBundlePaths are common locations for JS bundle artifacts.
// Each path is probed relative to BaseURL during a sweep.
var webpackBundlePaths = []string{
	"/static/js/main.js",
	"/static/js/bundle.js",
	"/_next/static/chunks/main.js",
	"/assets/index.js",
	"/dist/bundle.js",
	"/build/static/js/main.js",
}
|
||||
|
||||
func (s *WebpackSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
|
||||
base := s.BaseURL
|
||||
if base == "" {
|
||||
return nil
|
||||
}
|
||||
client := s.Client
|
||||
if client == nil {
|
||||
client = NewClient()
|
||||
}
|
||||
|
||||
queries := BuildQueries(s.Registry, "webpack")
|
||||
if len(queries) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
for _, q := range queries {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, path := range webpackBundlePaths {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if s.Limiters != nil {
|
||||
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
probeURL := fmt.Sprintf("%s%s", base, path)
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, probeURL, nil)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
resp, err := client.Do(ctx, req)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(io.LimitReader(resp.Body, 512*1024)) // 512KB max
|
||||
_ = resp.Body.Close()
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if envVarPattern.Match(body) {
|
||||
out <- recon.Finding{
|
||||
ProviderName: q,
|
||||
Source: probeURL,
|
||||
SourceType: "recon:webpack",
|
||||
Confidence: "medium",
|
||||
DetectedAt: time.Now(),
|
||||
}
|
||||
break // one finding per query is sufficient
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
146
pkg/recon/sources/webpack_test.go
Normal file
146
pkg/recon/sources/webpack_test.go
Normal file
@@ -0,0 +1,146 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
func webpackTestRegistry() *providers.Registry {
|
||||
return providers.NewRegistryFromProviders([]providers.Provider{
|
||||
{Name: "openai", Keywords: []string{"sk-proj-"}},
|
||||
})
|
||||
}
|
||||
|
||||
// webpackFixtureJS is a bundle snippet carrying inlined env-var assignments
// that look like secrets; sweeps against it should produce findings.
const webpackFixtureJS = `
!function(e){var t={};function n(r){if(t[r])return t[r].exports}
var config = {
  NEXT_PUBLIC_API_KEY: "sk-proj-abc123def456ghi789jkl",
  REACT_APP_SECRET: "super-secret-value-12345678"
};
module.exports = config;
`
|
||||
|
||||
// webpackCleanJS is a bundle snippet with no inlined secrets; sweeps against
// it should produce no findings.
const webpackCleanJS = `
!function(e){var t={};function n(r){if(t[r])return t[r].exports}
console.log("clean bundle");
module.exports = {};
`
|
||||
|
||||
func TestWebpack_Sweep_ExtractsFindings(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/javascript")
|
||||
_, _ = w.Write([]byte(webpackFixtureJS))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
src := &WebpackSource{
|
||||
BaseURL: srv.URL,
|
||||
Registry: webpackTestRegistry(),
|
||||
Client: NewClient(),
|
||||
}
|
||||
|
||||
out := make(chan recon.Finding, 64)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
if err := src.Sweep(ctx, "", out); err != nil {
|
||||
t.Fatalf("Sweep err: %v", err)
|
||||
}
|
||||
close(out)
|
||||
|
||||
var findings []recon.Finding
|
||||
for f := range out {
|
||||
findings = append(findings, f)
|
||||
}
|
||||
if len(findings) == 0 {
|
||||
t.Fatal("expected at least one finding")
|
||||
}
|
||||
for _, f := range findings {
|
||||
if f.SourceType != "recon:webpack" {
|
||||
t.Errorf("unexpected SourceType: %s", f.SourceType)
|
||||
}
|
||||
if f.Confidence != "medium" {
|
||||
t.Errorf("unexpected Confidence: %s", f.Confidence)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestWebpack_Sweep_NoFindings_OnCleanBundle(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/javascript")
|
||||
_, _ = w.Write([]byte(webpackCleanJS))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
src := &WebpackSource{
|
||||
BaseURL: srv.URL,
|
||||
Registry: webpackTestRegistry(),
|
||||
Client: NewClient(),
|
||||
}
|
||||
|
||||
out := make(chan recon.Finding, 64)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
if err := src.Sweep(ctx, "", out); err != nil {
|
||||
t.Fatalf("Sweep err: %v", err)
|
||||
}
|
||||
close(out)
|
||||
|
||||
var count int
|
||||
for range out {
|
||||
count++
|
||||
}
|
||||
if count != 0 {
|
||||
t.Errorf("expected 0 findings, got %d", count)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWebpack_Sweep_CtxCancelled(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
time.Sleep(500 * time.Millisecond)
|
||||
_, _ = w.Write([]byte(webpackFixtureJS))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
src := &WebpackSource{
|
||||
BaseURL: srv.URL,
|
||||
Registry: webpackTestRegistry(),
|
||||
Limiters: recon.NewLimiterRegistry(),
|
||||
Client: NewClient(),
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel()
|
||||
|
||||
out := make(chan recon.Finding, 4)
|
||||
if err := src.Sweep(ctx, "", out); err == nil {
|
||||
t.Fatal("expected ctx error")
|
||||
}
|
||||
}
|
||||
|
||||
func TestWebpack_EnabledAlwaysTrue(t *testing.T) {
|
||||
s := &WebpackSource{}
|
||||
if !s.Enabled(recon.Config{}) {
|
||||
t.Fatal("expected Enabled=true")
|
||||
}
|
||||
}
|
||||
|
||||
func TestWebpack_NameAndRate(t *testing.T) {
|
||||
s := &WebpackSource{}
|
||||
if s.Name() != "webpack" {
|
||||
t.Errorf("unexpected name: %s", s.Name())
|
||||
}
|
||||
if s.Burst() != 2 {
|
||||
t.Errorf("burst: %d", s.Burst())
|
||||
}
|
||||
if !s.RespectsRobots() {
|
||||
t.Error("expected RespectsRobots=true")
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user