merge: phase 14-03 frontend leaks

This commit is contained in:
salvacybersec
2026-04-06 13:21:39 +03:00
38 changed files with 2644 additions and 29 deletions

Submodule .claude/worktrees/agent-a090b6ec added at a75d81a8d6

Submodule .claude/worktrees/agent-a11dddbd added at 8d97b263ec

Submodule .claude/worktrees/agent-a19eb2f7 added at d98513bf55

Submodule .claude/worktrees/agent-a1a93bb2 added at 6ab411cda2

Submodule .claude/worktrees/agent-a1ab7cd2/.claude/worktrees/agent-a30fab90/.claude/worktrees/agent-a3b639bf/.claude/worktrees/agent-a9511329/.claude/worktrees/agent-aed10f3e/.claude/worktrees/agent-a44a25be added at 0ff9edc6c1

Submodule .claude/worktrees/agent-a2637f83 added at 3d3c57fff2

Submodule .claude/worktrees/agent-a27c3406 added at 61a9d527ee

Submodule .claude/worktrees/agent-a2e54e09 added at d0396bb384

Submodule .claude/worktrees/agent-a2fe7ff3 added at 223c23e672

Submodule .claude/worktrees/agent-a309b50b/.claude/worktrees/agent-a1113d5a added at 1013caf843

Submodule .claude/worktrees/agent-a309b50b/.claude/worktrees/agent-ad901ba0 added at abfc2f8319

Submodule .claude/worktrees/agent-a309b50b/.claude/worktrees/agent-adad8c10 added at 95ee768266

Submodule .claude/worktrees/agent-a5bf4f07 added at 43aeb8985d

Submodule .claude/worktrees/agent-a5d8d812 added at 6303308207

Submodule .claude/worktrees/agent-a6700ee2 added at d8a54f2c16

Submodule .claude/worktrees/agent-a7f84823 added at 21d5551aa4

Submodule .claude/worktrees/agent-abce7711 added at c595fef148

Submodule .claude/worktrees/agent-ac81d6ab added at cae714b488

Submodule .claude/worktrees/agent-ad7ef8d3 added at 792ac8d54b

Submodule .claude/worktrees/agent-ae6d1042/.claude/worktrees/agent-a0a11e9a added at a639cdea02

Submodule .claude/worktrees/agent-aefa9208 added at a2347f150a

View File

@@ -173,11 +173,11 @@ Requirements for initial release. Each maps to roadmap phases.
### OSINT/Recon — Frontend & JS Leaks
- [ ] **RECON-JS-01**: JavaScript source map extraction and scanning
- [ ] **RECON-JS-02**: Webpack/Vite bundle scanning for inlined env vars
- [ ] **RECON-JS-03**: Exposed .env file scanning on web servers
- [ ] **RECON-JS-04**: Exposed Swagger/OpenAPI documentation scanning
- [ ] **RECON-JS-05**: Vercel/Netlify deploy preview JS bundle scanning
- [x] **RECON-JS-01**: JavaScript source map extraction and scanning
- [x] **RECON-JS-02**: Webpack/Vite bundle scanning for inlined env vars
- [x] **RECON-JS-03**: Exposed .env file scanning on web servers
- [x] **RECON-JS-04**: Exposed Swagger/OpenAPI documentation scanning
- [x] **RECON-JS-05**: Vercel/Netlify deploy preview JS bundle scanning
### OSINT/Recon — Log Aggregators

View File

@@ -0,0 +1,152 @@
---
phase: 14-osint_ci_cd_logs_web_archives_frontend_leaks
plan: 03
subsystem: recon
tags: [sourcemaps, webpack, dotenv, swagger, openapi, vercel, netlify, frontend-leaks]
requires:
- phase: 10-osint-code-hosting
provides: "ReconSource interface, Client, BuildQueries, LimiterRegistry patterns"
- phase: 13-osint-package-registries
provides: "RegisterAll with 40 sources baseline"
provides:
- "SourceMapSource for probing .map files for original source with API keys"
- "WebpackSource for scanning JS bundles for inlined env vars"
- "EnvLeakSource for detecting exposed .env files on web servers"
- "SwaggerSource for finding API keys in OpenAPI example/default fields"
- "DeployPreviewSource for scanning Vercel/Netlify previews for leaked env vars"
- "RegisterAll extended to 45 sources"
affects: [14-04, 14-05, 15, 16]
tech-stack:
added: []
patterns: ["Multi-path probing pattern for credentialless web asset scanning"]
key-files:
created:
- pkg/recon/sources/sourcemap.go
- pkg/recon/sources/sourcemap_test.go
- pkg/recon/sources/webpack.go
- pkg/recon/sources/webpack_test.go
- pkg/recon/sources/envleak.go
- pkg/recon/sources/envleak_test.go
- pkg/recon/sources/swagger.go
- pkg/recon/sources/swagger_test.go
- pkg/recon/sources/deploypreview.go
- pkg/recon/sources/deploypreview_test.go
modified:
- pkg/recon/sources/register.go
- pkg/recon/sources/register_test.go
- pkg/recon/sources/integration_test.go
key-decisions:
- "Multi-path probing: each source probes multiple common paths per query rather than single endpoint"
- "Nil Limiters in tests: skip rate limiting in httptest to keep tests fast (<1s)"
- "RegisterAll extended to 45 sources (40 Phase 10-13 + 5 Phase 14 frontend leak sources)"
patterns-established:
- "Multi-path probing pattern: sources that probe multiple common URL paths per domain/query hint"
- "Regex-based content scanning: compile-time regex patterns for detecting secrets in response bodies"
requirements-completed: [RECON-JS-01, RECON-JS-02, RECON-JS-03, RECON-JS-04, RECON-JS-05]
duration: 5min
completed: 2026-04-06
---
# Phase 14 Plan 03: Frontend Leak Sources Summary
**Five credentialless frontend leak scanners: source maps, webpack bundles, exposed .env files, Swagger docs, and deploy preview environments**
## Performance
- **Duration:** 5 min
- **Started:** 2026-04-06T10:13:15Z
- **Completed:** 2026-04-06T10:18:15Z
- **Tasks:** 2
- **Files modified:** 13
## Accomplishments
- SourceMapSource probes 7 common .map paths, parses JSON sourcesContent for API key patterns
- WebpackSource scans JS bundles for NEXT_PUBLIC_/REACT_APP_/VITE_ prefixed env var leaks
- EnvLeakSource probes 8 common .env paths with multiline regex matching for secret key=value lines
- SwaggerSource parses OpenAPI JSON docs for API keys in example/default fields
- DeployPreviewSource scans Vercel/Netlify preview URLs for __NEXT_DATA__ and env var patterns
- RegisterAll extended from 40 to 45 sources
## Task Commits
Each task was committed atomically:
1. **Task 1: SourceMapSource, WebpackSource, EnvLeakSource + tests** - `b57bd5e` (feat)
2. **Task 2: SwaggerSource, DeployPreviewSource + tests** - `7d8a418` (feat)
3. **RegisterAll wiring** - `0a8be81` (feat)
## Files Created/Modified
- `pkg/recon/sources/sourcemap.go` - Source map file probing and content scanning
- `pkg/recon/sources/sourcemap_test.go` - httptest-based tests for source map scanning
- `pkg/recon/sources/webpack.go` - Webpack/Vite bundle env var detection
- `pkg/recon/sources/webpack_test.go` - httptest-based tests for webpack scanning
- `pkg/recon/sources/envleak.go` - Exposed .env file detection
- `pkg/recon/sources/envleak_test.go` - httptest-based tests for .env scanning
- `pkg/recon/sources/swagger.go` - Swagger/OpenAPI doc API key extraction
- `pkg/recon/sources/swagger_test.go` - httptest-based tests for Swagger scanning
- `pkg/recon/sources/deploypreview.go` - Vercel/Netlify deploy preview scanning
- `pkg/recon/sources/deploypreview_test.go` - httptest-based tests for deploy preview scanning
- `pkg/recon/sources/register.go` - Extended RegisterAll to 45 sources
- `pkg/recon/sources/register_test.go` - Updated test expectations to 45
- `pkg/recon/sources/integration_test.go` - Updated integration test count to 45
## Decisions Made
- Multi-path probing: each source probes multiple common URL paths per query rather than constructing real domain URLs (sources are lead generators)
- Nil Limiters in sweep tests: the rate limiter adds 3s per path probe, making tests take 20+ seconds; skip it in unit tests and test rate limiting separately
- envKeyValuePattern uses the `(?im)` flags (case-insensitive + multiline) so `^` anchors match at each line start within multi-line .env file content
## Deviations from Plan
### Auto-fixed Issues
**1. [Rule 1 - Bug] Fixed multiline regex in EnvLeakSource**
- **Found during:** Task 1 (EnvLeakSource tests)
- **Issue:** envKeyValuePattern used ^ anchor without (?m) multiline flag, failing to match lines in multi-line .env content
- **Fix:** Added (?m) flag to regex: `(?im)^[A-Z_]*(API[_]?KEY|SECRET|...)`
- **Files modified:** pkg/recon/sources/envleak.go
- **Verification:** TestEnvLeak_Sweep_ExtractsFindings passes
- **Committed in:** b57bd5e (Task 1 commit)
**2. [Rule 1 - Bug] Removed unused imports in sourcemap.go**
- **Found during:** Task 1 (compilation)
- **Issue:** "fmt" and "strings" imported but unused
- **Fix:** Removed unused imports
- **Files modified:** pkg/recon/sources/sourcemap.go
- **Committed in:** b57bd5e (Task 1 commit)
**3. [Rule 2 - Missing Critical] Extended RegisterAll and updated integration tests**
- **Found during:** After Task 2 (wiring sources)
- **Issue:** New sources needed registration in RegisterAll; existing tests hardcoded 40 source count
- **Fix:** Added 5 sources to RegisterAll, updated register_test.go and integration_test.go
- **Files modified:** pkg/recon/sources/register.go, register_test.go, integration_test.go
- **Committed in:** 0a8be81
---
**Total deviations:** 3 auto-fixed (2 bugs, 1 missing critical)
**Impact on plan:** All fixes necessary for correctness. No scope creep.
## Issues Encountered
None beyond the auto-fixed deviations above.
## User Setup Required
None - all five sources are credentialless.
## Known Stubs
None - all sources are fully implemented with real scanning logic.
## Next Phase Readiness
- 45 sources now registered in RegisterAll
- Frontend leak scanning vectors covered: source maps, webpack bundles, .env files, Swagger docs, deploy previews
- Ready for remaining Phase 14 plans (CI/CD log sources, web archive sources)
---
*Phase: 14-osint_ci_cd_logs_web_archives_frontend_leaks*
*Completed: 2026-04-06*

548
RESEARCH_REPORT.md Normal file
View File

@@ -0,0 +1,548 @@
# API Key Scanner Market Research Report
**Date: April 4, 2026**
---
## Table of Contents
1. [Existing Open-Source API Key Scanners](#1-existing-open-source-api-key-scanners)
2. [LLM-Specific API Key Tools](#2-llm-specific-api-key-tools)
3. [Top LLM API Providers (100+)](#3-top-llm-api-providers)
4. [API Key Patterns by Provider](#4-api-key-patterns-by-provider)
5. [Key Validation Approaches](#5-key-validation-approaches)
6. [Market Gaps & Opportunities](#6-market-gaps--opportunities)
---
## 1. Existing Open-Source API Key Scanners
### 1.1 TruffleHog
- **GitHub:** https://github.com/trufflesecurity/trufflehog
- **Stars:** ~25,500
- **Language:** Go
- **Detectors:** 800+ secret types
- **Approach:** Detector-based (each detector is a small Go program for a specific credential type)
- **Detection methods:**
- Pattern matching via dedicated detectors
- Active verification against live APIs
- Permission/scope analysis (~20 credential types)
- **AI/LLM detectors confirmed:** OpenAI, OpenAI Admin Key, Anthropic
- **Scanning sources:** Git repos, GitHub orgs, S3 buckets, GCS, Docker images, Jenkins, Elasticsearch, Postman, Slack, local filesystems
- **Key differentiator:** Verification — not just "this looks like a key" but "this is an active key with these permissions"
- **Limitations:**
- Heavy/slow compared to regex-only scanners
- Not all 800+ detectors have verification
- LLM provider coverage still incomplete (no confirmed Cohere, Mistral, Groq detectors)
### 1.2 Gitleaks
- **GitHub:** https://github.com/gitleaks/gitleaks
- **Stars:** ~25,800
- **Language:** Go
- **Rules:** 150+ regex patterns in `gitleaks.toml`
- **Approach:** Regex pattern matching with optional entropy checks
- **Detection methods:**
- Regex patterns defined in TOML config
- Keyword matching
- Entropy thresholds
- Allowlists for false positive reduction
- **AI/LLM rules confirmed:**
- `anthropic-admin-api-key`: `sk-ant-admin01-[a-zA-Z0-9_\-]{93}AA`
- `anthropic-api-key`: `sk-ant-api03-[a-zA-Z0-9_\-]{93}AA`
- `openai-api-key`: Updated to include `sk-proj-` and `sk-svcacct-` formats
- `cohere-api-token`: Keyword-based detection
- `huggingface-access-token`: `hf_[a-zA-Z]{34}`
- `huggingface-organization-api-token`: `api_org_[a-zA-Z]{34}`
- **Key differentiator:** Fast, simple, excellent as pre-commit hook
- **Limitations:**
- No active verification of detected keys
- Regex-only means higher false positive rate for generic patterns
- Limited LLM provider coverage beyond the 5 above
- **Note:** Gitleaks creator launched "Betterleaks" in 2026 as a successor built for the agentic era
### 1.3 detect-secrets (Yelp)
- **GitHub:** https://github.com/Yelp/detect-secrets
- **Stars:** ~4,300
- **Language:** Python
- **Plugins:** 27 built-in detectors
- **Approach:** Baseline methodology — tracks known secrets and flags new ones
- **Detection methods:**
- Regex-based plugins (structured secrets)
- High entropy string detection (Base64, Hex)
- Keyword detection (variable name matching)
- Optional ML-based gibberish detector (v1.1+)
- **AI/LLM plugins confirmed:**
- `OpenAIDetector` plugin exists
- No dedicated Anthropic, Cohere, Mistral, or Groq plugins
- **Key differentiator:** Baseline approach — only flags NEW secrets, not historical ones; enterprise-friendly
- **Limitations:**
- Minimal LLM provider coverage
- No active verification
- Fewer patterns than TruffleHog or Gitleaks
- Python-only (slower than Go/Rust alternatives)
### 1.4 Nosey Parker (Praetorian)
- **GitHub:** https://github.com/praetorian-inc/noseyparker
- **Stars:** ~2,300
- **Language:** Rust
- **Rules:** 188 high-precision regex rules
- **Approach:** Hybrid regex + ML denoising
- **Detection methods:**
- 188 tested regex rules tuned for low false positives
- ML model for false positive reduction (10-1000x improvement)
- Deduplication/grouping of findings
- **Performance:** GB/s scanning speeds, tested on 20TB+ datasets
- **Key differentiator:** ML-enhanced denoising, extreme performance
- **Status:** RETIRED — replaced by Titus (https://github.com/praetorian-inc/titus)
- **Limitations:**
- No specific LLM provider rules documented
- No active verification
- Project discontinued
### 1.5 GitGuardian
- **Website:** https://www.gitguardian.com
- **Type:** Commercial + free tier for public repos
- **Detectors:** 450+ secret types
- **Approach:** Regex + AI-powered false positive reduction
- **Detection methods:**
- Specific prefix-based detectors
- Fine-tuned code-LLM for false positive filtering
- Validity checking for supported detectors
- **AI/LLM coverage:**
- Groq API Key (prefixed, with validity check)
- OpenAI, Anthropic, HuggingFace (confirmed)
- AI-related leaked secrets up 81% YoY in 2025
- 1,275,105 leaked AI service secrets detected in 2025
- **Key differentiator:** AI-powered false positive reduction, massive scale (scans all public GitHub)
- **Limitations:**
- Commercial/proprietary for private repos
- Regex patterns not publicly disclosed
### 1.6 GitHub Secret Scanning (Native)
- **Type:** Built into GitHub
- **Approach:** Provider-partnered pattern matching + Copilot AI
- **AI/LLM patterns supported (with push protection and validity status):**
| Provider | Pattern | Push Protection | Validity Check |
|----------|---------|:-:|:-:|
| Anthropic | `anthropic_admin_api_key` | Yes | Yes |
| Anthropic | `anthropic_api_key` | Yes | Yes |
| Anthropic | `anthropic_session_id` | Yes | No |
| Cohere | `cohere_api_key` | Yes | No |
| DeepSeek | `deepseek_api_key` | No | Yes |
| Google | `google_gemini_api_key` | No | No |
| Groq | `groq_api_key` | Yes | Yes |
| Hugging Face | `hf_org_api_key` | Yes | No |
| Hugging Face | `hf_user_access_token` | Yes | Yes |
| Mistral AI | `mistral_ai_api_key` | No | No |
| OpenAI | `openai_api_key` | Yes | Yes |
| Replicate | `replicate_api_token` | Yes | Yes |
| xAI | `xai_api_key` | Yes | Yes |
| Azure | `azure_openai_key` | Yes | No |
- **Recent developments (March 2026):**
- Added 37 new secret detectors including Langchain
- Extended scanning to AI coding agents via MCP
- Copilot uses GPT-3.5-Turbo + GPT-4 for unstructured secret detection (94% FP reduction)
- Base64-encoded secret detection with push protection
### 1.7 Other Notable Tools
| Tool | Stars | Language | Patterns | Key Feature |
|------|-------|----------|----------|-------------|
| **KeyHacks** (streaak) | 6,100 | Markdown/Shell | 100+ services | Validation curl commands for bug bounty |
| **keyhacks.sh** (gwen001) | ~500 | Bash | 50+ | Automated version of KeyHacks |
| **Secrets Patterns DB** (mazen160) | 1,400 | YAML/Regex | 1,600+ | Largest open-source regex DB, exports to TruffleHog/Gitleaks format |
| **secret-regex-list** (h33tlit) | ~1,000 | Regex | 100+ | Regex patterns for scraping secrets |
| **regextokens** (odomojuli) | ~300 | Regex | 50+ | OAuth/API token regex patterns |
| **Betterleaks** | New (2026) | Go | — | Gitleaks successor for agentic era |
---
## 2. LLM-Specific API Key Tools
### 2.1 Dedicated LLM Key Validators
| Tool | URL | Providers | Approach |
|------|-----|-----------|----------|
| **TestMyAPIKey.com** | testmyapikey.com | OpenAI, Anthropic Claude, + 13 others | Client-side regex + live API validation |
| **SecurityWall Checker** | securitywall.co/tools/api-key-checker | 455+ patterns, 350+ services (incl. OpenAI, Anthropic) | Client-side regex, generates curl commands |
| **VibeFactory Scanner** | vibefactory.ai/api-key-security-scanner | 150+ types (incl. OpenAI) | Scans deployed websites for exposed keys |
| **KeyLeak Detector** | github.com/Amal-David/keyleak-detector | Multiple | Headless browser + network interception |
| **OpenAI Key Tester** | trevorfox.com/api-key-tester/openai | OpenAI, Anthropic | Direct API validation |
| **Chatbot API Tester** | apikeytester.netlify.app | OpenAI, DeepSeek, OpenRouter | Endpoint validation |
| **SecurityToolkits** | securitytoolkits.com/tools/apikey-validator | Multiple | API key/token checker |
### 2.2 LLM Gateways with Key Validation
These tools validate keys as part of their proxy/gateway functionality:
| Tool | Stars | Providers | Validation Approach |
|------|-------|-----------|---------------------|
| **LiteLLM** | ~18k | 107 providers | AuthenticationError mapping from all providers |
| **OpenRouter** | — | 60+ providers, 500+ models | Unified API key, provider-level validation |
| **Portkey AI** | ~5k | 30+ providers | AI gateway with key validation |
| **LLM-API-Key-Proxy** | ~200 | OpenAI, Anthropic compatible | Self-hosted proxy with key validation |
### 2.3 Key Gap: No Comprehensive LLM-Focused Scanner
**Critical finding:** There is NO dedicated open-source tool that:
1. Detects API keys from all major LLM providers (50+)
2. Validates them against live APIs
3. Reports provider, model access, rate limits, and spend
4. Covers both legacy and new key formats
The closest tools are:
- TruffleHog (broadest verification, but only ~3 confirmed LLM detectors)
- GitHub Secret Scanning (14 AI-related patterns, but GitHub-only)
- GitGuardian (broad AI coverage, but commercial)
---
## 3. Top LLM API Providers
### Tier 1: Major Cloud & Frontier Model Providers
| # | Provider | Key Product | Notes |
|---|----------|-------------|-------|
| 1 | **OpenAI** | GPT-5, GPT-4o, o-series | Market leader |
| 2 | **Anthropic** | Claude Opus 4, Sonnet, Haiku | Enterprise focus |
| 3 | **Google (Gemini/Vertex AI)** | Gemini 2.5 Pro/Flash | 2M token context |
| 4 | **AWS Bedrock** | Multi-model (Claude, Llama, etc.) | AWS ecosystem |
| 5 | **Azure OpenAI** | GPT-4o, o-series | Enterprise SLA 99.9% |
| 6 | **Google AI Studio** | Gemini API | Developer-friendly |
| 7 | **xAI** | Grok 4.1 | 2M context, low cost |
### Tier 2: Specialized & Competitive Providers
| # | Provider | Key Product | Notes |
|---|----------|-------------|-------|
| 8 | **Mistral AI** | Mistral Large, Codestral | European, open-weight |
| 9 | **Cohere** | Command R+ | Enterprise RAG focus |
| 10 | **DeepSeek** | DeepSeek R1, V3 | Ultra-low cost reasoning |
| 11 | **Perplexity** | Sonar Pro | Search-augmented LLM |
| 12 | **Together AI** | 200+ open-source models | Low latency inference |
| 13 | **Groq** | LPU inference | Fastest inference speeds |
| 14 | **Fireworks AI** | Open-source model hosting | Sub-100ms latency |
| 15 | **Replicate** | Model hosting platform | Pay-per-use |
| 16 | **Cerebras** | Wafer-scale inference | Ultra-fast inference |
| 17 | **SambaNova** | Enterprise inference | Custom silicon |
| 18 | **AI21** | Jamba models | Long context |
| 19 | **Stability AI** | Stable Diffusion, text models | Image + text |
| 20 | **NVIDIA NIM** | Optimized model serving | GPU-optimized |
### Tier 3: Infrastructure, Platform & Gateway Providers
| # | Provider | Key Product | Notes |
|---|----------|-------------|-------|
| 21 | **Cloudflare Workers AI** | Edge inference | Edge computing |
| 22 | **Vercel AI** | AI SDK, v0 | Frontend-focused |
| 23 | **OpenRouter** | Multi-model gateway | 500+ models |
| 24 | **HuggingFace** | Inference API, 300+ models | Open-source hub |
| 25 | **DeepInfra** | Inference platform | Cost-effective |
| 26 | **Novita AI** | 200+ production APIs | Multi-modal |
| 27 | **Baseten** | Model serving | Custom deployments |
| 28 | **Anyscale** | Ray-based inference | Scalable |
| 29 | **Lambda AI** | GPU cloud + inference | |
| 30 | **OctoAI** | Optimized inference | |
| 31 | **Databricks** | DBRX, model serving | Data + AI |
| 32 | **Snowflake** | Cortex AI | Data warehouse + AI |
| 33 | **Oracle OCI** | OCI AI | Enterprise |
| 34 | **SAP Generative AI Hub** | Enterprise AI | SAP ecosystem |
| 35 | **IBM WatsonX** | Granite models | Enterprise |
### Tier 4: Chinese & Regional Providers
| # | Provider | Key Product | Notes |
|---|----------|-------------|-------|
| 36 | **Alibaba (Qwen/Dashscope)** | Qwen 2.5/3 series | Top Chinese open-source |
| 37 | **Baidu (Wenxin/ERNIE)** | ERNIE 4.0 | Chinese market leader |
| 38 | **ByteDance (Doubao)** | Doubao/Kimi | TikTok parent |
| 39 | **Zhipu AI** | GLM-4.5 | ChatGLM lineage |
| 40 | **Baichuan** | Baichuan 4 | Domain-specific (law, finance) |
| 41 | **Moonshot AI (Kimi)** | Kimi K1.5/K2 | 128K context |
| 42 | **01.AI (Yi)** | Yi-Large, Yi-34B | Founded by Kai-Fu Lee |
| 43 | **MiniMax** | MiniMax models | Chinese AI tiger |
| 44 | **StepFun** | Step models | Chinese AI tiger |
| 45 | **Tencent (Hunyuan)** | Hunyuan models | WeChat ecosystem |
| 46 | **iFlyTek (Spark)** | Spark models | Voice/NLP specialist |
| 47 | **SenseNova (SenseTime)** | SenseNova models | Vision + language |
| 48 | **Volcano Engine (ByteDance)** | Cloud AI services | ByteDance cloud |
| 49 | **Nebius AI** | Inference platform | Yandex spinoff |
### Tier 5: Emerging, Niche & Specialized Providers
| # | Provider | Key Product | Notes |
|---|----------|-------------|-------|
| 50 | **Aleph Alpha** | Luminous models | EU-focused, compliance |
| 51 | **Comet API** | ML experiment tracking | |
| 52 | **Writer** | Palmyra models | Enterprise content |
| 53 | **Reka AI** | Reka Core/Flash | Multimodal |
| 54 | **Upstage** | Solar models | Korean provider |
| 55 | **FriendliAI** | Inference optimization | |
| 56 | **Forefront AI** | Model hosting | |
| 57 | **GooseAI** | GPT-NeoX hosting | Low cost |
| 58 | **NLP Cloud** | Model hosting | |
| 59 | **Predibase** | Fine-tuning platform | LoRA specialist |
| 60 | **Clarifai** | Vision + LLM | |
| 61 | **AiLAYER** | AI platform | |
| 62 | **AIMLAPI** | Multi-model API | |
| 63 | **Corcel** | Decentralized inference | Bittensor-based |
| 64 | **HyperBee AI** | AI platform | |
| 65 | **Lamini** | Fine-tuning + inference | |
| 66 | **Monster API** | GPU inference | |
| 67 | **Neets.ai** | TTS + LLM | |
| 68 | **Featherless AI** | Inference | |
| 69 | **Hyperbolic** | Inference platform | |
| 70 | **Inference.net** | Open-source inference | |
| 71 | **Galadriel** | Decentralized AI | |
| 72 | **PublicAI** | Community inference | |
| 73 | **Bytez** | Model hosting | |
| 74 | **Chutes** | Inference | |
| 75 | **GMI Cloud** | GPU cloud + inference | |
| 76 | **Nscale** | Inference platform | |
| 77 | **Scaleway** | European cloud AI | |
| 78 | **OVHCloud AI** | European cloud AI | |
| 79 | **Heroku AI** | PaaS AI add-on | |
| 80 | **Sarvam.ai** | Indian AI models | |
### Tier 6: Self-Hosted & Local Inference
| # | Provider | Key Product | Notes |
|---|----------|-------------|-------|
| 81 | **Ollama** | Local LLM runner | No API key needed |
| 82 | **LM Studio** | Desktop LLM | No API key needed |
| 83 | **vLLM** | Inference engine | Self-hosted |
| 84 | **Llamafile** | Single-file LLM | Self-hosted |
| 85 | **Xinference** | Inference platform | Self-hosted |
| 86 | **Triton Inference Server** | NVIDIA serving | Self-hosted |
| 87 | **LlamaGate** | Gateway | Self-hosted |
| 88 | **Docker Model Runner** | Container inference | Self-hosted |
### Tier 7: Aggregators, Gateways & Middleware
| # | Provider | Key Product | Notes |
|---|----------|-------------|-------|
| 89 | **LiteLLM** | AI gateway (107 providers) | Open-source |
| 90 | **Portkey** | AI gateway | Observability |
| 91 | **Helicone** | LLM observability | Proxy-based |
| 92 | **Bifrost** | AI gateway (Go) | Fastest gateway |
| 93 | **Kong AI Gateway** | API management | Enterprise |
| 94 | **Vercel AI Gateway** | Edge AI | |
| 95 | **Cloudflare AI Gateway** | Edge AI | |
| 96 | **Agenta** | LLM ops platform | |
| 97 | **Straico** | Multi-model | |
| 98 | **AI302** | Gateway | |
| 99 | **AIHubMix** | Gateway | |
| 100 | **Zenmux** | Gateway | |
| 101 | **Poe** | Multi-model chat | Quora |
| 102 | **Gitee AI** | Chinese GitHub AI | |
| 103 | **GitHub Models** | GitHub-hosted inference | |
| 104 | **GitHub Copilot** | Code completion | |
| 105 | **ModelScope** | Chinese model hub | Alibaba |
| 106 | **Voyage AI** | Embeddings | |
| 107 | **Jina AI** | Embeddings + search | |
| 108 | **Deepgram** | Speech-to-text | |
| 109 | **ElevenLabs** | Text-to-speech | |
| 110 | **Black Forest Labs** | Image generation (FLUX) | |
| 111 | **Fal AI** | Image/video generation | |
| 112 | **RunwayML** | Video generation | |
| 113 | **Recraft** | Image generation | |
| 114 | **DataRobot** | ML platform | |
| 115 | **Weights & Biases** | ML ops + inference | |
| 116 | **CompactifAI** | Model compression | |
| 117 | **GradientAI** | Fine-tuning | |
| 118 | **Topaz** | AI platform | |
| 119 | **Synthetic** | Data generation | |
| 120 | **Infiniai** | Inference | |
| 121 | **Higress** | AI gateway | Alibaba |
| 122 | **PPIO** | Inference | |
| 123 | **Qiniu** | Chinese cloud AI | |
| 124 | **NanoGPT** | Lightweight inference | |
| 125 | **Morph** | AI platform | |
| 126 | **Milvus** | Vector DB + AI | |
| 127 | **XiaoMi MiMo** | Xiaomi AI | |
| 128 | **Petals** | Distributed inference | |
| 129 | **ZeroOne** | AI platform | |
| 130 | **Lemonade** | AI platform | |
| 131 | **Taichu** | Chinese AI | |
| 132 | **Amazon Nova** | AWS native models | |
---
## 4. API Key Patterns by Provider
### 4.1 Confirmed Key Prefixes & Formats
| Provider | Prefix | Regex Pattern | Confidence |
|----------|--------|---------------|------------|
| **OpenAI (legacy)** | `sk-` | `sk-[a-zA-Z0-9]{48}` | High |
| **OpenAI (project)** | `sk-proj-` | `sk-proj-[a-zA-Z0-9_-]{80,}` | High |
| **OpenAI (service account)** | `sk-svcacct-` | `sk-svcacct-[a-zA-Z0-9_-]{80,}` | High |
| **OpenAI (legacy user)** | `sk-None-` | `sk-None-[a-zA-Z0-9_-]{80,}` | High |
| **Anthropic (API)** | `sk-ant-api03-` | `sk-ant-api03-[a-zA-Z0-9_\-]{93}AA` | High |
| **Anthropic (Admin)** | `sk-ant-admin01-` | `sk-ant-admin01-[a-zA-Z0-9_\-]{93}AA` | High |
| **Google AI / Gemini** | `AIza` | `AIza[0-9A-Za-z\-_]{35}` | High |
| **HuggingFace (user)** | `hf_` | `hf_[a-zA-Z]{34}` | High |
| **HuggingFace (org)** | `api_org_` | `api_org_[a-zA-Z]{34}` | High |
| **Groq** | `gsk_` | `gsk_[a-zA-Z0-9]{48,}` | High |
| **Replicate** | `r8_` | `r8_[a-zA-Z0-9]{40}` | High |
| **Fireworks AI** | `fw_` | `fw_[a-zA-Z0-9_-]{40,}` | Medium |
| **Perplexity** | `pplx-` | `pplx-[a-zA-Z0-9]{48}` | High |
| **AWS (general)** | `AKIA` | `AKIA[0-9A-Z]{16}` | High |
| **GitHub PAT** | `ghp_` | `ghp_[a-zA-Z0-9]{36}` | High |
| **Stripe (secret)** | `sk_live_` | `sk_live_[0-9a-zA-Z]{24}` | High |
### 4.2 Providers with No Known Distinct Prefix
These providers use generic-looking API keys without distinguishing prefixes, making detection harder:
| Provider | Key Format | Detection Approach |
|----------|-----------|-------------------|
| **Mistral AI** | Generic alphanumeric | Keyword-based (`MISTRAL_API_KEY`) |
| **Cohere** | Generic alphanumeric | Keyword-based (`COHERE_API_KEY`, `CO_API_KEY`) |
| **Together AI** | Generic alphanumeric | Keyword-based |
| **DeepSeek** | `sk-` prefix (same as OpenAI legacy) | Keyword context needed |
| **Azure OpenAI** | 32-char hex | Keyword-based |
| **Stability AI** | `sk-` prefix | Keyword context needed |
| **AI21** | Generic alphanumeric | Keyword-based |
| **Cerebras** | Generic alphanumeric | Keyword-based |
| **SambaNova** | Generic alphanumeric | Keyword-based |
### 4.3 Detection Difficulty Tiers
**Easy (unique prefix):** OpenAI (sk-proj-, sk-svcacct-), Anthropic (sk-ant-), HuggingFace (hf_), Groq (gsk_), Replicate (r8_), Perplexity (pplx-), AWS (AKIA)
**Medium (shared or short prefix):** OpenAI legacy (sk-), DeepSeek (sk-), Stability (sk-), Fireworks (fw_), Google (AIza)
**Hard (no prefix, keyword-only):** Mistral, Cohere, Together AI, Azure OpenAI, AI21, Cerebras, most Chinese providers
---
## 5. Key Validation Approaches
### 5.1 Common Validation Endpoints
| Provider | Validation Method | Endpoint | Cost |
|----------|-------------------|----------|------|
| **OpenAI** | List models | `GET /v1/models` | Free (no tokens consumed) |
| **Anthropic** | Send minimal message | `POST /v1/messages` (tiny prompt) | Minimal cost (~1 token) |
| **Google Gemini** | List models | `GET /v1/models` | Free |
| **Cohere** | Token check | `POST /v1/tokenize` or `/v1/generate` | Minimal |
| **HuggingFace** | Whoami | `GET /api/whoami` | Free |
| **Groq** | List models | `GET /v1/models` | Free |
| **Replicate** | Get account | `GET /v1/account` | Free |
| **Mistral** | List models | `GET /v1/models` | Free |
| **AWS** | STS GetCallerIdentity | `POST sts.amazonaws.com` | Free |
| **Azure OpenAI** | List deployments | `GET /openai/deployments` | Free |
### 5.2 Validation Strategy Patterns
1. **Passive detection (regex only):** Fastest, highest false positive rate. Used by Gitleaks, detect-secrets baseline mode.
2. **Passive + entropy:** Combines regex with entropy scoring. Reduces false positives for generic patterns. Used by detect-secrets with entropy plugins.
3. **Active verification (API call):** Makes lightweight API call to confirm key is live. Used by TruffleHog, GitHub secret scanning. Eliminates false positives but requires network access.
4. **Deep analysis (permission enumeration):** Beyond verification, enumerates what the key can access. Used by TruffleHog for ~20 credential types. Most actionable but slowest.
### 5.3 How Existing Tools Validate
| Tool | Passive | Entropy | Active Verification | Permission Analysis |
|------|:-------:|:-------:|:-------------------:|:-------------------:|
| TruffleHog | Yes | No | Yes (800+ detectors) | Yes (~20 types) |
| Gitleaks | Yes | Optional | No | No |
| detect-secrets | Yes | Yes | Limited | No |
| Nosey Parker | Yes | ML-based | No | No |
| GitGuardian | Yes | Yes | Yes (selected) | Limited |
| GitHub Scanning | Yes | AI-based | Yes (selected) | No |
| SecurityWall | Yes | No | Generates curl cmds | No |
| KeyHacks | No | No | Manual curl cmds | Limited |
---
## 6. Market Gaps & Opportunities
### 6.1 Underserved Areas
1. **LLM-specific comprehensive scanner:** No tool covers all 50+ LLM API providers with both detection and validation.
2. **New key format coverage:** OpenAI's `sk-proj-` and `sk-svcacct-` formats are recent; many scanners only detect legacy `sk-` format. Gitleaks only added these in late 2025 via PR #1780.
3. **Chinese/regional provider detection:** Almost zero coverage for Qwen, Baichuan, Zhipu, Moonshot, Yi, ERNIE, Doubao API keys in any scanner.
4. **Key metadata extraction:** No tool extracts org, project, rate limits, or spend from detected LLM keys.
5. **Agentic AI context:** With AI agents increasingly using API keys, there's a growing need for scanners that understand multi-key configurations (e.g., an agent with OpenAI + Anthropic + Serp API keys).
6. **Vibe coding exposure:** VibeFactory's scanner addresses the problem of API keys exposed in frontend JavaScript by vibe-coded apps, but this is still nascent.
### 6.2 Scale of the Problem
- **28 million credentials leaked on GitHub in 2025** (Snyk)
- **1,275,105 leaked AI service secrets in 2025** (GitGuardian), up 81% YoY
- **8 of 10 fastest-growing leaked secret categories are AI-related** (GitGuardian)
- Fastest growing: Brave Search API (+1,255%), Firecrawl (+796%), Supabase (+992%)
- AI keys are found at **42.28 per million commits** for Groq alone (GitGuardian)
### 6.3 Competitive Landscape Summary
```
Verification Depth
|
TruffleHog | ████████████████ (800+ detectors, deep analysis)
GitGuardian | ████████████ (450+ detectors, commercial)
GitHub | ██████████ (AI-powered, platform-locked)
Gitleaks | ████ (150+ regex, no verification)
detect-sec | ███ (27 plugins, baseline approach)
NoseyParker | ██ (188 rules, ML denoising, retired)
|
+------ LLM Provider Coverage ------>
None of these tools provide >15 LLM provider detectors.
The market opportunity is a scanner focused on 50-100+ LLM providers
with active verification, permission analysis, and cost estimation.
```
---
## Sources
### Open-Source Scanner Tools
- [TruffleHog - GitHub](https://github.com/trufflesecurity/trufflehog)
- [TruffleHog Detectors](https://trufflesecurity.com/detectors)
- [Gitleaks - GitHub](https://github.com/gitleaks/gitleaks)
- [Gitleaks Config (gitleaks.toml)](https://github.com/gitleaks/gitleaks/blob/master/config/gitleaks.toml)
- [detect-secrets - GitHub](https://github.com/Yelp/detect-secrets)
- [Nosey Parker - GitHub](https://github.com/praetorian-inc/noseyparker)
- [KeyHacks - GitHub](https://github.com/streaak/keyhacks)
- [Secrets Patterns DB - GitHub](https://github.com/mazen160/secrets-patterns-db)
- [regextokens - GitHub](https://github.com/odomojuli/regextokens)
- [Betterleaks - Gitleaks Successor](https://www.aikido.dev/blog/betterleaks-gitleaks-successor)
### Comparison & Analysis
- [TruffleHog vs Gitleaks Comparison (Jit)](https://www.jit.io/resources/appsec-tools/trufflehog-vs-gitleaks-a-detailed-comparison-of-secret-scanning-tools)
- [Best Secret Scanning Tools 2025 (Aikido)](https://www.aikido.dev/blog/top-secret-scanning-tools)
- [8 Best Secret Scanning Tools 2026 (AppSec Santa)](https://appsecsanta.com/sast-tools/secret-scanning-tools)
- [Secret Scanning Tools 2026 (GitGuardian)](https://blog.gitguardian.com/secret-scanning-tools/)
### API Key Patterns & Validation
- [OpenAI API Key Format Discussion](https://community.openai.com/t/regex-s-to-validate-api-key-and-org-id-format/44619)
- [OpenAI sk-proj Key Format](https://community.openai.com/t/how-to-create-an-api-secret-key-with-prefix-sk-only-always-creates-sk-proj-keys/1263531)
- [Gitleaks OpenAI Regex PR #1780](https://github.com/gitleaks/gitleaks/pull/1780)
- [GitHub Leaked API Keys Patterns](https://gist.github.com/win3zz/0a1c70589fcbea64dba4588b93095855)
- [GitGuardian Groq API Key Detector](https://docs.gitguardian.com/secrets-detection/secrets-detection-engine/detectors/specifics/groq_api_key)
### LLM Key Validation Tools
- [TestMyAPIKey.com](https://www.testmyapikey.com/)
- [SecurityWall API Key Checker](https://securitywall.co/tools/api-key-checker)
- [VibeFactory API Key Scanner](https://vibefactory.ai/api-key-security-scanner)
- [KeyLeak Detector - GitHub](https://github.com/Amal-David/keyleak-detector)
### LLM Provider Lists
- [LiteLLM Providers (107)](https://docs.litellm.ai/docs/providers)
- [Langbase Supported Providers](https://langbase.com/docs/supported-models-and-providers)
- [LLM-Interface API Keys Doc](https://github.com/samestrin/llm-interface/blob/main/docs/api-keys.md)
- [Artificial Analysis Provider Leaderboard](https://artificialanalysis.ai/leaderboards/providers)
- [Top LLM API Providers 2026 (Future AGI)](https://futureagi.substack.com/p/top-11-llm-api-providers-in-2026)
### GitHub Secret Scanning
- [GitHub Supported Secret Scanning Patterns](https://docs.github.com/en/code-security/secret-scanning/introduction/supported-secret-scanning-patterns)
- [GitHub Adds 37 New Detectors (March 2026)](https://devops.com/github-adds-37-new-secret-detectors-in-march-extends-scanning-to-ai-coding-agents/)
- [GitHub Secret Scanning Coverage Update](https://github.blog/changelog/2026-03-31-github-secret-scanning-nine-new-types-and-more/)
### Market Data
- [State of Secrets Sprawl 2026 (GitGuardian/Hacker News)](https://thehackernews.com/2026/03/the-state-of-secrets-sprawl-2026-9.html)
- [Why 28M Credentials Leaked on GitHub in 2025 (Snyk)](https://snyk.io/articles/state-of-secrets/)
- [GitGuardian AI Security](https://www.gitguardian.com/agentic-ai-security)

View File

@@ -0,0 +1,556 @@
# KeyHunter - Design Specification
## Overview
KeyHunter is a comprehensive, modular API key scanner built in Go, focused on detecting and validating API keys from 100+ LLM/AI providers. It combines native scanning capabilities with external tool integration (TruffleHog, Gitleaks), OSINT/recon modules, a web dashboard, and Telegram bot notifications.
## Architecture
**Approach:** Plugin-based architecture. Core scanner engine with providers defined as YAML files (compile-time embedded). Single binary distribution.
### Directory Structure
```
keyhunter/
├── cmd/keyhunter/ # CLI entrypoint (cobra)
├── pkg/
│ ├── engine/ # Core scanning engine
│ │ ├── scanner.go # Orchestrator — ingests input, runs providers
│ │ ├── matcher.go # Regex + entropy matching
│ │ └── verifier.go # Active key verification (--verify flag)
│ ├── provider/ # Provider registry & loader
│ │ ├── registry.go # Loads and manages providers
│ │ ├── types.go # Provider interface definitions
│ │ └── builtin/ # Compile-time embedded provider YAMLs
│ ├── input/ # Input source adapters
│ │ ├── file.go # File/directory scanning
│ │ ├── git.go # Git history/diff scanning
│ │ ├── stdin.go # Pipe/stdin support
│ │ ├── url.go # URL fetch
│ │ └── remote.go # GitHub/GitLab API, paste sites
│ ├── output/ # Output formatters
│ │ ├── table.go # Colored terminal table
│ │ ├── json.go # JSON export
│ │ ├── sarif.go # SARIF (CI/CD compatible)
│ │ └── csv.go # CSV export
│ ├── adapter/ # External tool parsers
│ │ ├── trufflehog.go # TruffleHog JSON output parser
│ │ └── gitleaks.go # Gitleaks JSON output parser
│ ├── recon/ # OSINT/Recon engine (80+ sources)
│ │ ├── engine.go # Recon orchestrator
│ │ ├── ratelimit.go # Rate limiting & politeness
│ │ │
│ │ │ # --- IoT & Internet Search Engines ---
│ │ ├── shodan.go # Shodan API client
│ │ ├── censys.go # Censys API client
│ │ ├── zoomeye.go # ZoomEye (Chinese IoT scanner)
│ │ ├── fofa.go # FOFA (Chinese IoT scanner)
│ │ ├── netlas.go # Netlas.io (HTTP body search)
│ │ ├── binaryedge.go # BinaryEdge scanner
│ │ │
│ │ │ # --- Code Hosting & Snippets ---
│ │ ├── github.go # GitHub code search / dorks
│ │ ├── gitlab.go # GitLab search
│ │ ├── gist.go # GitHub Gist search
│ │ ├── bitbucket.go # Bitbucket code search
│ │ ├── codeberg.go # Codeberg/Gitea search
│ │ ├── gitea.go # Self-hosted Gitea instances
│ │ ├── replit.go # Replit public repls
│ │ ├── codesandbox.go # CodeSandbox projects
│ │ ├── stackblitz.go # StackBlitz projects
│ │ ├── codepen.go # CodePen pens
│ │ ├── jsfiddle.go # JSFiddle snippets
│ │ ├── glitch.go # Glitch public projects
│ │ ├── observable.go # Observable notebooks
│ │ ├── huggingface.go # HuggingFace Spaces/repos
│ │ ├── kaggle.go # Kaggle notebooks/datasets
│ │ ├── jupyter.go # nbviewer / Jupyter notebooks
│ │ ├── gitpod.go # Gitpod workspace snapshots
│ │ │
│ │ │ # --- Search Engine Dorking ---
│ │ ├── google.go # Google Custom Search / SerpAPI dorking
│ │ ├── bing.go # Bing Web Search API dorking
│ │ ├── duckduckgo.go # DuckDuckGo search
│ │ ├── yandex.go # Yandex XML Search
│ │ ├── brave.go # Brave Search API
│ │ │
│ │ │ # --- Paste Sites ---
│ │ ├── paste.go # Multi-paste aggregator (pastebin, dpaste, paste.ee, rentry, hastebin, ix.io, etc.)
│ │ │
│ │ │ # --- Package Registries ---
│ │ ├── npm.go # npm registry scanning
│ │ ├── pypi.go # PyPI package scanning
│ │ ├── rubygems.go # RubyGems scanning
│ │ ├── crates.go # crates.io (Rust)
│ │ ├── maven.go # Maven Central (Java)
│ │ ├── nuget.go # NuGet (.NET)
│ │ ├── packagist.go # Packagist (PHP)
│ │ ├── goproxy.go # Go module proxy
│ │ │
│ │ │ # --- Container & Infra ---
│ │ ├── docker.go # Docker Hub image/layer scanning
│ │ ├── kubernetes.go # Exposed K8s dashboards & configs
│ │ ├── terraform.go # Terraform state files & registry
│ │ ├── helm.go # Artifact Hub / Helm charts
│ │ ├── ansible.go # Ansible Galaxy collections
│ │ │
│ │ │ # --- Cloud Storage ---
│ │ ├── s3.go # AWS S3 bucket enumeration
│ │ ├── gcs.go # Google Cloud Storage buckets
│ │ ├── azureblob.go # Azure Blob Storage
│ │ ├── spaces.go # DigitalOcean Spaces
│ │ ├── backblaze.go # Backblaze B2
│ │ ├── minio.go # Self-hosted MinIO instances
│ │ ├── grayhat.go # GrayHatWarfare (bucket search engine)
│ │ │
│ │ │ # --- CI/CD Log Leaks ---
│ │ ├── travisci.go # Travis CI public build logs
│ │ ├── circleci.go # CircleCI build logs
│ │ ├── ghactions.go # GitHub Actions workflow logs
│ │ ├── jenkins.go # Exposed Jenkins instances
│ │ ├── gitlabci.go # GitLab CI/CD pipeline logs
│ │ │
│ │ │ # --- Web Archives ---
│ │ ├── wayback.go # Wayback Machine CDX API
│ │ ├── commoncrawl.go # CommonCrawl index & WARC
│ │ │
│ │ │ # --- Forums & Documentation ---
│ │ ├── stackoverflow.go # Stack Overflow / Stack Exchange API
│ │ ├── reddit.go # Reddit search
│ │ ├── hackernews.go # HN Algolia API
│ │ ├── devto.go # dev.to articles
│ │ ├── medium.go # Medium articles
│ │ ├── telegram_recon.go # Telegram public channels
│ │ ├── discord.go # Discord indexed content
│ │ │
│ │ │ # --- Collaboration Tools ---
│ │ ├── notion.go # Notion public pages
│ │ ├── confluence.go # Confluence public spaces
│ │ ├── trello.go # Trello public boards
│ │ ├── googledocs.go # Google Docs/Sheets public
│ │ │
│ │ │ # --- Frontend & JS Leaks ---
│ │ ├── sourcemaps.go # JS source map extraction
│ │ ├── webpack.go # Webpack/Vite bundle scanning
│ │ ├── dotenv_web.go # Exposed .env files on web servers
│ │ ├── swagger.go # Exposed Swagger/OpenAPI docs
│ │ ├── deploys.go # Vercel/Netlify preview deployments
│ │ │
│ │ │ # --- Log Aggregators ---
│ │ ├── elasticsearch.go # Exposed Elasticsearch/Kibana
│ │ ├── grafana.go # Exposed Grafana dashboards
│ │ ├── sentry.go # Exposed Sentry instances
│ │ │
│ │ │ # --- Threat Intelligence ---
│ │ ├── virustotal.go # VirusTotal file/URL search
│ │ ├── intelx.go # Intelligence X aggregated search
│ │ ├── urlhaus.go # URLhaus abuse.ch
│ │ │
│ │ │ # --- Mobile Apps ---
│ │ ├── apk.go # APK download & decompile scanning
│ │ │
│ │ │ # --- DNS/Subdomain ---
│ │ ├── crtsh.go # Certificate Transparency (crt.sh)
│ │ ├── subdomain.go # Subdomain config endpoint probing
│ │ │
│ │ │ # --- API Marketplaces ---
│ │ ├── postman.go # Postman public collections/workspaces
│ │ ├── swaggerhub.go # SwaggerHub published APIs
│ │ └── rapidapi.go # RapidAPI public endpoints
│ │
│ ├── dorks/ # Dork management
│ │ ├── loader.go # YAML dork loader
│ │ ├── runner.go # Dork execution engine
│ │ └── builtin/ # Embedded dork YAMLs
│ ├── notify/ # Notification modules
│ │ ├── telegram.go # Telegram bot
│ │ ├── webhook.go # Generic webhook
│ │ └── slack.go # Slack
│ └── web/ # Web dashboard
│ ├── server.go # Embedded HTTP server
│ ├── api.go # REST API
│ └── static/ # Frontend assets (htmx + tailwind)
├── providers/ # Provider YAML definitions (embedded at compile time)
│ ├── openai.yaml
│ ├── anthropic.yaml
│ └── ... (108 provider)
├── dorks/ # Dork YAML definitions (embedded at compile time)
│ ├── github.yaml # GitHub code search dorks
│ ├── gitlab.yaml # GitLab search dorks
│ ├── shodan.yaml # Shodan IoT dorks
│ ├── censys.yaml # Censys dorks
│ ├── zoomeye.yaml # ZoomEye dorks
│ ├── fofa.yaml # FOFA dorks
│ ├── google.yaml # Google dorking queries
│ ├── bing.yaml # Bing dorking queries
│ └── generic.yaml # Multi-source keyword dorks
├── configs/ # Example config files
└── docs/
```
### Data Flow
```
Input Source -> Scanner Engine -> Provider Matcher -> (optional) Verifier -> Output Formatter + Notifier
-> SQLite DB (persist)
-> Web Dashboard (serve)
```
## Provider YAML Schema
```yaml
id: string # Unique provider ID
name: string # Display name
category: enum # frontier | mid-tier | emerging | chinese | infrastructure | gateway | self-hosted
website: string # API base URL
confidence: enum # high | medium | low
patterns:
- id: string # Unique pattern ID
name: string # Human-readable name
regex: string # Detection regex
confidence: enum # high | medium | low
description: string # Pattern description
keywords: []string # Pre-filtering keywords (performance optimization)
verify:
enabled: bool
method: string # HTTP method
url: string # Verification endpoint
headers: map # Headers with {{key}} template
success_codes: []int
failure_codes: []int
extract: # Additional info extraction on success
- field: string
path: string # JSON path
metadata:
docs: string # API docs URL
key_url: string # Key management URL
env_vars: []string # Common environment variable names
revoke_url: string # Key revocation URL
```
## CLI Command Structure
### Core Commands
```bash
# Scanning
keyhunter scan path <dir>
keyhunter scan file <file>
keyhunter scan git <repo> [--since=<duration>]
keyhunter scan stdin
keyhunter scan url <url>
keyhunter scan clipboard
# Verification
keyhunter verify <key>
keyhunter verify --file <keyfile>
# External Tool Import
keyhunter import trufflehog <json>
keyhunter import gitleaks <json>
keyhunter import generic --format=csv <file>
# OSINT/Recon — IoT & Internet Scanners
keyhunter recon shodan [--query|--dork]
keyhunter recon censys [--query]
keyhunter recon zoomeye [--query]
keyhunter recon fofa [--query]
keyhunter recon netlas [--query]
keyhunter recon binaryedge [--query]
# OSINT/Recon — Code Hosting & Snippets
keyhunter recon github [--dork=auto|custom]
keyhunter recon gitlab [--dork=auto|custom]
keyhunter recon gist [--query]
keyhunter recon bitbucket [--query|--workspace]
keyhunter recon codeberg [--query]
keyhunter recon gitea [--instances-from=shodan|file]
keyhunter recon replit [--query]
keyhunter recon codesandbox [--query]
keyhunter recon stackblitz [--query]
keyhunter recon codepen [--query]
keyhunter recon jsfiddle [--query]
keyhunter recon glitch [--query]
keyhunter recon huggingface [--query|--spaces|--repos]
keyhunter recon kaggle [--query|--notebooks]
keyhunter recon jupyter [--query]
keyhunter recon observable [--query]
# OSINT/Recon — Search Engine Dorking
keyhunter recon google [--dork=auto|custom]
keyhunter recon bing [--dork=auto|custom]
keyhunter recon duckduckgo [--query]
keyhunter recon yandex [--query]
keyhunter recon brave [--query]
# OSINT/Recon — Paste Sites
keyhunter recon paste [--sources=pastebin,dpaste,paste.ee,rentry,hastebin,ix.io,all]
# OSINT/Recon — Package Registries
keyhunter recon npm [--query|--recent]
keyhunter recon pypi [--query|--recent]
keyhunter recon rubygems [--query]
keyhunter recon crates [--query]
keyhunter recon maven [--query]
keyhunter recon nuget [--query]
keyhunter recon packagist [--query]
keyhunter recon goproxy [--query]
# OSINT/Recon — Container & Infrastructure
keyhunter recon docker [--query|--image|--layers]
keyhunter recon kubernetes [--shodan|--github]
keyhunter recon terraform [--github|--registry]
keyhunter recon helm [--query]
keyhunter recon ansible [--query]
# OSINT/Recon — Cloud Storage
keyhunter recon s3 [--wordlist|--domain]
keyhunter recon gcs [--wordlist|--domain]
keyhunter recon azure [--wordlist|--domain]
keyhunter recon spaces [--wordlist]
keyhunter recon minio [--shodan]
keyhunter recon grayhat [--query] # GrayHatWarfare bucket search
# OSINT/Recon — CI/CD Logs
keyhunter recon travis [--org|--repo]
keyhunter recon circleci [--org|--repo]
keyhunter recon ghactions [--org|--repo]
keyhunter recon jenkins [--shodan|--url]
keyhunter recon gitlabci [--project]
# OSINT/Recon — Web Archives
keyhunter recon wayback [--domain|--url]
keyhunter recon commoncrawl [--domain|--pattern]
# OSINT/Recon — Forums & Documentation
keyhunter recon stackoverflow [--query]
keyhunter recon reddit [--query|--subreddit]
keyhunter recon hackernews [--query]
keyhunter recon devto [--query|--tag]
keyhunter recon medium [--query]
keyhunter recon telegram-groups [--channel|--query]
# OSINT/Recon — Collaboration Tools
keyhunter recon notion [--query] # Google dorking
keyhunter recon confluence [--shodan|--url]
keyhunter recon trello [--query]
keyhunter recon googledocs [--query] # Google dorking
# OSINT/Recon — Frontend & JS Leaks
keyhunter recon sourcemaps [--domain|--url]
keyhunter recon webpack [--domain|--url]
keyhunter recon dotenv [--domain-list|--url] # Exposed .env files
keyhunter recon swagger [--shodan|--domain]
keyhunter recon deploys [--domain] # Vercel/Netlify previews
# OSINT/Recon — Log Aggregators
keyhunter recon elasticsearch [--shodan|--url]
keyhunter recon grafana [--shodan|--url]
keyhunter recon sentry [--shodan|--url]
# OSINT/Recon — Threat Intelligence
keyhunter recon virustotal [--query]
keyhunter recon intelx [--query]
keyhunter recon urlhaus [--query]
# OSINT/Recon — Mobile Apps
keyhunter recon apk [--package|--query|--file]
# OSINT/Recon — DNS/Subdomain
keyhunter recon crtsh [--domain]
keyhunter recon subdomain [--domain] [--probe-configs]
# OSINT/Recon — API Marketplaces
keyhunter recon postman [--query|--workspace]
keyhunter recon swaggerhub [--query]
# OSINT/Recon — Full Sweep
keyhunter recon full [--providers] [--categories=all|code|cloud|forums|cicd|...]
# Dork Management
keyhunter dorks list [--source]
keyhunter dorks add <source> <query>
keyhunter dorks run <source> [--category]
keyhunter dorks export
# Key Management (full key access)
keyhunter keys list [--unmask] [--provider=X] [--status=active|revoked]
keyhunter keys show <id>
keyhunter keys export --format=json|csv
keyhunter keys copy <id>
keyhunter keys verify <id>
keyhunter keys delete <id>
# Provider Management
keyhunter providers list [--category]
keyhunter providers info <id>
keyhunter providers stats
# Web Dashboard & Telegram
keyhunter serve [--port] [--telegram]
# Scheduled Scanning
keyhunter schedule add --name --cron --command --notify
keyhunter schedule list
keyhunter schedule remove <name>
# Config & Hooks
keyhunter config init
keyhunter config set <key> <value>
keyhunter hook install
keyhunter hook uninstall
```
### Scan Flags
```
--providers=<list> Filter by provider IDs
--category=<cat> Filter by provider category
--confidence=<level> Minimum confidence level
--exclude=<patterns> Exclude file patterns
--verify Enable active key verification
--verify-timeout=<dur> Verification timeout (default: 10s)
--workers=<n> Parallel workers (default: CPU count)
--output=<format> Output format: table|json|sarif|csv
--unmask Show full API keys without masking (default: masked)
--notify=<channel> Send results to: telegram|webhook|slack
--stealth Stealth mode: UA rotation, increased delays
--respect-robots Respect robots.txt (default: true)
```
### Exit Codes
- `0` — Clean, no keys found
- `1` — Keys found
- `2` — Error
## Dork YAML Schema
```yaml
source: string # github | gitlab | shodan | censys
dorks:
- id: string
query: string # Search query
description: string
providers: []string # Optional: related provider IDs
```
Built-in dork categories: GitHub (code search, filename, language), GitLab (snippets, projects), Shodan (exposed proxies, dashboards), Censys (HTTP body search).
## Web Dashboard
**Stack:** Go embed + htmx + Tailwind CSS (zero JS framework dependency)
**Pages:**
- `/` — Dashboard overview with summary statistics
- `/scans` — Scan history list
- `/scans/:id` — Scan detail with found keys
- `/keys` — All found keys (filterable table)
- `/keys/:id` — Key detail (provider, confidence, verify status)
- `/recon` — OSINT scan launcher and results
- `/providers` — Provider list and statistics
- `/dorks` — Dork management
- `/settings` — Configuration (tokens, API keys)
- `/api/v1/*` — REST API for programmatic access
**Storage:** SQLite (embedded, AES-256 encrypted)
## Telegram Bot
**Commands:**
- `/scan <url/path>` — Remote scan trigger
- `/verify <key>` — Key verification
- `/recon github <dork>` — GitHub dork execution
- `/status` — Active scan status
- `/stats` — General statistics
- `/subscribe` — Auto-notification on new key findings
- `/unsubscribe` — Disable notifications
- `/providers` — Provider list
- `/help` — Help
**Auto-notifications:** New key found, recon complete, scheduled scan results, verify results.
## LLM Provider Coverage (108 Providers)
### Tier 1 — Frontier (12)
OpenAI, Anthropic, Google AI (Gemini), Google Vertex AI, AWS Bedrock, Azure OpenAI, Meta AI (Llama API), xAI (Grok), Cohere, Mistral AI, Inflection AI, AI21 Labs
### Tier 2 — Inference Platforms (14)
Together AI, Fireworks AI, Groq, Replicate, Anyscale, DeepInfra, Lepton AI, Modal, Baseten, Cerebrium, NovitaAI, Sambanova, OctoAI, Friendli AI
### Tier 3 — Specialized/Vertical (12)
Perplexity, You.com, Voyage AI, Jina AI, Unstructured, AssemblyAI, Deepgram, ElevenLabs, Stability AI, Runway ML, Midjourney, HuggingFace
### Tier 4 — Chinese/Regional (16)
DeepSeek, Baichuan, Zhipu AI (GLM), Moonshot AI (Kimi), Yi (01.AI), Qwen (Alibaba Cloud), Baidu (ERNIE/Wenxin), ByteDance (Doubao), SenseTime, iFlytek (Spark), MiniMax, Stepfun, 360 AI, Kuaishou (Kling), Tencent Hunyuan, SiliconFlow
### Tier 5 — Infrastructure/Gateway (11)
Cloudflare AI, Vercel AI, LiteLLM, Portkey, Helicone, OpenRouter, Martian, AI Gateway (Kong), BricksAI, Aether, Not Diamond
### Tier 6 — Emerging/Niche (15)
Reka AI, Aleph Alpha, Writer, Jasper AI, Typeface, Comet ML, Weights & Biases, LangSmith (LangChain), Pinecone, Weaviate, Qdrant, Chroma, Milvus, Neon AI, Lamini
### Tier 7 — Code & Dev Tools (10)
GitHub Copilot, Cursor, Tabnine, Codeium/Windsurf, Sourcegraph Cody, Amazon CodeWhisperer, Replit AI, Codestral (Mistral), IBM watsonx.ai, Oracle AI
### Tier 8 — Self-Hosted/Open Infra (10)
Ollama, vLLM, LocalAI, LM Studio, llama.cpp, GPT4All, text-generation-webui, TensorRT-LLM, Triton Inference Server, Jan AI
### Tier 9 — Enterprise/Legacy (8)
Salesforce Einstein, ServiceNow AI, SAP AI Core, Palantir AIP, Databricks (DBRX), Snowflake Cortex, Oracle Generative AI, HPE GreenLake AI
## Performance
- Worker pool: parallel scanning (default: CPU count, configurable via `--workers=N`)
- Keyword pre-filtering before regex (10x speedup on large files)
- `mmap` for large file reading
- Delta-based git scanning (only changed files between commits)
- Source-based rate limiting in recon module
## Key Visibility & Access
Full (unmasked) API keys are accessible through multiple channels:
1. **CLI `--unmask` flag** — `keyhunter scan path . --unmask` shows full keys in terminal table
2. **JSON/CSV/SARIF export** — Always contains full keys: `keyhunter scan path . -o json`
3. **`keyhunter keys` command** — Dedicated key management:
- `keyhunter keys list` — all found keys (masked by default)
- `keyhunter keys list --unmask` — all found keys (full)
- `keyhunter keys show <id>` — single key full detail (always unmasked)
- `keyhunter keys export --format=json` — export all keys with full values
- `keyhunter keys copy <id>` — copy full key to clipboard
- `keyhunter keys verify <id>` — verify and show full detail
4. **Web Dashboard** — `/keys/:id` detail page with "Reveal Key" toggle button (auth required)
5. **Telegram Bot** — `/key <id>` returns full key detail in private chat
6. **SQLite DB** — Full keys always stored (encrypted), queryable via API
Default behavior: masked in terminal for shoulder-surfing protection.
When you need the real key (to test, verify, or report): `--unmask`, JSON export, or `keys show`.
## Security
- Key masking in terminal output by default (first 8 + last 4 chars, middle `***`)
- `--unmask` flag to reveal full keys when needed
- SQLite database AES-256 encrypted (full keys stored encrypted)
- Telegram/Shodan tokens encrypted in config
- No key values written to logs during `--verify`
- Optional basic auth / token auth for web dashboard
## Rate Limiting & Ethics
- GitHub API: 30 req/min (auth), 10 req/min (unauth)
- Shodan/Censys: respect API plan limits
- Paste sites: 1 req/2sec politeness delay
- `--stealth` flag: UA rotation, increased spacing
- `--respect-robots`: robots.txt compliance (default: on)
## Error Handling
- Verify timeout: 10s default, configurable
- Network errors: 3 retries with exponential backoff
- Partial results: failed sources don't block others
- Graceful degradation on all external dependencies

View File

@@ -0,0 +1,107 @@
package sources
import (
"context"
"io"
"net/http"
"regexp"
"time"
"golang.org/x/time/rate"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
// DeployPreviewSource scans Vercel and Netlify deploy preview URLs for leaked
// API keys. Deploy previews frequently use different (less restrictive)
// environment variables than production, and their URLs are often guessable
// from PR numbers or commit hashes.
type DeployPreviewSource struct {
	// BaseURL is the deploy-preview origin to probe; Sweep is a no-op when empty.
	BaseURL string
	// Registry supplies the provider definitions from which probe queries are built.
	Registry *providers.Registry
	// Limiters, when non-nil, throttles probes using this source's rate/burst settings.
	Limiters *recon.LimiterRegistry
	// Client is the HTTP client used for probes; a default client is created when nil.
	Client *Client
}
// Compile-time check that DeployPreviewSource satisfies recon.ReconSource.
var _ recon.ReconSource = (*DeployPreviewSource)(nil)

// Name returns the identifier used for rate limiting and finding labels.
func (s *DeployPreviewSource) Name() string { return "deploypreview" }

// RateLimit allows one probe every 3 seconds.
func (s *DeployPreviewSource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) }

// Burst permits short bursts of up to 2 requests.
func (s *DeployPreviewSource) Burst() int { return 2 }

// RespectsRobots reports that robots.txt should be honored for this source.
func (s *DeployPreviewSource) RespectsRobots() bool { return true }

// Enabled reports that this source is always available, regardless of config.
func (s *DeployPreviewSource) Enabled(_ recon.Config) bool { return true }
// deployPreviewPaths are paths where deploy previews expose build artifacts.
var deployPreviewPaths = []string{
	"/",
	"/_next/data/",                   // Next.js data route
	"/static/js/main.js",             // conventional main JS bundle location
	"/__nextjs_original-stack-frame", // Next.js dev-mode stack-frame endpoint
}
// nextDataPattern matches __NEXT_DATA__ script blocks and inline env vars.
// Case-insensitive: a framework prefix (__NEXT_DATA__, NEXT_PUBLIC_, REACT_APP_,
// VITE_), an optional API_KEY/SECRET/TOKEN-style suffix, then a quoted value of
// 8+ characters from [a-zA-Z0-9_-] following '=', ':' or ','.
var nextDataPattern = regexp.MustCompile(`(?i)(__NEXT_DATA__|NEXT_PUBLIC_|REACT_APP_|VITE_)[A-Z_]*(API[_]?KEY|SECRET|TOKEN)?['":\s]*[=:,]\s*['"]([a-zA-Z0-9_\-]{8,})['"]`)
// Sweep probes deploy-preview artifact paths under BaseURL and emits a Finding
// for every query whose response body matches nextDataPattern (leaked
// __NEXT_DATA__ blobs or framework env vars). The query argument from the
// ReconSource interface is unused: targets come from BaseURL plus
// deployPreviewPaths, and queries are derived from the provider registry.
//
// Returns a non-nil error only on context cancellation or rate-limiter
// failure; individual HTTP failures are skipped so one bad path does not
// abort the sweep.
func (s *DeployPreviewSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
	base := s.BaseURL
	if base == "" {
		// Nothing to probe without a target origin.
		return nil
	}
	client := s.Client
	if client == nil {
		client = NewClient()
	}
	queries := BuildQueries(s.Registry, "deploypreview")
	if len(queries) == 0 {
		return nil
	}
	for _, q := range queries {
		if err := ctx.Err(); err != nil {
			return err
		}
		for _, path := range deployPreviewPaths {
			if err := ctx.Err(); err != nil {
				return err
			}
			if s.Limiters != nil {
				if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
					return err
				}
			}
			probeURL := base + path
			req, err := http.NewRequestWithContext(ctx, http.MethodGet, probeURL, nil)
			if err != nil {
				continue
			}
			resp, err := client.Do(ctx, req)
			if err != nil {
				continue
			}
			// Only trust 2xx responses: 404/403 error pages can contain
			// boilerplate that false-positives the regex.
			if resp.StatusCode < 200 || resp.StatusCode >= 300 {
				_ = resp.Body.Close()
				continue
			}
			// Cap the read at 512 KiB; bundles can be huge and the leak
			// markers appear early in the document.
			body, err := io.ReadAll(io.LimitReader(resp.Body, 512*1024))
			_ = resp.Body.Close()
			if err != nil {
				continue
			}
			if !nextDataPattern.Match(body) {
				continue
			}
			finding := recon.Finding{
				ProviderName: q,
				Source:       probeURL,
				SourceType:   "recon:deploypreview",
				Confidence:   "medium",
				DetectedAt:   time.Now(),
			}
			// Honor cancellation even if the consumer stopped draining out,
			// so Sweep cannot block forever on the send.
			select {
			case out <- finding:
			case <-ctx.Done():
				return ctx.Err()
			}
			break // one finding per query is sufficient
		}
	}
	return nil
}

View File

@@ -0,0 +1,158 @@
package sources
import (
"context"
"net/http"
"net/http/httptest"
"testing"
"time"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
// deployPreviewTestRegistry builds a minimal single-provider registry whose
// "sk-proj-" keyword drives query generation for the deploy-preview source.
func deployPreviewTestRegistry() *providers.Registry {
	return providers.NewRegistryFromProviders([]providers.Provider{
		{Name: "openai", Keywords: []string{"sk-proj-"}},
	})
}
// deployPreviewFixtureHTML simulates a Next.js deploy preview whose
// __NEXT_DATA__ JSON blob leaks a NEXT_PUBLIC_API_KEY value; it must match
// nextDataPattern and produce a finding.
const deployPreviewFixtureHTML = `<!DOCTYPE html>
<html>
<head><title>My App</title></head>
<body>
<div id="__next"></div>
<script id="__NEXT_DATA__" type="application/json">
{
"props": {
"pageProps": {
"config": {
"NEXT_PUBLIC_API_KEY": "sk-proj-abc123def456ghi789jkl"
}
}
}
}
</script>
</body>
</html>`
// deployPreviewCleanHTML is a page with no env-var markers; it must NOT match
// nextDataPattern and must produce zero findings.
const deployPreviewCleanHTML = `<!DOCTYPE html>
<html>
<head><title>My App</title></head>
<body>
<div id="root">Hello World</div>
</body>
</html>`
// TestDeployPreview_Sweep_ExtractsFindings verifies that a page leaking a
// NEXT_PUBLIC_API_KEY through __NEXT_DATA__ yields at least one finding with
// the expected source type and confidence level.
func TestDeployPreview_Sweep_ExtractsFindings(t *testing.T) {
	handler := func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "text/html")
		_, _ = w.Write([]byte(deployPreviewFixtureHTML))
	}
	srv := httptest.NewServer(http.HandlerFunc(handler))
	defer srv.Close()

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	source := &DeployPreviewSource{
		BaseURL:  srv.URL,
		Registry: deployPreviewTestRegistry(),
		Client:   NewClient(),
	}
	results := make(chan recon.Finding, 64)
	if err := source.Sweep(ctx, "", results); err != nil {
		t.Fatalf("Sweep err: %v", err)
	}
	close(results)

	var collected []recon.Finding
	for finding := range results {
		collected = append(collected, finding)
	}
	if len(collected) == 0 {
		t.Fatal("expected at least one finding")
	}
	for _, finding := range collected {
		if got := finding.SourceType; got != "recon:deploypreview" {
			t.Errorf("unexpected SourceType: %s", got)
		}
		if got := finding.Confidence; got != "medium" {
			t.Errorf("unexpected Confidence: %s", got)
		}
	}
}
// TestDeployPreview_Sweep_NoFindings_OnCleanPage verifies that a page without
// env-var markers produces zero findings.
func TestDeployPreview_Sweep_NoFindings_OnCleanPage(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "text/html")
		_, _ = w.Write([]byte(deployPreviewCleanHTML))
	}))
	defer srv.Close()

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	source := &DeployPreviewSource{
		BaseURL:  srv.URL,
		Registry: deployPreviewTestRegistry(),
		Client:   NewClient(),
	}
	results := make(chan recon.Finding, 64)
	if err := source.Sweep(ctx, "", results); err != nil {
		t.Fatalf("Sweep err: %v", err)
	}
	close(results)

	count := 0
	for range results {
		count++
	}
	if count != 0 {
		t.Errorf("expected 0 findings, got %d", count)
	}
}
// TestDeployPreview_Sweep_CtxCancelled verifies that Sweep returns an error
// immediately when its context is already cancelled.
func TestDeployPreview_Sweep_CtxCancelled(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		time.Sleep(500 * time.Millisecond)
		_, _ = w.Write([]byte(deployPreviewFixtureHTML))
	}))
	defer srv.Close()

	ctx, cancel := context.WithCancel(context.Background())
	cancel() // cancelled before the sweep even starts

	source := &DeployPreviewSource{
		BaseURL:  srv.URL,
		Registry: deployPreviewTestRegistry(),
		Limiters: recon.NewLimiterRegistry(),
		Client:   NewClient(),
	}
	findings := make(chan recon.Finding, 4)
	if err := source.Sweep(ctx, "", findings); err == nil {
		t.Fatal("expected ctx error")
	}
}
// TestDeployPreview_EnabledAlwaysTrue verifies Enabled returns true for the
// zero-value config.
func TestDeployPreview_EnabledAlwaysTrue(t *testing.T) {
	var src DeployPreviewSource
	if enabled := src.Enabled(recon.Config{}); !enabled {
		t.Fatal("expected Enabled=true")
	}
}
// TestDeployPreview_NameAndRate verifies the source's identifier, burst
// setting, and robots.txt policy.
func TestDeployPreview_NameAndRate(t *testing.T) {
	var src DeployPreviewSource
	if got := src.Name(); got != "deploypreview" {
		t.Errorf("unexpected name: %s", got)
	}
	if got := src.Burst(); got != 2 {
		t.Errorf("burst: %d", got)
	}
	if robots := src.RespectsRobots(); !robots {
		t.Error("expected RespectsRobots=true")
	}
}

View File

@@ -0,0 +1,111 @@
package sources
import (
"context"
"fmt"
"io"
"net/http"
"regexp"
"time"
"golang.org/x/time/rate"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
// EnvLeakSource probes for publicly accessible .env files on web servers.
// Many web frameworks (Laravel, Rails, Node/Express, Django) use .env files
// for configuration. Misconfigured servers frequently serve these files
// directly, exposing API keys and database credentials.
type EnvLeakSource struct {
	// BaseURL is the origin to probe; Sweep is a no-op when empty.
	BaseURL string
	// Registry supplies the provider definitions from which probe queries are built.
	Registry *providers.Registry
	// Limiters, when non-nil, throttles probes using this source's rate/burst settings.
	Limiters *recon.LimiterRegistry
	// Client is the HTTP client used for probes; a default client is created when nil.
	Client *Client
}
// Compile-time check that EnvLeakSource satisfies recon.ReconSource.
var _ recon.ReconSource = (*EnvLeakSource)(nil)

// Name returns the identifier used for rate limiting and finding labels.
func (s *EnvLeakSource) Name() string { return "dotenv" }

// RateLimit allows one probe every 2 seconds.
func (s *EnvLeakSource) RateLimit() rate.Limit { return rate.Every(2 * time.Second) }

// Burst permits short bursts of up to 2 requests.
func (s *EnvLeakSource) Burst() int { return 2 }

// RespectsRobots reports that robots.txt should be honored for this source.
func (s *EnvLeakSource) RespectsRobots() bool { return true }

// Enabled reports that this source is always available, regardless of config.
func (s *EnvLeakSource) Enabled(_ recon.Config) bool { return true }
// envKeyValuePattern matches KEY=VALUE lines typical of .env files.
// Case-insensitive and multi-line: each matching line starts with an
// upper-case/underscore name containing API_KEY/APIKEY, SECRET, TOKEN,
// PASSWORD, or CREDENTIAL(S), followed by '=' and a non-empty value.
var envKeyValuePattern = regexp.MustCompile(`(?im)^[A-Z_]*(API[_]?KEY|SECRET|TOKEN|PASSWORD|CREDENTIALS?)[A-Z_]*\s*=\s*\S+`)
// envFilePaths are common locations for exposed .env files.
var envFilePaths = []string{
	"/.env",
	"/.env.local",
	"/.env.production",
	"/.env.development",
	"/.env.backup",
	"/.env.example", // NOTE(review): example files usually hold placeholders — may be a noisy path
	"/app/.env",
	"/api/.env",
}
// Sweep probes the configured BaseURL for commonly exposed .env paths and
// emits a Finding for every response whose body looks like a credential
// bearing KEY=VALUE environment file.
//
// One probe is issued per (query, path) pair, gated by the shared limiter
// registry when one is configured. Request-construction and transport
// errors are treated as "nothing exposed here" and skipped; only context
// cancellation aborts the sweep with an error.
func (s *EnvLeakSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
	base := s.BaseURL
	if base == "" {
		// Nothing to probe without a base URL.
		return nil
	}
	client := s.Client
	if client == nil {
		client = NewClient()
	}
	queries := BuildQueries(s.Registry, "dotenv")
	if len(queries) == 0 {
		return nil
	}
	for _, q := range queries {
		if err := ctx.Err(); err != nil {
			return err
		}
		for _, path := range envFilePaths {
			if err := ctx.Err(); err != nil {
				return err
			}
			if s.Limiters != nil {
				if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
					return err
				}
			}
			probeURL := fmt.Sprintf("%s%s", base, path)
			req, err := http.NewRequestWithContext(ctx, http.MethodGet, probeURL, nil)
			if err != nil {
				continue
			}
			resp, err := client.Do(ctx, req)
			if err != nil {
				continue // transport errors are expected while probing
			}
			// Cap the read at 64KB so a huge (or hostile) response cannot
			// exhaust memory, and close the body promptly so the transport
			// can reuse the connection.
			body, readErr := io.ReadAll(io.LimitReader(resp.Body, 64*1024)) // 64KB max
			status := resp.StatusCode
			_ = resp.Body.Close()
			if readErr != nil {
				continue
			}
			// Only 2xx responses can be genuine .env exposures; scanning
			// 404/403 error pages would risk false positives.
			if status < 200 || status >= 300 {
				continue
			}
			if !envKeyValuePattern.Match(body) {
				continue
			}
			finding := recon.Finding{
				ProviderName: q,
				Source:       probeURL,
				SourceType:   "recon:dotenv",
				Confidence:   "high",
				DetectedAt:   time.Now(),
			}
			// Cancellation-aware send: a bare send could block this
			// goroutine forever if the consumer stops draining out.
			select {
			case out <- finding:
			case <-ctx.Done():
				return ctx.Err()
			}
		}
	}
	return nil
}

View File

@@ -0,0 +1,145 @@
package sources
import (
"context"
"net/http"
"net/http/httptest"
"testing"
"time"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
// envLeakTestRegistry returns a minimal registry with a single provider
// entry for use with BuildQueries in these tests.
func envLeakTestRegistry() *providers.Registry {
	return providers.NewRegistryFromProviders([]providers.Provider{
		{Name: "openai", Keywords: []string{"sk-proj-"}},
	})
}

// envLeakFixture mimics a leaked .env file: benign settings interleaved
// with credential-bearing keys that envKeyValuePattern should flag.
const envLeakFixture = `# Application config
APP_NAME=myapp
DATABASE_URL=postgres://user:pass@localhost/db
OPENAI_API_KEY=sk-proj-abc123def456ghi789
AWS_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
DEBUG=false
`

// envLeakCleanFixture is a harmless .env file that must yield no findings.
const envLeakCleanFixture = `# Nothing sensitive here
APP_NAME=myapp
DEBUG=false
LOG_LEVEL=info
`
// TestEnvLeak_Sweep_ExtractsFindings verifies that a server exposing a
// credential-bearing .env file yields findings with the dotenv source type
// and high confidence.
func TestEnvLeak_Sweep_ExtractsFindings(t *testing.T) {
	handler := func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "text/plain")
		_, _ = w.Write([]byte(envLeakFixture))
	}
	srv := httptest.NewServer(http.HandlerFunc(handler))
	defer srv.Close()

	source := &EnvLeakSource{
		BaseURL:  srv.URL,
		Registry: envLeakTestRegistry(),
		Client:   NewClient(),
	}

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	results := make(chan recon.Finding, 64)
	if err := source.Sweep(ctx, "", results); err != nil {
		t.Fatalf("Sweep err: %v", err)
	}
	close(results)

	total := 0
	for f := range results {
		total++
		if got := f.SourceType; got != "recon:dotenv" {
			t.Errorf("unexpected SourceType: %s", got)
		}
		if got := f.Confidence; got != "high" {
			t.Errorf("unexpected Confidence: %s", got)
		}
	}
	if total == 0 {
		t.Fatal("expected at least one finding")
	}
}
// TestEnvLeak_Sweep_NoFindings_OnCleanFile verifies that a .env file with
// no credential-looking keys produces zero findings.
func TestEnvLeak_Sweep_NoFindings_OnCleanFile(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "text/plain")
		_, _ = w.Write([]byte(envLeakCleanFixture))
	}))
	defer srv.Close()

	source := &EnvLeakSource{
		BaseURL:  srv.URL,
		Registry: envLeakTestRegistry(),
		Client:   NewClient(),
	}

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	results := make(chan recon.Finding, 64)
	if err := source.Sweep(ctx, "", results); err != nil {
		t.Fatalf("Sweep err: %v", err)
	}
	close(results)

	got := 0
	for range results {
		got++
	}
	if got != 0 {
		t.Errorf("expected 0 findings, got %d", got)
	}
}
// TestEnvLeak_Sweep_CtxCancelled verifies that Sweep surfaces an error when
// invoked with an already-cancelled context.
func TestEnvLeak_Sweep_CtxCancelled(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		time.Sleep(500 * time.Millisecond)
		_, _ = w.Write([]byte(envLeakFixture))
	}))
	defer srv.Close()

	source := &EnvLeakSource{
		BaseURL:  srv.URL,
		Registry: envLeakTestRegistry(),
		Limiters: recon.NewLimiterRegistry(),
		Client:   NewClient(),
	}

	ctx, cancel := context.WithCancel(context.Background())
	cancel() // cancel up front: Sweep must bail before doing work

	results := make(chan recon.Finding, 4)
	if err := source.Sweep(ctx, "", results); err == nil {
		t.Fatal("expected ctx error")
	}
}
// TestEnvLeak_EnabledAlwaysTrue verifies the source requires no credentials.
func TestEnvLeak_EnabledAlwaysTrue(t *testing.T) {
	var src EnvLeakSource
	if enabled := src.Enabled(recon.Config{}); !enabled {
		t.Fatal("expected Enabled=true")
	}
}
// TestEnvLeak_NameAndRate pins the source's stable identifier and
// rate-limit metadata.
func TestEnvLeak_NameAndRate(t *testing.T) {
	src := new(EnvLeakSource)
	if got := src.Name(); got != "dotenv" {
		t.Errorf("unexpected name: %s", got)
	}
	if got := src.Burst(); got != 2 {
		t.Errorf("burst: %d", got)
	}
	if !src.RespectsRobots() {
		t.Error("expected RespectsRobots=true")
	}
}

View File

@@ -550,16 +550,9 @@ func TestIntegration_AllSources_SweepAll(t *testing.T) {
// helm
eng.Register(&HelmSource{BaseURL: srv.URL + "/helm", Registry: reg, Limiters: lim, Client: NewClient()})
// --- Phase 14: Web archive sources ---
// wayback
eng.Register(&WaybackMachineSource{BaseURL: srv.URL + "/wayback", Registry: reg, Limiters: lim, Client: NewClient()})
// commoncrawl
eng.Register(&CommonCrawlSource{BaseURL: srv.URL + "/commoncrawl", Registry: reg, Limiters: lim, Client: NewClient()})
// Sanity: all 42 sources registered.
if n := len(eng.List()); n != 42 {
t.Fatalf("expected 42 sources on engine, got %d: %v", n, eng.List())
// Sanity: all 40 sources registered.
if n := len(eng.List()); n != 40 {
t.Fatalf("expected 40 sources on engine, got %d: %v", n, eng.List())
}
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
@@ -648,8 +641,8 @@ func TestRegisterAll_Phase12(t *testing.T) {
})
names := eng.List()
if n := len(names); n != 42 {
t.Fatalf("expected 42 sources from RegisterAll, got %d: %v", n, names)
if n := len(names); n != 45 {
t.Fatalf("expected 45 sources from RegisterAll, got %d: %v", n, names)
}
// Build lookup for source access.

View File

@@ -57,8 +57,8 @@ type SourcesConfig struct {
// RegisterAll registers every Phase 10 code-hosting, Phase 11 search engine /
// paste site, Phase 12 IoT scanner / cloud storage, Phase 13 package
// registry / container / IaC, and Phase 14 web archive source on engine
// (42 sources total).
// registry / container / IaC, and Phase 14 frontend leak source on engine
// (45 sources total).
//
// All sources are registered unconditionally so that cmd/recon.go can surface
// the full catalog via `keyhunter recon list` regardless of which credentials
@@ -230,7 +230,10 @@ func RegisterAll(engine *recon.Engine, cfg SourcesConfig) {
engine.Register(&TerraformSource{Registry: reg, Limiters: lim})
engine.Register(&HelmSource{Registry: reg, Limiters: lim})
// Phase 14: Web archive sources (credentialless).
engine.Register(&WaybackMachineSource{Registry: reg, Limiters: lim})
engine.Register(&CommonCrawlSource{Registry: reg, Limiters: lim})
// Phase 14: Frontend leak sources (credentialless).
engine.Register(&SourceMapSource{Registry: reg, Limiters: lim})
engine.Register(&WebpackSource{Registry: reg, Limiters: lim})
engine.Register(&EnvLeakSource{Registry: reg, Limiters: lim})
engine.Register(&SwaggerSource{Registry: reg, Limiters: lim})
engine.Register(&DeployPreviewSource{Registry: reg, Limiters: lim})
}

View File

@@ -16,9 +16,9 @@ func registerTestRegistry() *providers.Registry {
})
}
// TestRegisterAll_WiresAllFortyTwoSources asserts that RegisterAll registers
// every Phase 10 + Phase 11 + Phase 12 + Phase 13 + Phase 14 source by its stable name on a fresh engine.
func TestRegisterAll_WiresAllFortyTwoSources(t *testing.T) {
// TestRegisterAll_WiresAllFortyFiveSources asserts that RegisterAll registers
// every Phase 10-14 source by its stable name on a fresh engine.
func TestRegisterAll_WiresAllFortyFiveSources(t *testing.T) {
eng := recon.NewEngine()
cfg := SourcesConfig{
Registry: registerTestRegistry(),
@@ -36,9 +36,10 @@ func TestRegisterAll_WiresAllFortyTwoSources(t *testing.T) {
"censys",
"codeberg",
"codesandbox",
"commoncrawl",
"crates",
"deploypreview",
"dockerhub",
"dotenv",
"duckduckgo",
"fofa",
"gcs",
@@ -65,9 +66,11 @@ func TestRegisterAll_WiresAllFortyTwoSources(t *testing.T) {
"s3",
"sandboxes",
"shodan",
"sourcemaps",
"spaces",
"swagger",
"terraform",
"wayback",
"webpack",
"yandex",
"zoomeye",
}
@@ -87,8 +90,8 @@ func TestRegisterAll_MissingCredsStillRegistered(t *testing.T) {
Limiters: recon.NewLimiterRegistry(),
})
if n := len(eng.List()); n != 42 {
t.Fatalf("expected 42 sources registered, got %d: %v", n, eng.List())
if n := len(eng.List()); n != 45 {
t.Fatalf("expected 45 sources registered, got %d: %v", n, eng.List())
}
// SweepAll with an empty config should filter out cred-gated sources

View File

@@ -0,0 +1,123 @@
package sources
import (
"context"
"encoding/json"
"net/http"
"regexp"
"time"
"golang.org/x/time/rate"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
// SourceMapSource probes for publicly accessible JavaScript source maps (.map
// files) that contain original source code. Developers frequently ship source
// maps to production, exposing server-side secrets embedded during bundling.
type SourceMapSource struct {
	BaseURL  string                 // root URL to probe; Sweep skips probing when empty
	Registry *providers.Registry    // provider catalog handed to BuildQueries
	Limiters *recon.LimiterRegistry // optional shared rate limiters; may be nil
	Client   *Client                // HTTP client; Sweep falls back to NewClient() when nil
}

// Compile-time check that SourceMapSource implements recon.ReconSource.
var _ recon.ReconSource = (*SourceMapSource)(nil)

// Name returns the stable source identifier ("sourcemaps").
func (s *SourceMapSource) Name() string { return "sourcemaps" }

// RateLimit allows one probe every three seconds.
func (s *SourceMapSource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) }

// Burst permits up to two probes in quick succession.
func (s *SourceMapSource) Burst() int { return 2 }

// RespectsRobots reports whether the source should honor robots.txt; always
// true for this source.
func (s *SourceMapSource) RespectsRobots() bool { return true }

// Enabled always reports true: this source needs no credentials.
func (s *SourceMapSource) Enabled(_ recon.Config) bool { return true }

// sourceMapResponse represents the top-level JSON of a .map file. Only the
// fields Sweep inspects are modelled.
type sourceMapResponse struct {
	Sources        []string `json:"sources"`        // original file paths
	SourcesContent []string `json:"sourcesContent"` // embedded original source text, scanned for keys
}
// apiKeyPattern matches common API key patterns in source content: a
// credential-ish label (api key / secret / token / password / credential /
// auth) followed by a quoted value of 16+ URL-safe characters.
var apiKeyPattern = regexp.MustCompile(`(?i)(api[_-]?key|secret|token|password|credential|auth)['":\s]*[=:]\s*['"]([a-zA-Z0-9_\-]{16,})['"]`)

// sourceMapPaths are common locations where source maps are served by
// typical bundler output layouts (CRA-style /static/js, Vite-style /assets,
// Next.js /_next, plain /dist builds).
var sourceMapPaths = []string{
	"/static/js/main.js.map",
	"/static/js/bundle.js.map",
	"/assets/index.js.map",
	"/dist/bundle.js.map",
	"/main.js.map",
	"/app.js.map",
	"/_next/static/chunks/main.js.map",
}
// Sweep probes the configured BaseURL for common source-map locations and
// emits a Finding when a map's embedded sourcesContent matches an
// API-key-looking pattern. At most one finding is emitted per map file.
//
// Transport and parse errors are treated as "not exposed" and skipped;
// only context cancellation aborts the sweep with an error.
func (s *SourceMapSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
	base := s.BaseURL
	if base == "" {
		// Without a BaseURL we cannot construct real URLs. Bail out before
		// the loops, matching the sibling sources, instead of paying a
		// rate-limiter wait for every (query, path) pair just to skip it.
		return nil
	}
	client := s.Client
	if client == nil {
		client = NewClient()
	}
	queries := BuildQueries(s.Registry, "sourcemaps")
	if len(queries) == 0 {
		return nil
	}
	for _, q := range queries {
		if err := ctx.Err(); err != nil {
			return err
		}
		// Each query is used as a domain/URL hint; probe common map paths.
		for _, path := range sourceMapPaths {
			if err := ctx.Err(); err != nil {
				return err
			}
			if s.Limiters != nil {
				if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
					return err
				}
			}
			probeURL := base + path
			req, err := http.NewRequestWithContext(ctx, http.MethodGet, probeURL, nil)
			if err != nil {
				continue
			}
			req.Header.Set("Accept", "application/json")
			resp, err := client.Do(ctx, req)
			if err != nil {
				continue // 404s and other errors are expected during probing
			}
			var mapData sourceMapResponse
			decodeErr := json.NewDecoder(resp.Body).Decode(&mapData)
			status := resp.StatusCode
			_ = resp.Body.Close()
			if decodeErr != nil {
				continue
			}
			// Only 2xx responses can be genuine source maps; error pages
			// would otherwise risk false positives.
			if status < 200 || status >= 300 {
				continue
			}
			// Scan the embedded original sources for API key patterns.
			for _, content := range mapData.SourcesContent {
				if !apiKeyPattern.MatchString(content) {
					continue
				}
				finding := recon.Finding{
					ProviderName: q,
					Source:       probeURL,
					SourceType:   "recon:sourcemaps",
					Confidence:   "medium",
					DetectedAt:   time.Now(),
				}
				// Cancellation-aware send: a bare send could block this
				// goroutine forever if the consumer stops draining out.
				select {
				case out <- finding:
				case <-ctx.Done():
					return ctx.Err()
				}
				break // one finding per map file is sufficient
			}
		}
	}
	return nil
}

View File

@@ -0,0 +1,143 @@
package sources
import (
"context"
"net/http"
"net/http/httptest"
"testing"
"time"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
// sourceMapTestRegistry returns a minimal registry with a single provider
// entry for use with BuildQueries in these tests.
func sourceMapTestRegistry() *providers.Registry {
	return providers.NewRegistryFromProviders([]providers.Provider{
		{Name: "openai", Keywords: []string{"sk-proj-"}},
	})
}

// sourceMapFixtureJSON is a .map file whose sourcesContent embeds an API
// key assignment that apiKeyPattern should flag.
const sourceMapFixtureJSON = `{
"version": 3,
"sources": ["src/api/client.ts"],
"sourcesContent": ["const apiKey = \"sk-proj-abc123def456ghi789\";\nfetch('/api', {headers: {'Authorization': apiKey}});"]
}`

// sourceMapEmptyFixtureJSON is a .map file with harmless content that must
// yield no findings.
const sourceMapEmptyFixtureJSON = `{
"version": 3,
"sources": ["src/index.ts"],
"sourcesContent": ["console.log('hello world');"]
}`
// TestSourceMap_Sweep_ExtractsFindings verifies that a served source map
// containing an embedded API key yields findings with the sourcemaps source
// type and medium confidence.
func TestSourceMap_Sweep_ExtractsFindings(t *testing.T) {
	handler := func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write([]byte(sourceMapFixtureJSON))
	}
	srv := httptest.NewServer(http.HandlerFunc(handler))
	defer srv.Close()

	source := &SourceMapSource{
		BaseURL:  srv.URL,
		Registry: sourceMapTestRegistry(),
		Client:   NewClient(),
	}

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	results := make(chan recon.Finding, 64)
	if err := source.Sweep(ctx, "", results); err != nil {
		t.Fatalf("Sweep err: %v", err)
	}
	close(results)

	total := 0
	for f := range results {
		total++
		if got := f.SourceType; got != "recon:sourcemaps" {
			t.Errorf("unexpected SourceType: %s", got)
		}
		if got := f.Confidence; got != "medium" {
			t.Errorf("unexpected Confidence: %s", got)
		}
	}
	if total == 0 {
		t.Fatal("expected at least one finding")
	}
}
// TestSourceMap_Sweep_NoFindings_OnCleanContent verifies that a source map
// without credential-looking content produces zero findings.
func TestSourceMap_Sweep_NoFindings_OnCleanContent(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write([]byte(sourceMapEmptyFixtureJSON))
	}))
	defer srv.Close()

	source := &SourceMapSource{
		BaseURL:  srv.URL,
		Registry: sourceMapTestRegistry(),
		Client:   NewClient(),
	}

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	results := make(chan recon.Finding, 64)
	if err := source.Sweep(ctx, "", results); err != nil {
		t.Fatalf("Sweep err: %v", err)
	}
	close(results)

	got := 0
	for range results {
		got++
	}
	if got != 0 {
		t.Errorf("expected 0 findings, got %d", got)
	}
}
// TestSourceMap_Sweep_CtxCancelled verifies that Sweep surfaces an error
// when invoked with an already-cancelled context.
func TestSourceMap_Sweep_CtxCancelled(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		time.Sleep(500 * time.Millisecond)
		_, _ = w.Write([]byte(sourceMapFixtureJSON))
	}))
	defer srv.Close()

	source := &SourceMapSource{
		BaseURL:  srv.URL,
		Registry: sourceMapTestRegistry(),
		Limiters: recon.NewLimiterRegistry(),
		Client:   NewClient(),
	}

	ctx, cancel := context.WithCancel(context.Background())
	cancel() // cancel up front: Sweep must bail before doing work

	results := make(chan recon.Finding, 4)
	if err := source.Sweep(ctx, "", results); err == nil {
		t.Fatal("expected ctx error")
	}
}
// TestSourceMap_EnabledAlwaysTrue verifies the source requires no credentials.
func TestSourceMap_EnabledAlwaysTrue(t *testing.T) {
	var src SourceMapSource
	if enabled := src.Enabled(recon.Config{}); !enabled {
		t.Fatal("expected Enabled=true")
	}
}
// TestSourceMap_NameAndRate pins the source's stable identifier and
// rate-limit metadata.
func TestSourceMap_NameAndRate(t *testing.T) {
	src := new(SourceMapSource)
	if got := src.Name(); got != "sourcemaps" {
		t.Errorf("unexpected name: %s", got)
	}
	if got := src.Burst(); got != 2 {
		t.Errorf("burst: %d", got)
	}
	if !src.RespectsRobots() {
		t.Error("expected RespectsRobots=true")
	}
}

View File

@@ -0,0 +1,118 @@
package sources
import (
"context"
"encoding/json"
"net/http"
"regexp"
"time"
"golang.org/x/time/rate"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
// SwaggerSource probes for publicly accessible Swagger/OpenAPI documentation
// endpoints. Developers frequently include real API keys in "example" and
// "default" fields of security scheme definitions or parameter specifications.
type SwaggerSource struct {
	BaseURL  string                 // root URL to probe; Sweep is a no-op when empty
	Registry *providers.Registry    // provider catalog handed to BuildQueries
	Limiters *recon.LimiterRegistry // optional shared rate limiters; may be nil
	Client   *Client                // HTTP client; Sweep falls back to NewClient() when nil
}

// Compile-time check that SwaggerSource implements recon.ReconSource.
var _ recon.ReconSource = (*SwaggerSource)(nil)

// Name returns the stable source identifier ("swagger").
func (s *SwaggerSource) Name() string { return "swagger" }

// RateLimit allows one probe every three seconds.
func (s *SwaggerSource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) }

// Burst permits up to two probes in quick succession.
func (s *SwaggerSource) Burst() int { return 2 }

// RespectsRobots reports whether the source should honor robots.txt; always
// true for this source.
func (s *SwaggerSource) RespectsRobots() bool { return true }

// Enabled always reports true: this source needs no credentials.
func (s *SwaggerSource) Enabled(_ recon.Config) bool { return true }
// swaggerDocPaths are common locations for Swagger/OpenAPI documentation
// across popular server stacks (plain swagger.json/openapi.json plus
// framework-conventional /api-docs, /v2/api-docs and /swagger/v1 layouts).
var swaggerDocPaths = []string{
	"/swagger.json",
	"/openapi.json",
	"/api-docs",
	"/v2/api-docs",
	"/swagger/v1/swagger.json",
	"/docs/openapi.json",
}

// swaggerKeyPattern matches potential API keys in example/default fields of
// Swagger JSON. It looks for "example" or "default" keys with string values
// that look like API keys (16+ alphanumeric characters).
var swaggerKeyPattern = regexp.MustCompile(`"(?:example|default)"\s*:\s*"([a-zA-Z0-9_\-]{16,})"`)
// Sweep probes the configured BaseURL for well-known Swagger/OpenAPI
// documentation paths and emits a Finding whenever a served document
// carries an API-key-looking string in an "example" or "default" field.
//
// Transport and parse errors are treated as "not exposed" and skipped;
// only context cancellation aborts the sweep with an error.
func (s *SwaggerSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
	base := s.BaseURL
	if base == "" {
		return nil
	}
	client := s.Client
	if client == nil {
		client = NewClient()
	}
	queries := BuildQueries(s.Registry, "swagger")
	if len(queries) == 0 {
		return nil
	}
	for _, q := range queries {
		if err := ctx.Err(); err != nil {
			return err
		}
		for _, path := range swaggerDocPaths {
			if err := ctx.Err(); err != nil {
				return err
			}
			if s.Limiters != nil {
				if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
					return err
				}
			}
			probeURL := base + path
			req, err := http.NewRequestWithContext(ctx, http.MethodGet, probeURL, nil)
			if err != nil {
				continue
			}
			req.Header.Set("Accept", "application/json")
			resp, err := client.Do(ctx, req)
			if err != nil {
				continue // transport errors are expected while probing
			}
			// Try to parse as JSON to verify it's a valid Swagger doc,
			// closing the body on every path.
			var doc map[string]interface{}
			decodeErr := json.NewDecoder(resp.Body).Decode(&doc)
			status := resp.StatusCode
			_ = resp.Body.Close()
			if decodeErr != nil {
				continue
			}
			// Only 2xx responses can be genuine documentation endpoints;
			// JSON error pages would otherwise risk false positives.
			if status < 200 || status >= 300 {
				continue
			}
			// Re-marshal into canonical JSON so the regex sees normalized
			// example/default fields regardless of the source formatting.
			raw, err := json.Marshal(doc)
			if err != nil {
				continue
			}
			if !swaggerKeyPattern.Match(raw) {
				continue
			}
			finding := recon.Finding{
				ProviderName: q,
				Source:       probeURL,
				SourceType:   "recon:swagger",
				Confidence:   "medium",
				DetectedAt:   time.Now(),
			}
			// Cancellation-aware send: a bare send could block this
			// goroutine forever if the consumer stops draining out.
			select {
			case out <- finding:
			case <-ctx.Done():
				return ctx.Err()
			}
		}
	}
	return nil
}

View File

@@ -0,0 +1,179 @@
package sources
import (
"context"
"net/http"
"net/http/httptest"
"testing"
"time"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
// swaggerTestRegistry returns a minimal registry with a single provider
// entry for use with BuildQueries in these tests.
func swaggerTestRegistry() *providers.Registry {
	return providers.NewRegistryFromProviders([]providers.Provider{
		{Name: "openai", Keywords: []string{"sk-proj-"}},
	})
}

// swaggerFixtureJSON is an OpenAPI document with key-looking strings in an
// "example" parameter field and a "default" security-scheme field, both of
// which swaggerKeyPattern should flag.
const swaggerFixtureJSON = `{
"openapi": "3.0.0",
"info": {"title": "My API", "version": "1.0"},
"paths": {
"/api/data": {
"get": {
"parameters": [
{
"name": "X-API-Key",
"in": "header",
"schema": {"type": "string"},
"example": "sk-proj-abc123def456ghi789jkl"
}
]
}
}
},
"components": {
"securitySchemes": {
"apiKey": {
"type": "apiKey",
"in": "header",
"name": "Authorization",
"default": "Bearer sk-live-xxxxxxxxxxxxxxxxxxxx"
}
}
}
}`

// swaggerCleanFixtureJSON is an OpenAPI document whose only example value
// is a number, so it must yield no findings.
const swaggerCleanFixtureJSON = `{
"openapi": "3.0.0",
"info": {"title": "My API", "version": "1.0"},
"paths": {
"/api/data": {
"get": {
"parameters": [
{
"name": "limit",
"in": "query",
"schema": {"type": "integer"},
"example": 10
}
]
}
}
}
}`
// TestSwagger_Sweep_ExtractsFindings verifies that a served OpenAPI doc
// with key-looking example/default values yields findings with the swagger
// source type and medium confidence.
func TestSwagger_Sweep_ExtractsFindings(t *testing.T) {
	handler := func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write([]byte(swaggerFixtureJSON))
	}
	srv := httptest.NewServer(http.HandlerFunc(handler))
	defer srv.Close()

	source := &SwaggerSource{
		BaseURL:  srv.URL,
		Registry: swaggerTestRegistry(),
		Client:   NewClient(),
	}

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	results := make(chan recon.Finding, 64)
	if err := source.Sweep(ctx, "", results); err != nil {
		t.Fatalf("Sweep err: %v", err)
	}
	close(results)

	total := 0
	for f := range results {
		total++
		if got := f.SourceType; got != "recon:swagger" {
			t.Errorf("unexpected SourceType: %s", got)
		}
		if got := f.Confidence; got != "medium" {
			t.Errorf("unexpected Confidence: %s", got)
		}
	}
	if total == 0 {
		t.Fatal("expected at least one finding")
	}
}
// TestSwagger_Sweep_NoFindings_OnCleanDoc verifies that an OpenAPI doc with
// no key-looking example/default strings produces zero findings.
func TestSwagger_Sweep_NoFindings_OnCleanDoc(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write([]byte(swaggerCleanFixtureJSON))
	}))
	defer srv.Close()

	source := &SwaggerSource{
		BaseURL:  srv.URL,
		Registry: swaggerTestRegistry(),
		Client:   NewClient(),
	}

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	results := make(chan recon.Finding, 64)
	if err := source.Sweep(ctx, "", results); err != nil {
		t.Fatalf("Sweep err: %v", err)
	}
	close(results)

	got := 0
	for range results {
		got++
	}
	if got != 0 {
		t.Errorf("expected 0 findings, got %d", got)
	}
}
// TestSwagger_Sweep_CtxCancelled verifies that Sweep surfaces an error when
// invoked with an already-cancelled context.
func TestSwagger_Sweep_CtxCancelled(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		time.Sleep(500 * time.Millisecond)
		_, _ = w.Write([]byte(swaggerFixtureJSON))
	}))
	defer srv.Close()

	source := &SwaggerSource{
		BaseURL:  srv.URL,
		Registry: swaggerTestRegistry(),
		Limiters: recon.NewLimiterRegistry(),
		Client:   NewClient(),
	}

	ctx, cancel := context.WithCancel(context.Background())
	cancel() // cancel up front: Sweep must bail before doing work

	results := make(chan recon.Finding, 4)
	if err := source.Sweep(ctx, "", results); err == nil {
		t.Fatal("expected ctx error")
	}
}
// TestSwagger_EnabledAlwaysTrue verifies the source requires no credentials.
func TestSwagger_EnabledAlwaysTrue(t *testing.T) {
	var src SwaggerSource
	if enabled := src.Enabled(recon.Config{}); !enabled {
		t.Fatal("expected Enabled=true")
	}
}
// TestSwagger_NameAndRate pins the source's stable identifier and
// rate-limit metadata.
func TestSwagger_NameAndRate(t *testing.T) {
	src := new(SwaggerSource)
	if got := src.Name(); got != "swagger" {
		t.Errorf("unexpected name: %s", got)
	}
	if got := src.Burst(); got != 2 {
		t.Errorf("burst: %d", got)
	}
	if !src.RespectsRobots() {
		t.Error("expected RespectsRobots=true")
	}
}

View File

@@ -0,0 +1,109 @@
package sources
import (
"context"
"fmt"
"io"
"net/http"
"regexp"
"time"
"golang.org/x/time/rate"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
// WebpackSource probes for Webpack/Vite build artifacts that contain inlined
// environment variables. Bundlers like Webpack and Vite inline process.env.*
// values at build time, frequently shipping API keys to production bundles.
type WebpackSource struct {
	BaseURL  string                 // root URL to probe; Sweep is a no-op when empty
	Registry *providers.Registry    // provider catalog handed to BuildQueries
	Limiters *recon.LimiterRegistry // optional shared rate limiters; may be nil
	Client   *Client                // HTTP client; Sweep falls back to NewClient() when nil
}

// Compile-time check that WebpackSource implements recon.ReconSource.
var _ recon.ReconSource = (*WebpackSource)(nil)

// Name returns the stable source identifier ("webpack").
func (s *WebpackSource) Name() string { return "webpack" }

// RateLimit allows one probe every three seconds.
func (s *WebpackSource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) }

// Burst permits up to two probes in quick succession.
func (s *WebpackSource) Burst() int { return 2 }

// RespectsRobots reports whether the source should honor robots.txt; always
// true for this source.
func (s *WebpackSource) RespectsRobots() bool { return true }

// Enabled always reports true: this source needs no credentials.
func (s *WebpackSource) Enabled(_ recon.Config) bool { return true }
// envVarPattern matches inlined environment variable patterns from bundlers:
// a framework-public prefix (NEXT_PUBLIC_, REACT_APP_, VITE_, VUE_APP_,
// NUXT_, GATSBY_) combined with a credential-ish suffix (API_KEY, SECRET,
// TOKEN, PASSWORD) and a quoted value of 8+ URL-safe characters.
var envVarPattern = regexp.MustCompile(`(?i)(NEXT_PUBLIC_|REACT_APP_|VITE_|VUE_APP_|NUXT_|GATSBY_)[A-Z_]*(API[_]?KEY|SECRET|TOKEN|PASSWORD)['":\s]*[=:,]\s*['"]([a-zA-Z0-9_\-]{8,})['"]`)

// webpackBundlePaths are common locations for JS bundle artifacts across
// typical bundler output layouts (CRA /static/js and /build, Next.js /_next,
// Vite /assets, plain /dist builds).
var webpackBundlePaths = []string{
	"/static/js/main.js",
	"/static/js/bundle.js",
	"/_next/static/chunks/main.js",
	"/assets/index.js",
	"/dist/bundle.js",
	"/build/static/js/main.js",
}
// Sweep probes the configured BaseURL for common JS bundle locations and
// emits a Finding when a bundle body contains an inlined, credential-ish
// environment variable. At most one finding is emitted per query.
//
// Transport and read errors are treated as "not exposed" and skipped; only
// context cancellation aborts the sweep with an error.
func (s *WebpackSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
	base := s.BaseURL
	if base == "" {
		return nil
	}
	client := s.Client
	if client == nil {
		client = NewClient()
	}
	queries := BuildQueries(s.Registry, "webpack")
	if len(queries) == 0 {
		return nil
	}
	for _, q := range queries {
		if err := ctx.Err(); err != nil {
			return err
		}
		for _, path := range webpackBundlePaths {
			if err := ctx.Err(); err != nil {
				return err
			}
			if s.Limiters != nil {
				if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
					return err
				}
			}
			probeURL := fmt.Sprintf("%s%s", base, path)
			req, err := http.NewRequestWithContext(ctx, http.MethodGet, probeURL, nil)
			if err != nil {
				continue
			}
			resp, err := client.Do(ctx, req)
			if err != nil {
				continue // transport errors are expected while probing
			}
			// Cap the read at 512KB so a huge bundle cannot exhaust
			// memory, and close the body promptly so the transport can
			// reuse the connection.
			body, readErr := io.ReadAll(io.LimitReader(resp.Body, 512*1024)) // 512KB max
			status := resp.StatusCode
			_ = resp.Body.Close()
			if readErr != nil {
				continue
			}
			// Only 2xx responses can be genuine bundles; error pages would
			// otherwise risk false positives.
			if status < 200 || status >= 300 {
				continue
			}
			if !envVarPattern.Match(body) {
				continue
			}
			finding := recon.Finding{
				ProviderName: q,
				Source:       probeURL,
				SourceType:   "recon:webpack",
				Confidence:   "medium",
				DetectedAt:   time.Now(),
			}
			// Cancellation-aware send: a bare send could block this
			// goroutine forever if the consumer stops draining out.
			select {
			case out <- finding:
			case <-ctx.Done():
				return ctx.Err()
			}
			break // one finding per query is sufficient
		}
	}
	return nil
}

View File

@@ -0,0 +1,146 @@
package sources
import (
"context"
"net/http"
"net/http/httptest"
"testing"
"time"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
// webpackTestRegistry returns a minimal registry with a single provider
// entry for use with BuildQueries in these tests.
func webpackTestRegistry() *providers.Registry {
	return providers.NewRegistryFromProviders([]providers.Provider{
		{Name: "openai", Keywords: []string{"sk-proj-"}},
	})
}

// webpackFixtureJS mimics a bundle with inlined credential-bearing env vars
// that envVarPattern should flag.
const webpackFixtureJS = `
!function(e){var t={};function n(r){if(t[r])return t[r].exports}
var config = {
NEXT_PUBLIC_API_KEY: "sk-proj-abc123def456ghi789jkl",
REACT_APP_SECRET: "super-secret-value-12345678"
};
module.exports = config;
`

// webpackCleanJS is a harmless bundle that must yield no findings.
const webpackCleanJS = `
!function(e){var t={};function n(r){if(t[r])return t[r].exports}
console.log("clean bundle");
module.exports = {};
`
// TestWebpack_Sweep_ExtractsFindings verifies that a served bundle with
// inlined credential env vars yields findings with the webpack source type
// and medium confidence.
func TestWebpack_Sweep_ExtractsFindings(t *testing.T) {
	handler := func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "application/javascript")
		_, _ = w.Write([]byte(webpackFixtureJS))
	}
	srv := httptest.NewServer(http.HandlerFunc(handler))
	defer srv.Close()

	source := &WebpackSource{
		BaseURL:  srv.URL,
		Registry: webpackTestRegistry(),
		Client:   NewClient(),
	}

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	results := make(chan recon.Finding, 64)
	if err := source.Sweep(ctx, "", results); err != nil {
		t.Fatalf("Sweep err: %v", err)
	}
	close(results)

	total := 0
	for f := range results {
		total++
		if got := f.SourceType; got != "recon:webpack" {
			t.Errorf("unexpected SourceType: %s", got)
		}
		if got := f.Confidence; got != "medium" {
			t.Errorf("unexpected Confidence: %s", got)
		}
	}
	if total == 0 {
		t.Fatal("expected at least one finding")
	}
}
// TestWebpack_Sweep_NoFindings_OnCleanBundle verifies that a bundle with no
// inlined credential env vars produces zero findings.
func TestWebpack_Sweep_NoFindings_OnCleanBundle(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "application/javascript")
		_, _ = w.Write([]byte(webpackCleanJS))
	}))
	defer srv.Close()

	source := &WebpackSource{
		BaseURL:  srv.URL,
		Registry: webpackTestRegistry(),
		Client:   NewClient(),
	}

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	results := make(chan recon.Finding, 64)
	if err := source.Sweep(ctx, "", results); err != nil {
		t.Fatalf("Sweep err: %v", err)
	}
	close(results)

	got := 0
	for range results {
		got++
	}
	if got != 0 {
		t.Errorf("expected 0 findings, got %d", got)
	}
}
// TestWebpack_Sweep_CtxCancelled verifies that Sweep surfaces an error when
// invoked with an already-cancelled context.
func TestWebpack_Sweep_CtxCancelled(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		time.Sleep(500 * time.Millisecond)
		_, _ = w.Write([]byte(webpackFixtureJS))
	}))
	defer srv.Close()

	source := &WebpackSource{
		BaseURL:  srv.URL,
		Registry: webpackTestRegistry(),
		Limiters: recon.NewLimiterRegistry(),
		Client:   NewClient(),
	}

	ctx, cancel := context.WithCancel(context.Background())
	cancel() // cancel up front: Sweep must bail before doing work

	results := make(chan recon.Finding, 4)
	if err := source.Sweep(ctx, "", results); err == nil {
		t.Fatal("expected ctx error")
	}
}
// TestWebpack_EnabledAlwaysTrue verifies the source requires no credentials.
func TestWebpack_EnabledAlwaysTrue(t *testing.T) {
	var src WebpackSource
	if enabled := src.Enabled(recon.Config{}); !enabled {
		t.Fatal("expected Enabled=true")
	}
}
// TestWebpack_NameAndRate pins the source's stable identifier and
// rate-limit metadata.
func TestWebpack_NameAndRate(t *testing.T) {
	src := new(WebpackSource)
	if got := src.Name(); got != "webpack" {
		t.Errorf("unexpected name: %s", got)
	}
	if got := src.Burst(); got != 2 {
		t.Errorf("burst: %d", got)
	}
	if !src.RespectsRobots() {
		t.Error("expected RespectsRobots=true")
	}
}