Compare commits

...

14 Commits

Author SHA1 Message Date
salvacybersec
6f392b0b17 docs(phase-11): complete OSINT search & paste 2026-04-06 12:09:48 +03:00
salvacybersec
90d188fe9e docs(11-03): complete RegisterAll wiring + integration test plan
- SUMMARY.md with 18-source wiring details
- STATE.md updated with Phase 11 completion
- ROADMAP.md phase progress updated
- REQUIREMENTS.md: RECON-PASTE-01 marked complete
2026-04-06 12:07:56 +03:00
salvacybersec
bebc3e7a0b test(11-03): add end-to-end SweepAll integration test across all 18 sources
- Extend httptest mux with fixtures for Google, Bing, DuckDuckGo, Yandex, Brave
- Add Pastebin (routed /pb/), GistPaste (/gp/), PasteSites (injected platform)
- Assert all 18 SourceTypes emit at least one finding via SweepAll
2026-04-06 12:06:27 +03:00
salvacybersec
3250408f23 feat(11-03): wire 18 sources into RegisterAll + credential wiring in cmd/recon.go
- Extend SourcesConfig with GoogleAPIKey, GoogleCX, BingAPIKey, YandexUser, YandexAPIKey, BraveAPIKey
- RegisterAll registers 8 Phase 11 sources alongside 10 Phase 10 sources (18 total)
- cmd/recon.go reads search engine API keys from env vars and viper config
- Guardrail tests updated to assert 18 sources
2026-04-06 12:02:11 +03:00
salvacybersec
a53d952518 Merge branch 'worktree-agent-a27c3406' 2026-04-06 11:58:19 +03:00
salvacybersec
10ae94115f Merge branch 'worktree-agent-a6700ee2' 2026-04-06 11:57:39 +03:00
salvacybersec
da0bf800f9 docs(11-02): complete paste site sources plan
- SUMMARY.md for PastebinSource, GistPasteSource, PasteSitesSource
2026-04-06 11:57:21 +03:00
salvacybersec
61a9d527ee docs(11-01): complete search engine dorking sources plan
- SUMMARY.md for 5 search engine sources (Google, Bing, DuckDuckGo, Yandex, Brave)
- STATE.md updated with position and decisions
- Requirements RECON-DORK-01/02/03 marked complete
2026-04-06 11:55:46 +03:00
salvacybersec
ed148d47e1 feat(11-02): add PasteSitesSource multi-paste aggregator
- Aggregates dpaste, paste.ee, rentry, hastebin into single source
- Follows SandboxesSource multi-platform pattern with per-platform error isolation
- Two-phase search+raw-fetch with keyword matching against provider registry
2026-04-06 11:55:44 +03:00
salvacybersec
770705302c feat(11-01): add DuckDuckGoSource, YandexSource, and BraveSource
- DuckDuckGoSource scrapes HTML search (no API key, always enabled, RespectsRobots=true)
- YandexSource uses Yandex XML Search API (user+key required, XML response parsing)
- BraveSource uses Brave Search API (X-Subscription-Token header, JSON response)
- All three follow established error handling: 401 aborts, transient continues, ctx cancellation returns
2026-04-06 11:54:42 +03:00
salvacybersec
7272e65207 feat(11-01): add GoogleDorkSource and BingDorkSource with formatQuery updates
- GoogleDorkSource uses Google Custom Search JSON API (APIKey+CX required)
- BingDorkSource uses Bing Web Search API v7 (Ocp-Apim-Subscription-Key header)
- formatQuery now handles google/bing/duckduckgo/yandex/brave dork syntax
- Both sources follow established pattern: retry via Client, rate limit via LimiterRegistry
2026-04-06 11:54:36 +03:00
salvacybersec
3c500b5473 feat(11-02): add PastebinSource and GistPasteSource for paste site scanning
- PastebinSource: two-phase search+raw-fetch with keyword matching
- GistPasteSource: scrapes gist.github.com public search (no auth)
- Both implement recon.ReconSource with httptest-based tests
2026-04-06 11:53:00 +03:00
salvacybersec
f8b06055ef docs(11): create phase plan — 3 plans for search engine dorking + paste sites 2026-04-06 11:50:38 +03:00
salvacybersec
9ad9767109 docs(11-16): auto-generated OSINT phase contexts 2026-04-06 11:40:44 +03:00
36 changed files with 3985 additions and 42 deletions

View File

@@ -115,13 +115,13 @@ Requirements for initial release. Each maps to roadmap phases.
### OSINT/Recon — Search Engine Dorking
- [ ] **RECON-DORK-01**: Google dorking via Custom Search API / SerpAPI with 100+ built-in dorks
- [ ] **RECON-DORK-02**: Bing dorking via Azure Cognitive Services
- [ ] **RECON-DORK-03**: DuckDuckGo, Yandex, Brave search integration
- [x] **RECON-DORK-01**: Google dorking via Custom Search API / SerpAPI with 100+ built-in dorks
- [x] **RECON-DORK-02**: Bing dorking via Azure Cognitive Services
- [x] **RECON-DORK-03**: DuckDuckGo, Yandex, Brave search integration
### OSINT/Recon — Paste Sites
- [ ] **RECON-PASTE-01**: Multi-paste aggregator (Pastebin, dpaste, paste.ee, rentry, hastebin, ix.io, etc.)
- [x] **RECON-PASTE-01**: Multi-paste aggregator (Pastebin, dpaste, paste.ee, rentry, hastebin, ix.io, etc.)
### OSINT/Recon — Package Registries
@@ -302,7 +302,7 @@ Requirements for initial release. Each maps to roadmap phases.
| RECON-CODE-01, RECON-CODE-02, RECON-CODE-03, RECON-CODE-04, RECON-CODE-05 | Phase 10 | Pending |
| RECON-CODE-06, RECON-CODE-07, RECON-CODE-08, RECON-CODE-09, RECON-CODE-10 | Phase 10 | Pending |
| RECON-DORK-01, RECON-DORK-02, RECON-DORK-03 | Phase 11 | Pending |
| RECON-PASTE-01 | Phase 11 | Pending |
| RECON-PASTE-01 | Phase 11 | Complete |
| RECON-IOT-01, RECON-IOT-02, RECON-IOT-03, RECON-IOT-04, RECON-IOT-05, RECON-IOT-06 | Phase 12 | Pending |
| RECON-CLOUD-01, RECON-CLOUD-02, RECON-CLOUD-03, RECON-CLOUD-04 | Phase 12 | Pending |
| RECON-PKG-01, RECON-PKG-02, RECON-PKG-03 | Phase 13 | Pending |

View File

@@ -22,7 +22,7 @@ Decimal phases appear between their surrounding integers in numeric order.
- [ ] **Phase 8: Dork Engine** - YAML-based dork definitions with 150+ built-in dorks and management commands
- [ ] **Phase 9: OSINT Infrastructure** - Per-source rate limiter architecture and recon engine framework before any sources
- [x] **Phase 10: OSINT Code Hosting** - GitHub, GitLab, Bitbucket, HuggingFace and 6 more code hosting sources (completed 2026-04-05)
- [ ] **Phase 11: OSINT Search & Paste** - Search engine dorking and paste site aggregation
- [x] **Phase 11: OSINT Search & Paste** - Search engine dorking and paste site aggregation (completed 2026-04-06)
- [ ] **Phase 12: OSINT IoT & Cloud Storage** - Shodan/Censys/ZoomEye/FOFA and S3/GCS/Azure cloud storage scanning
- [ ] **Phase 13: OSINT Package Registries & Container/IaC** - npm/PyPI/crates.io and Docker Hub/K8s/Terraform scanning
- [ ] **Phase 14: OSINT CI/CD Logs, Web Archives & Frontend Leaks** - Build logs, Wayback Machine, and JS bundle/env scanning
@@ -235,7 +235,12 @@ Plans:
1. `keyhunter recon --sources=google` runs built-in dorks via Google Custom Search API or SerpAPI and returns results with the dork query that triggered each finding
2. `keyhunter recon --sources=bing` executes dorks via Azure Cognitive Services and `--sources=duckduckgo,yandex,brave` via their respective integrations
3. `keyhunter recon --sources=paste` queries Pastebin API and scrapes 15+ additional paste sites, feeding raw content through the detection pipeline
**Plans**: TBD
**Plans**: 3 plans
Plans:
- [x] 11-01-PLAN.md — GoogleDorkSource + BingDorkSource + DuckDuckGoSource + YandexSource + BraveSource (RECON-DORK-01, RECON-DORK-02, RECON-DORK-03)
- [x] 11-02-PLAN.md — PastebinSource + GistPasteSource + PasteSitesSource multi-paste aggregator (RECON-PASTE-01)
- [x] 11-03-PLAN.md — RegisterAll wiring + cmd/recon.go credentials + integration test (all Phase 11 reqs)
### Phase 12: OSINT IoT & Cloud Storage
**Goal**: Users can discover exposed LLM endpoints via IoT scanners (Shodan, Censys, ZoomEye, FOFA, Netlas, BinaryEdge) and scan publicly accessible cloud storage buckets (S3, GCS, Azure Blob, MinIO, GrayHatWarfare) for leaked keys
@@ -337,7 +342,7 @@ Phases execute in numeric order: 1 → 2 → 3 → ... → 18
| 8. Dork Engine | 0/? | Not started | - |
| 9. OSINT Infrastructure | 2/6 | In Progress| |
| 10. OSINT Code Hosting | 9/9 | Complete | 2026-04-06 |
| 11. OSINT Search & Paste | 0/? | Not started | - |
| 11. OSINT Search & Paste | 3/3 | Complete | 2026-04-06 |
| 12. OSINT IoT & Cloud Storage | 0/? | Not started | - |
| 13. OSINT Package Registries & Container/IaC | 0/? | Not started | - |
| 14. OSINT CI/CD Logs, Web Archives & Frontend Leaks | 0/? | Not started | - |

View File

@@ -2,15 +2,15 @@
gsd_state_version: 1.0
milestone: v1.0
milestone_name: milestone
status: executing
stopped_at: Completed 10-09-PLAN.md
last_updated: "2026-04-06T08:38:31.363Z"
status: completed
stopped_at: Completed 11-03-PLAN.md
last_updated: "2026-04-06T09:09:48.100Z"
last_activity: 2026-04-06
progress:
total_phases: 18
completed_phases: 10
total_plans: 62
completed_plans: 63
completed_phases: 11
total_plans: 65
completed_plans: 66
percent: 20
---
@@ -21,13 +21,13 @@ progress:
See: .planning/PROJECT.md (updated 2026-04-04)
**Core value:** Detect leaked LLM API keys across more providers and more internet sources than any other tool, with active verification to confirm keys are real and alive.
**Current focus:** Phase 10 — osint-code-hosting
**Current focus:** Phase 11 — osint-search-paste (complete)
## Current Position
Phase: 11
Phase: 12
Plan: Not started
Status: Ready to execute
Status: Phase 11 complete
Last activity: 2026-04-06
Progress: [██░░░░░░░░] 20%
@@ -89,6 +89,8 @@ Progress: [██░░░░░░░░] 20%
| Phase 10-osint-code-hosting P02 | 5min | 1 tasks | 2 files |
| Phase 10-osint-code-hosting P07 | 6 | 2 tasks | 6 files |
| Phase 10 P09 | 12min | 2 tasks | 5 files |
| Phase 11 P03 | 6min | 2 tasks | 4 files |
| Phase 11 P01 | 3min | 2 tasks | 11 files |
## Accumulated Context
@@ -126,6 +128,9 @@ Recent decisions affecting current work:
- [Phase 10-osint-code-hosting]: github/gist use 'kw' in:file; all other sources use bare keyword
- [Phase 10-osint-code-hosting]: GitHubSource reuses shared sources.Client + LimiterRegistry; builds queries from providers.Registry via BuildQueries; missing token disables (not errors)
- [Phase 10]: RegisterAll registers all ten Phase 10 sources unconditionally; missing credentials flip Enabled()==false rather than hiding sources from the CLI catalog
- [Phase 11]: RegisterAll extended to 18 sources (10 Phase 10 + 8 Phase 11); paste sources use BaseURL prefix in integration test to avoid /search path collision
- [Phase 11]: Integration test uses injected test platforms for PasteSites (same pattern as SandboxesSource)
- [Phase 11]: All five search sources use dork query format to focus on paste/code hosting leak sites
### Pending Todos
@@ -140,6 +145,6 @@ None yet.
## Session Continuity
Last session: 2026-04-05T22:28:27.412Z
Stopped at: Completed 10-09-PLAN.md
Last session: 2026-04-06T09:07:51.980Z
Stopped at: Completed 11-03-PLAN.md
Resume file: None

View File

@@ -0,0 +1,241 @@
---
phase: 11-osint-search-paste
plan: 01
type: execute
wave: 1
depends_on: []
files_modified:
- pkg/recon/sources/google.go
- pkg/recon/sources/google_test.go
- pkg/recon/sources/bing.go
- pkg/recon/sources/bing_test.go
- pkg/recon/sources/duckduckgo.go
- pkg/recon/sources/duckduckgo_test.go
- pkg/recon/sources/yandex.go
- pkg/recon/sources/yandex_test.go
- pkg/recon/sources/brave.go
- pkg/recon/sources/brave_test.go
- pkg/recon/sources/queries.go
autonomous: true
requirements: [RECON-DORK-01, RECON-DORK-02, RECON-DORK-03]
must_haves:
truths:
- "Google dorking source searches via Google Custom Search JSON API and emits findings with dork query context"
- "Bing dorking source searches via Bing Web Search API and emits findings"
- "DuckDuckGo, Yandex, and Brave sources each search their respective APIs/endpoints and emit findings"
- "All five sources respect ctx cancellation and use LimiterRegistry for rate limiting"
- "Missing API keys disable the source (Enabled=false) without error"
artifacts:
- path: "pkg/recon/sources/google.go"
provides: "GoogleDorkSource implementing recon.ReconSource"
contains: "func (s *GoogleDorkSource) Sweep"
- path: "pkg/recon/sources/bing.go"
provides: "BingDorkSource implementing recon.ReconSource"
contains: "func (s *BingDorkSource) Sweep"
- path: "pkg/recon/sources/duckduckgo.go"
provides: "DuckDuckGoSource implementing recon.ReconSource"
contains: "func (s *DuckDuckGoSource) Sweep"
- path: "pkg/recon/sources/yandex.go"
provides: "YandexSource implementing recon.ReconSource"
contains: "func (s *YandexSource) Sweep"
- path: "pkg/recon/sources/brave.go"
provides: "BraveSource implementing recon.ReconSource"
contains: "func (s *BraveSource) Sweep"
key_links:
- from: "pkg/recon/sources/google.go"
to: "pkg/recon/sources/httpclient.go"
via: "sources.Client for HTTP with retry"
pattern: "client\\.Do"
- from: "pkg/recon/sources/queries.go"
to: "all five search sources"
via: "formatQuery switch cases"
pattern: "case \"google\"|\"bing\"|\"duckduckgo\"|\"yandex\"|\"brave\""
---
<objective>
Implement five search engine dorking ReconSource implementations: GoogleDorkSource, BingDorkSource, DuckDuckGoSource, YandexSource, and BraveSource.
Purpose: RECON-DORK-01/02/03 -- enable automated search engine dorking for API key leak detection across all major search engines.
Output: Five source files + tests, updated queries.go formatQuery.
</objective>
<execution_context>
@$HOME/.claude/get-shit-done/workflows/execute-plan.md
@$HOME/.claude/get-shit-done/templates/summary.md
</execution_context>
<context>
@.planning/PROJECT.md
@.planning/ROADMAP.md
@.planning/STATE.md
@pkg/recon/source.go
@pkg/recon/sources/httpclient.go
@pkg/recon/sources/queries.go
@pkg/recon/sources/github.go (reference pattern for API-backed source)
@pkg/recon/sources/replit.go (reference pattern for scraping source)
<interfaces>
<!-- Executor needs these contracts -->
From pkg/recon/source.go:
```go
type ReconSource interface {
Name() string
RateLimit() rate.Limit
Burst() int
RespectsRobots() bool
Enabled(cfg Config) bool
Sweep(ctx context.Context, query string, out chan<- Finding) error
}
```
From pkg/recon/sources/httpclient.go:
```go
type Client struct { HTTP *http.Client; MaxRetries int; UserAgent string }
func NewClient() *Client
func (c *Client) Do(ctx context.Context, req *http.Request) (*http.Response, error)
```
From pkg/recon/sources/queries.go:
```go
func BuildQueries(reg *providers.Registry, source string) []string
func formatQuery(source, keyword string) string // needs new cases
```
From pkg/recon/sources/register.go:
```go
type SourcesConfig struct { ... } // will be extended in Plan 11-03
```
</interfaces>
</context>
<tasks>
<task type="auto" tdd="true">
<name>Task 1: GoogleDorkSource + BingDorkSource + formatQuery updates</name>
<files>pkg/recon/sources/google.go, pkg/recon/sources/google_test.go, pkg/recon/sources/bing.go, pkg/recon/sources/bing_test.go, pkg/recon/sources/queries.go</files>
<behavior>
- GoogleDorkSource.Name() == "google"
- GoogleDorkSource.RateLimit() == rate.Every(1*time.Second) (Google Custom Search: 100/day free, be conservative)
- GoogleDorkSource.Burst() == 1
- GoogleDorkSource.RespectsRobots() == false (authenticated API)
- GoogleDorkSource.Enabled() == true only when APIKey AND CX (search engine ID) are both non-empty
- GoogleDorkSource.Sweep() calls Google Custom Search JSON API: GET https://www.googleapis.com/customsearch/v1?key={key}&cx={cx}&q={query}&num=10
- Each search result item emits a Finding with Source=item.link, SourceType="recon:google", Confidence="low"
- BingDorkSource.Name() == "bing"
- BingDorkSource.RateLimit() == rate.Every(500*time.Millisecond) (Bing allows 3 TPS on S1 tier)
- BingDorkSource.Enabled() == true only when APIKey is non-empty
- BingDorkSource.Sweep() calls Bing Web Search API v7: GET https://api.bing.microsoft.com/v7.0/search?q={query}&count=50 with Ocp-Apim-Subscription-Key header
- Each webPages.value item emits Finding with Source=item.url, SourceType="recon:bing"
- formatQuery("google", kw) returns `site:pastebin.com OR site:github.com "{kw}"` (dork-style)
- formatQuery("bing", kw) returns same dork-style format
- ctx cancellation aborts both sources promptly
- Transient HTTP errors (429/5xx) are retried via sources.Client; 401 aborts sweep
</behavior>
<action>
Create `pkg/recon/sources/google.go`:
- Struct: `GoogleDorkSource` with fields: APIKey string, CX string, BaseURL string, Registry *providers.Registry, Limiters *recon.LimiterRegistry, client *Client
- Compile-time interface assertion: `var _ recon.ReconSource = (*GoogleDorkSource)(nil)`
- Name() returns "google"
- RateLimit() returns rate.Every(1*time.Second)
- Burst() returns 1
- RespectsRobots() returns false
- Enabled() returns s.APIKey != "" && s.CX != ""
- Sweep(): iterate BuildQueries(registry, "google"), for each query: wait on LimiterRegistry, build GET request to `{BaseURL}/customsearch/v1?key={APIKey}&cx={CX}&q={url.QueryEscape(q)}&num=10`, set Accept: application/json, call client.Do, decode JSON response `{ items: [{ title, link, snippet }] }`, emit Finding per item with Source=link, SourceType="recon:google", ProviderName from keyword index (same pattern as githubKeywordIndex), Confidence="low". On 401 abort; on transient error continue to next query.
- Private response structs: googleSearchResponse, googleSearchItem
Create `pkg/recon/sources/bing.go`:
- Struct: `BingDorkSource` with fields: APIKey string, BaseURL string, Registry *providers.Registry, Limiters *recon.LimiterRegistry, client *Client
- Name() returns "bing"
- RateLimit() returns rate.Every(500*time.Millisecond)
- Burst() returns 2
- RespectsRobots() returns false
- Enabled() returns s.APIKey != ""
- Sweep(): iterate BuildQueries(registry, "bing"), for each: wait on limiter, GET `{BaseURL}/v7.0/search?q={query}&count=50`, set Ocp-Apim-Subscription-Key header, decode JSON `{ webPages: { value: [{ name, url, snippet }] } }`, emit Finding per value item with Source=url, SourceType="recon:bing". Same error handling pattern.
- Private response structs: bingSearchResponse, bingWebPages, bingWebResult
Update `pkg/recon/sources/queries.go` formatQuery():
- Add cases for "google", "bing", "duckduckgo", "yandex", "brave" that return the keyword wrapped in dork syntax: `site:pastebin.com OR site:github.com "%s"` using fmt.Sprintf with the keyword. This focuses search results on paste/code hosting sites where keys leak.
Create test files with httptest servers returning canned JSON fixtures. Each test:
- Verifies Sweep emits correct number of findings
- Verifies SourceType is correct
- Verifies Source URLs match fixture data
- Verifies Enabled() behavior with/without credentials
- Verifies ctx cancellation returns error
</action>
<verify>
<automated>cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run "TestGoogle|TestBing" -v -count=1</automated>
</verify>
<done>GoogleDorkSource and BingDorkSource pass all tests. formatQuery handles google/bing cases.</done>
</task>
<task type="auto" tdd="true">
<name>Task 2: DuckDuckGoSource + YandexSource + BraveSource</name>
<files>pkg/recon/sources/duckduckgo.go, pkg/recon/sources/duckduckgo_test.go, pkg/recon/sources/yandex.go, pkg/recon/sources/yandex_test.go, pkg/recon/sources/brave.go, pkg/recon/sources/brave_test.go</files>
<behavior>
- DuckDuckGoSource.Name() == "duckduckgo"
- DuckDuckGoSource.RateLimit() == rate.Every(2*time.Second) (no official API, scrape-conservative)
- DuckDuckGoSource.RespectsRobots() == true (HTML scraper)
- DuckDuckGoSource.Enabled() always true (no API key needed -- uses DuckDuckGo HTML search)
- DuckDuckGoSource.Sweep() GETs `https://html.duckduckgo.com/html/?q={query}`, parses HTML for result links in <a class="result__a" href="..."> anchors, emits Findings
- YandexSource.Name() == "yandex"
- YandexSource.RateLimit() == rate.Every(1*time.Second)
- YandexSource.RespectsRobots() == false (uses Yandex XML search API)
- YandexSource.Enabled() == true only when User and APIKey are both non-empty
- YandexSource.Sweep() GETs `https://yandex.com/search/xml?user={user}&key={key}&query={q}&l10n=en&sortby=rlv&filter=none&groupby=attr%3D%22%22.mode%3Dflat.groups-on-page%3D50`, parses XML response for <url> elements
- BraveSource.Name() == "brave"
- BraveSource.RateLimit() == rate.Every(1*time.Second) (Brave Search API: 1 QPS free tier)
- BraveSource.Enabled() == true only when APIKey is non-empty
- BraveSource.Sweep() GETs `https://api.search.brave.com/res/v1/web/search?q={query}&count=20` with X-Subscription-Token header, decodes JSON { web: { results: [{ url, title }] } }, emits Findings
</behavior>
<action>
Create `pkg/recon/sources/duckduckgo.go`:
- Struct: `DuckDuckGoSource` with BaseURL, Registry, Limiters, Client fields
- Name() "duckduckgo", RateLimit() Every(2s), Burst() 1, RespectsRobots() true
- Enabled() always true (credential-free, like Replit)
- Sweep(): iterate BuildQueries(registry, "duckduckgo"), for each: wait limiter, GET `{BaseURL}/html/?q={query}`, parse HTML using golang.org/x/net/html (same as Replit pattern), extract href from `<a class="result__a">` or `<a class="result__url">` elements. Use a regex or attribute check: look for <a> tags whose class contains "result__a". Emit Finding with Source=extracted URL, SourceType="recon:duckduckgo". Deduplicate results within the same query.
Create `pkg/recon/sources/yandex.go`:
- Struct: `YandexSource` with User, APIKey, BaseURL, Registry, Limiters, client fields
- Name() "yandex", RateLimit() Every(1s), Burst() 1, RespectsRobots() false
- Enabled() returns s.User != "" && s.APIKey != ""
- Sweep(): iterate BuildQueries, for each: wait limiter, GET `{BaseURL}/search/xml?user={User}&key={APIKey}&query={url.QueryEscape(q)}&l10n=en&sortby=rlv&filter=none&groupby=attr%3D%22%22.mode%3Dflat.groups-on-page%3D50`, decode XML using encoding/xml. Response structure: `<yandexsearch><response><results><grouping><group><doc><url>...</url></doc></group></grouping></results></response></yandexsearch>`. Emit Finding per <url>. SourceType="recon:yandex".
Create `pkg/recon/sources/brave.go`:
- Struct: `BraveSource` with APIKey, BaseURL, Registry, Limiters, client fields
- Name() "brave", RateLimit() Every(1s), Burst() 1, RespectsRobots() false
- Enabled() returns s.APIKey != ""
- Sweep(): iterate BuildQueries, for each: wait limiter, GET `{BaseURL}/res/v1/web/search?q={query}&count=20`, set X-Subscription-Token header to APIKey, Accept: application/json. Decode JSON `{ web: { results: [{ url, title, description }] } }`. Emit Finding per result. SourceType="recon:brave".
All three follow the same error handling pattern as Task 1: 401 aborts, transient errors continue, ctx cancellation returns immediately.
Create test files with httptest servers. DuckDuckGo test serves HTML fixture with result anchors. Yandex test serves XML fixture. Brave test serves JSON fixture. Each test covers: Sweep emits findings, SourceType correct, Enabled behavior, ctx cancellation.
</action>
<verify>
<automated>cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run "TestDuckDuckGo|TestYandex|TestBrave" -v -count=1</automated>
</verify>
<done>DuckDuckGoSource, YandexSource, and BraveSource pass all tests. All five search sources complete.</done>
</task>
</tasks>
<verification>
All five search engine sources compile and pass unit tests:
```bash
cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run "TestGoogle|TestBing|TestDuckDuckGo|TestYandex|TestBrave" -v -count=1
```
</verification>
<success_criteria>
- 5 new source files exist in pkg/recon/sources/ (google.go, bing.go, duckduckgo.go, yandex.go, brave.go)
- Each source implements recon.ReconSource with compile-time assertion
- Each has a corresponding _test.go file with httptest-based tests
- formatQuery in queries.go handles all 5 new source names
- All tests pass
</success_criteria>
<output>
After completion, create `.planning/phases/11-osint_search_paste/11-01-SUMMARY.md`
</output>

View File

@@ -0,0 +1,117 @@
---
phase: 11-osint-search-paste
plan: 01
subsystem: recon
tags: [google-custom-search, bing-web-search, duckduckgo, yandex-xml, brave-search, dorking, osint]
requires:
- phase: 10-osint-code-hosting
provides: "ReconSource interface, sources.Client, LimiterRegistry, BuildQueries/formatQuery"
provides:
- "GoogleDorkSource - Google Custom Search JSON API dorking"
- "BingDorkSource - Bing Web Search API v7 dorking"
- "DuckDuckGoSource - HTML scraping (credential-free)"
- "YandexSource - Yandex XML Search API dorking"
- "BraveSource - Brave Search API dorking"
- "formatQuery cases for all five search engines"
affects: [11-osint-search-paste, 11-03 RegisterAll wiring]
tech-stack:
added: [encoding/xml for Yandex XML parsing]
patterns: [search-engine dork query format via formatQuery, XML API response parsing]
key-files:
created:
- pkg/recon/sources/google.go
- pkg/recon/sources/google_test.go
- pkg/recon/sources/bing.go
- pkg/recon/sources/bing_test.go
- pkg/recon/sources/duckduckgo.go
- pkg/recon/sources/duckduckgo_test.go
- pkg/recon/sources/yandex.go
- pkg/recon/sources/yandex_test.go
- pkg/recon/sources/brave.go
- pkg/recon/sources/brave_test.go
modified:
- pkg/recon/sources/queries.go
key-decisions:
- "All five search sources use dork query format: site:pastebin.com OR site:github.com \"keyword\" to focus on paste/code hosting leak sites"
- "DuckDuckGo is credential-free (HTML scraping) with RespectsRobots=true; other four require API keys"
- "Yandex uses encoding/xml for XML response parsing; all others use encoding/json"
- "extractGoogleKeyword reverse-parser shared by Bing/Yandex/Brave for keyword-to-provider mapping"
patterns-established:
- "Search engine dork sources: same Sweep loop pattern as Phase 10 code hosting sources"
- "XML API sources: encoding/xml with nested struct unmarshaling (Yandex)"
requirements-completed: [RECON-DORK-01, RECON-DORK-02, RECON-DORK-03]
duration: 3min
completed: 2026-04-06
---
# Phase 11 Plan 01: Search Engine Dorking Sources Summary
**Five search engine dorking ReconSource implementations (Google, Bing, DuckDuckGo, Yandex, Brave) with dork-style queries targeting paste/code hosting sites**
## Performance
- **Duration:** 3 min
- **Started:** 2026-04-06T08:51:30Z
- **Completed:** 2026-04-06T08:54:52Z
- **Tasks:** 2
- **Files modified:** 11
## Accomplishments
- GoogleDorkSource and BingDorkSource with JSON API integration and httptest-based tests
- DuckDuckGoSource with HTML scraping (credential-free, RespectsRobots=true)
- YandexSource with XML Search API and encoding/xml response parsing
- BraveSource with Brave Search API and X-Subscription-Token auth
- formatQuery updated with dork syntax for all five search engines
## Task Commits
Each task was committed atomically:
1. **Task 1: GoogleDorkSource + BingDorkSource + formatQuery updates** - `7272e65` (feat)
2. **Task 2: DuckDuckGoSource + YandexSource + BraveSource** - `7707053` (feat)
## Files Created/Modified
- `pkg/recon/sources/google.go` - Google Custom Search JSON API source (APIKey + CX required)
- `pkg/recon/sources/google_test.go` - Google source tests (enabled, sweep, cancel, unauth)
- `pkg/recon/sources/bing.go` - Bing Web Search API v7 source (Ocp-Apim-Subscription-Key)
- `pkg/recon/sources/bing_test.go` - Bing source tests
- `pkg/recon/sources/duckduckgo.go` - DuckDuckGo HTML scraper (no API key, always enabled)
- `pkg/recon/sources/duckduckgo_test.go` - DuckDuckGo tests including empty registry
- `pkg/recon/sources/yandex.go` - Yandex XML Search API (user + key required, XML parsing)
- `pkg/recon/sources/yandex_test.go` - Yandex tests
- `pkg/recon/sources/brave.go` - Brave Search API (X-Subscription-Token)
- `pkg/recon/sources/brave_test.go` - Brave tests
- `pkg/recon/sources/queries.go` - Added google/bing/duckduckgo/yandex/brave formatQuery cases
## Decisions Made
- All five search sources use dork query format `site:pastebin.com OR site:github.com "keyword"` to focus results on leak-likely sites
- DuckDuckGo is the only credential-free source; uses HTML scraping with extractAnchorHrefs (shared with Replit)
- Yandex requires encoding/xml for its XML Search API response format
- extractGoogleKeyword reverse-parser reused across Bing/Yandex/Brave for keyword-to-provider name mapping
## Deviations from Plan
None - plan executed exactly as written.
## Issues Encountered
None.
## User Setup Required
None - no external service configuration required.
## Next Phase Readiness
- All five search engine sources ready for RegisterAll wiring in Plan 11-03
- Each source follows established ReconSource pattern for seamless engine integration
---
*Phase: 11-osint-search-paste*
*Completed: 2026-04-06*

View File

@@ -0,0 +1,199 @@
---
phase: 11-osint-search-paste
plan: 02
type: execute
wave: 1
depends_on: []
files_modified:
- pkg/recon/sources/pastebin.go
- pkg/recon/sources/pastebin_test.go
- pkg/recon/sources/gistpaste.go
- pkg/recon/sources/gistpaste_test.go
- pkg/recon/sources/pastesites.go
- pkg/recon/sources/pastesites_test.go
autonomous: true
requirements: [RECON-PASTE-01]
must_haves:
truths:
- "PastebinSource scrapes Pastebin search results and emits findings for pastes containing provider keywords"
- "GistPasteSource searches public GitHub Gists via unauthenticated scraping (distinct from Phase 10 GistSource which uses API)"
- "PasteSitesSource aggregates results from dpaste, paste.ee, rentry.co, ix.io, and similar sites"
- "All paste sources feed raw content through keyword matching against the provider registry"
- "Missing credentials disable sources that need them; credential-free sources are always enabled"
artifacts:
- path: "pkg/recon/sources/pastebin.go"
provides: "PastebinSource implementing recon.ReconSource"
contains: "func (s *PastebinSource) Sweep"
- path: "pkg/recon/sources/gistpaste.go"
provides: "GistPasteSource implementing recon.ReconSource"
contains: "func (s *GistPasteSource) Sweep"
- path: "pkg/recon/sources/pastesites.go"
provides: "PasteSitesSource implementing recon.ReconSource with multi-site sub-platform pattern"
contains: "func (s *PasteSitesSource) Sweep"
key_links:
- from: "pkg/recon/sources/pastebin.go"
to: "pkg/recon/sources/httpclient.go"
via: "sources.Client for HTTP with retry"
pattern: "client\\.Do"
- from: "pkg/recon/sources/pastesites.go"
to: "providers.Registry"
via: "keyword matching on paste content"
pattern: "keywordSet|BuildQueries"
---
<objective>
Implement three paste site ReconSource implementations: PastebinSource, GistPasteSource, and PasteSitesSource (multi-site aggregator for dpaste, paste.ee, rentry.co, ix.io, etc.).
Purpose: RECON-PASTE-01 -- detect API key leaks across public paste sites.
Output: Three source files + tests covering paste site scanning.
</objective>
<execution_context>
@$HOME/.claude/get-shit-done/workflows/execute-plan.md
@$HOME/.claude/get-shit-done/templates/summary.md
</execution_context>
<context>
@.planning/PROJECT.md
@.planning/ROADMAP.md
@.planning/STATE.md
@pkg/recon/source.go
@pkg/recon/sources/httpclient.go
@pkg/recon/sources/queries.go
@pkg/recon/sources/gist.go (reference: Phase 10 GistSource uses GitHub API -- this plan's GistPasteSource is a scraping alternative)
@pkg/recon/sources/replit.go (reference pattern for HTML scraping source)
@pkg/recon/sources/sandboxes.go (reference pattern for multi-platform aggregator)
<interfaces>
From pkg/recon/source.go:
```go
type ReconSource interface {
Name() string
RateLimit() rate.Limit
Burst() int
RespectsRobots() bool
Enabled(cfg Config) bool
Sweep(ctx context.Context, query string, out chan<- Finding) error
}
```
From pkg/recon/sources/httpclient.go:
```go
func NewClient() *Client
func (c *Client) Do(ctx context.Context, req *http.Request) (*http.Response, error)
```
From pkg/recon/sources/gist.go (existing Phase 10 GistSource -- avoid name collision):
```go
type GistSource struct { ... } // Name() == "gist" -- already taken
func (s *GistSource) keywordSet() map[string]string // pattern to reuse
```
</interfaces>
</context>
<tasks>
<task type="auto" tdd="true">
<name>Task 1: PastebinSource + GistPasteSource</name>
<files>pkg/recon/sources/pastebin.go, pkg/recon/sources/pastebin_test.go, pkg/recon/sources/gistpaste.go, pkg/recon/sources/gistpaste_test.go</files>
<behavior>
- PastebinSource.Name() == "pastebin"
- PastebinSource.RateLimit() == rate.Every(3*time.Second) (conservative -- Pastebin scraping)
- PastebinSource.Burst() == 1
- PastebinSource.RespectsRobots() == true (HTML scraper)
- PastebinSource.Enabled() always true (credential-free Google dorking of pastebin.com)
- PastebinSource.Sweep(): For each provider keyword, scrape Google (via the same DuckDuckGo HTML endpoint as a proxy to avoid Google ToS) with query `site:pastebin.com "{keyword}"`. Parse result links. For each pastebin.com URL found, fetch the raw paste content via /raw/{paste_id} endpoint, scan content for keyword matches, emit Finding with Source=paste URL, SourceType="recon:pastebin", ProviderName from match.
- GistPasteSource.Name() == "gistpaste" (not "gist" -- that's Phase 10's API source)
- GistPasteSource.RateLimit() == rate.Every(3*time.Second)
- GistPasteSource.RespectsRobots() == true (HTML scraper)
- GistPasteSource.Enabled() always true (credential-free)
- GistPasteSource.Sweep(): Scrape gist.github.com/search?q={keyword} (public search, no auth needed), parse HTML for gist links, fetch raw content, keyword-match against registry
</behavior>
<action>
Create `pkg/recon/sources/pastebin.go`:
- Struct: `PastebinSource` with BaseURL, Registry, Limiters, Client fields
- Name() "pastebin", RateLimit() Every(3s), Burst() 1, RespectsRobots() true
- Enabled() always true
- Sweep(): Use a two-phase approach:
Phase A: Search -- iterate BuildQueries(registry, "pastebin"). For each keyword, GET `{BaseURL}/search?q={url.QueryEscape(keyword)}` (Pastebin's own search). Parse HTML for paste links matching `^/[A-Za-z0-9]{8}$` pattern (Pastebin paste IDs are 8 alphanumeric chars). Collect unique paste IDs.
Phase B: Fetch+Scan -- for each paste ID: wait limiter, GET `{BaseURL}/raw/{pasteID}`, read body (limit 256KB), scan content against keywordSet() (same pattern as GistSource.keywordSet). If any keyword matches, emit Finding with Source=`{BaseURL}/{pasteID}`, SourceType="recon:pastebin", ProviderName from matched keyword.
- Helper: `pastebinKeywordSet(reg)` returning map[string]string (keyword -> provider name), same as GistSource pattern.
Create `pkg/recon/sources/gistpaste.go`:
- Struct: `GistPasteSource` with BaseURL, Registry, Limiters, Client fields
- Name() "gistpaste", RateLimit() Every(3s), Burst() 1, RespectsRobots() true
- Enabled() always true
- Sweep(): iterate BuildQueries(registry, "gistpaste"). For each keyword, GET `{BaseURL}/search?q={url.QueryEscape(keyword)}` (gist.github.com search). Parse HTML for gist links matching `^/[^/]+/[a-f0-9]+$` pattern. For each gist link, construct raw URL `{BaseURL}{gistPath}/raw` and fetch content (limit 256KB). Keyword-match and emit Finding with SourceType="recon:gistpaste".
Tests: httptest servers serving HTML search results + raw paste content fixtures. Verify findings emitted with correct SourceType, Source URL, and ProviderName.
</action>
<verify>
<automated>cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run "TestPastebin|TestGistPaste" -v -count=1</automated>
</verify>
<done>PastebinSource and GistPasteSource compile, pass all tests, handle ctx cancellation.</done>
</task>
<task type="auto" tdd="true">
<name>Task 2: PasteSitesSource (multi-paste aggregator)</name>
<files>pkg/recon/sources/pastesites.go, pkg/recon/sources/pastesites_test.go</files>
<behavior>
- PasteSitesSource.Name() == "pastesites"
- PasteSitesSource.RateLimit() == rate.Every(3*time.Second)
- PasteSitesSource.RespectsRobots() == true
- PasteSitesSource.Enabled() always true (all credential-free)
- PasteSitesSource.Sweep() iterates across sub-platforms: dpaste.org, paste.ee, rentry.co, ix.io, hastebin.com
- Each sub-platform has: Name, SearchURL pattern, result link regex, and optional raw URL construction
- Sweep emits at least one Finding per platform when fixture data matches keywords
- ctx cancellation stops the sweep promptly
</behavior>
<action>
Create `pkg/recon/sources/pastesites.go` following the SandboxesSource multi-platform pattern from pkg/recon/sources/sandboxes.go:
- Define `pastePlatform` struct: Name string, SearchPath string (with %s for query), ResultLinkRegex string, RawPathTemplate string (optional, for fetching raw content), IsJSON bool
- Default platforms:
1. dpaste: SearchPath="/search/?q=%s", result links matching `^/[A-Za-z0-9]+$`, raw via `/{id}/raw`
2. paste.ee: SearchPath="/search?q=%s", result links matching `^/p/[A-Za-z0-9]+$`, raw via `/r/{id}`
3. rentry.co: SearchPath="/search?q=%s", result links matching `^/[a-z0-9-]+$`, raw via `/{slug}/raw`
4. ix.io: No search -- skip (ix.io has no search). Remove from list.
5. hastebin: SearchPath="/search?q=%s", result links matching `^/[a-z]+$`, raw via `/raw/{id}`
- Struct: `PasteSitesSource` with Platforms []pastePlatform, BaseURL string (test override), Registry, Limiters, Client fields
- Name() "pastesites", RateLimit() Every(3s), Burst() 1, RespectsRobots() true
- Enabled() always true
- Sweep(): For each platform, for each keyword from BuildQueries(registry, "pastesites"):
1. Wait limiter
2. GET `{platform base or BaseURL}{searchPath with keyword}`
3. Parse HTML, extract result links matching platform regex
4. For each result link: wait limiter, GET raw content URL, read body (256KB limit), keyword-match against registry
5. Emit Finding with Source=paste URL, SourceType="recon:pastesites", ProviderName from keyword match
- Default platforms populated in a `defaultPastePlatforms()` function. Tests override Platforms to use httptest URLs.
Test: httptest mux serving search HTML + raw content for each sub-platform. Verify at least one Finding per platform fixture. Verify SourceType="recon:pastesites" on all.
</action>
<verify>
<automated>cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run "TestPasteSites" -v -count=1</automated>
</verify>
<done>PasteSitesSource aggregates across multiple paste sites, keyword-matches content, emits findings with correct SourceType.</done>
</task>
</tasks>
<verification>
All paste sources compile and pass unit tests:
```bash
cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run "TestPastebin|TestGistPaste|TestPasteSites" -v -count=1
```
</verification>
<success_criteria>
- 3 new source files exist (pastebin.go, gistpaste.go, pastesites.go) with tests
- Each implements recon.ReconSource with compile-time assertion
- PasteSitesSource covers 3+ paste sub-platforms
- Keyword matching uses provider Registry for ProviderName population
- All tests pass
</success_criteria>
<output>
After completion, create `.planning/phases/11-osint_search_paste/11-02-SUMMARY.md`
</output>

View File

@@ -0,0 +1,91 @@
---
phase: 11-osint-search-paste
plan: 02
subsystem: recon
tags: [pastebin, gist, paste-sites, scraping, osint]
requires:
- phase: 10-osint-code-hosting
provides: ReconSource interface, shared HTTP client, extractAnchorHrefs helper, BuildQueries
provides:
- PastebinSource for pastebin.com search+raw scanning
- GistPasteSource for gist.github.com unauthenticated search scraping
- PasteSitesSource multi-platform aggregator (dpaste, paste.ee, rentry, hastebin)
affects: [11-03, recon-registration, recon-engine]
tech-stack:
added: []
patterns: [two-phase search+raw-fetch for paste sources, multi-platform aggregator reuse from sandboxes]
key-files:
created:
- pkg/recon/sources/pastebin.go
- pkg/recon/sources/pastebin_test.go
- pkg/recon/sources/gistpaste.go
- pkg/recon/sources/gistpaste_test.go
- pkg/recon/sources/pastesites.go
- pkg/recon/sources/pastesites_test.go
modified: []
key-decisions:
- "Two-phase approach for all paste sources: search HTML for links, then fetch raw content and keyword-match"
- "PasteSitesSource reuses SandboxesSource multi-platform pattern with pastePlatform struct"
- "GistPasteSource named 'gistpaste' to avoid collision with Phase 10 GistSource ('gist')"
patterns-established:
- "Paste source pattern: search page -> extract links -> fetch raw -> keyword match -> emit finding"
requirements-completed: [RECON-PASTE-01]
duration: 5min
completed: 2026-04-06
---
# Phase 11 Plan 02: Paste Site Sources Summary
**Three paste site ReconSources implementing two-phase search+raw-fetch with keyword matching against provider registry**
## What Was Built
### PastebinSource (`pkg/recon/sources/pastebin.go`)
- Searches pastebin.com for provider keywords, extracts 8-char paste IDs from HTML
- Fetches `/raw/{pasteID}` content (256KB cap), matches against provider keyword set
- Emits findings with SourceType="recon:pastebin" and ProviderName from matched keyword
- Rate: Every(3s), Burst 1, credential-free, respects robots.txt
### GistPasteSource (`pkg/recon/sources/gistpaste.go`)
- Scrapes gist.github.com public search (no auth needed, distinct from Phase 10 API-based GistSource)
- Extracts gist links matching `/<user>/<hex-hash>` pattern, fetches `{gistPath}/raw`
- Keyword-matches raw content, emits findings with SourceType="recon:gistpaste"
- Rate: Every(3s), Burst 1, credential-free
### PasteSitesSource (`pkg/recon/sources/pastesites.go`)
- Multi-platform aggregator following SandboxesSource pattern
- Covers 4 paste sub-platforms: dpaste.org, paste.ee, rentry.co, hastebin.com
- Each platform has configurable SearchPath, ResultLinkRegex, and RawPathTemplate
- Per-platform error isolation: failures logged and skipped without aborting others
- Findings tagged with `platform=<name>` in KeyMasked field
## Test Coverage
9 tests total across 3 test files:
- Sweep with httptest fixtures verifying finding extraction and keyword matching
- Name/rate/burst/robots/enabled metadata assertions
- Context cancellation handling
## Deviations from Plan
None - plan executed exactly as written.
## Commits
| Task | Commit | Description |
|------|--------|-------------|
| 1 | 3c500b5 | PastebinSource + GistPasteSource with tests |
| 2 | ed148d4 | PasteSitesSource multi-paste aggregator with tests |
## Self-Check: PASSED
All 7 files found. Both commit hashes verified in git log.

View File

@@ -0,0 +1,221 @@
---
phase: 11-osint-search-paste
plan: 03
type: execute
wave: 2
depends_on: ["11-01", "11-02"]
files_modified:
- pkg/recon/sources/register.go
- pkg/recon/sources/register_test.go
- pkg/recon/sources/integration_test.go
- cmd/recon.go
autonomous: true
requirements: [RECON-DORK-01, RECON-DORK-02, RECON-DORK-03, RECON-PASTE-01]
must_haves:
truths:
- "RegisterAll wires all 8 new Phase 11 sources onto the recon engine alongside the 10 Phase 10 sources"
- "cmd/recon.go reads Google/Bing/Yandex/Brave API keys from env vars and viper config"
- "keyhunter recon list shows all 18 sources (10 Phase 10 + 8 Phase 11)"
- "Integration test with httptest fixtures proves SweepAll emits findings from all 18 source types"
- "Sources with missing credentials are registered but Enabled()==false"
artifacts:
- path: "pkg/recon/sources/register.go"
provides: "RegisterAll extended with Phase 11 sources"
contains: "GoogleDorkSource"
- path: "pkg/recon/sources/register_test.go"
provides: "Guardrail test asserting 18 sources registered"
contains: "18"
- path: "pkg/recon/sources/integration_test.go"
provides: "SweepAll integration test covering all 18 sources"
contains: "recon:google"
- path: "cmd/recon.go"
provides: "Credential wiring for search engine API keys"
contains: "GoogleAPIKey"
key_links:
- from: "pkg/recon/sources/register.go"
to: "pkg/recon/sources/google.go"
via: "RegisterAll calls engine.Register(GoogleDorkSource)"
pattern: "GoogleDorkSource"
- from: "cmd/recon.go"
to: "pkg/recon/sources/register.go"
via: "SourcesConfig credential fields"
pattern: "GoogleAPIKey|GoogleCX|BingAPIKey|YandexUser|YandexAPIKey|BraveAPIKey"
---
<objective>
Wire all 8 Phase 11 sources into RegisterAll, extend SourcesConfig with search engine credentials, update cmd/recon.go for env/viper credential lookup, and create the integration test proving all 18 sources work end-to-end via SweepAll.
Purpose: Complete Phase 11 by connecting all new sources to the engine and proving the full 18-source sweep works.
Output: Updated register.go, register_test.go, integration_test.go, cmd/recon.go.
</objective>
<execution_context>
@$HOME/.claude/get-shit-done/workflows/execute-plan.md
@$HOME/.claude/get-shit-done/templates/summary.md
</execution_context>
<context>
@.planning/PROJECT.md
@.planning/ROADMAP.md
@.planning/STATE.md
@pkg/recon/sources/register.go
@pkg/recon/sources/register_test.go
@pkg/recon/sources/integration_test.go
@cmd/recon.go
<interfaces>
From pkg/recon/sources/register.go (current):
```go
type SourcesConfig struct {
GitHubToken string
GitLabToken string
BitbucketToken string
BitbucketWorkspace string
CodebergToken string
HuggingFaceToken string
KaggleUser string
KaggleKey string
Registry *providers.Registry
Limiters *recon.LimiterRegistry
}
func RegisterAll(engine *recon.Engine, cfg SourcesConfig)
```
From cmd/recon.go (current):
```go
func buildReconEngine() *recon.Engine // constructs SourcesConfig, calls RegisterAll
func firstNonEmpty(a, b string) string
```
New sources from Plan 11-01 (to be registered):
```go
type GoogleDorkSource struct { APIKey, CX, BaseURL string; Registry; Limiters; client }
type BingDorkSource struct { APIKey, BaseURL string; Registry; Limiters; client }
type DuckDuckGoSource struct { BaseURL string; Registry; Limiters; Client }
type YandexSource struct { User, APIKey, BaseURL string; Registry; Limiters; client }
type BraveSource struct { APIKey, BaseURL string; Registry; Limiters; client }
```
New sources from Plan 11-02 (to be registered):
```go
type PastebinSource struct { BaseURL string; Registry; Limiters; Client }
type GistPasteSource struct { BaseURL string; Registry; Limiters; Client }
type PasteSitesSource struct { Platforms; BaseURL string; Registry; Limiters; Client }
```
</interfaces>
</context>
<tasks>
<task type="auto" tdd="true">
<name>Task 1: Extend SourcesConfig + RegisterAll + cmd/recon.go credential wiring</name>
<files>pkg/recon/sources/register.go, pkg/recon/sources/register_test.go, cmd/recon.go</files>
<behavior>
- SourcesConfig gains 6 new fields: GoogleAPIKey, GoogleCX, BingAPIKey, YandexUser, YandexAPIKey, BraveAPIKey
- RegisterAll registers 18 sources total (10 Phase 10 + 8 Phase 11)
- RegisterAll with nil engine is still a no-op
- TestRegisterAll_WiresAllEighteenSources asserts eng.List() contains all 18 names sorted
- TestRegisterAll_MissingCredsStillRegistered asserts 18 sources with empty config
- buildReconEngine reads: GOOGLE_API_KEY / recon.google.api_key, GOOGLE_CX / recon.google.cx, BING_API_KEY / recon.bing.api_key, YANDEX_USER / recon.yandex.user, YANDEX_API_KEY / recon.yandex.api_key, BRAVE_API_KEY / recon.brave.api_key
- reconCmd Long description updated to mention Phase 11 sources
</behavior>
<action>
Update `pkg/recon/sources/register.go`:
- Add to SourcesConfig: GoogleAPIKey, GoogleCX, BingAPIKey, YandexUser, YandexAPIKey, BraveAPIKey (all string)
- Add Phase 11 registrations to RegisterAll after the Phase 10 block:
```
// Phase 11: Search engine dorking sources.
engine.Register(&GoogleDorkSource{APIKey: cfg.GoogleAPIKey, CX: cfg.GoogleCX, Registry: reg, Limiters: lim})
engine.Register(&BingDorkSource{APIKey: cfg.BingAPIKey, Registry: reg, Limiters: lim})
engine.Register(&DuckDuckGoSource{Registry: reg, Limiters: lim})
engine.Register(&YandexSource{User: cfg.YandexUser, APIKey: cfg.YandexAPIKey, Registry: reg, Limiters: lim})
engine.Register(&BraveSource{APIKey: cfg.BraveAPIKey, Registry: reg, Limiters: lim})
// Phase 11: Paste site sources.
engine.Register(&PastebinSource{Registry: reg, Limiters: lim})
engine.Register(&GistPasteSource{Registry: reg, Limiters: lim})
engine.Register(&PasteSitesSource{Registry: reg, Limiters: lim})
```
- Update doc comment on RegisterAll to say "Phase 10 + Phase 11" and total "18 sources"
Update `pkg/recon/sources/register_test.go`:
- TestRegisterAll_WiresAllEighteenSources: want list = sorted 18 names: ["bing", "bitbucket", "brave", "codeberg", "codesandbox", "duckduckgo", "gist", "gistpaste", "github", "gitlab", "google", "huggingface", "kaggle", "pastebin", "pastesites", "replit", "sandboxes", "yandex"]
- TestRegisterAll_MissingCredsStillRegistered: assert n == 18
Update `cmd/recon.go`:
- Add to SourcesConfig construction in buildReconEngine():
GoogleAPIKey: firstNonEmpty(os.Getenv("GOOGLE_API_KEY"), viper.GetString("recon.google.api_key")),
GoogleCX: firstNonEmpty(os.Getenv("GOOGLE_CX"), viper.GetString("recon.google.cx")),
BingAPIKey: firstNonEmpty(os.Getenv("BING_API_KEY"), viper.GetString("recon.bing.api_key")),
YandexUser: firstNonEmpty(os.Getenv("YANDEX_USER"), viper.GetString("recon.yandex.user")),
YandexAPIKey: firstNonEmpty(os.Getenv("YANDEX_API_KEY"), viper.GetString("recon.yandex.api_key")),
BraveAPIKey: firstNonEmpty(os.Getenv("BRAVE_API_KEY"), viper.GetString("recon.brave.api_key")),
- Update reconCmd.Long to list Phase 11 sources
</action>
<verify>
<automated>cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run "TestRegisterAll" -v -count=1 && go build ./cmd/...</automated>
</verify>
<done>RegisterAll registers 18 sources. cmd/recon.go compiles with credential wiring. Guardrail tests pass.</done>
</task>
<task type="auto" tdd="true">
<name>Task 2: Integration test -- SweepAll across all 18 sources</name>
<files>pkg/recon/sources/integration_test.go</files>
<behavior>
- TestIntegration_AllSources_SweepAll registers all 18 sources with BaseURL overrides pointing at an httptest mux
- SweepAll returns findings from all 18 SourceType values
- Each SourceType (recon:github, recon:gitlab, ..., recon:google, recon:bing, recon:duckduckgo, recon:yandex, recon:brave, recon:pastebin, recon:gistpaste, recon:pastesites) has at least 1 finding
</behavior>
<action>
Update `pkg/recon/sources/integration_test.go`:
- Extend the existing httptest mux with handlers for the 8 new sources:
Google Custom Search: mux.HandleFunc("/customsearch/v1", ...) serves JSON `{"items":[{"link":"https://pastebin.com/abc123","title":"leak","snippet":"sk-proj-xxx"}]}`
Bing Web Search: mux.HandleFunc("/v7.0/search", ...) serves JSON `{"webPages":{"value":[{"url":"https://example.com/leak","name":"leak"}]}}`
DuckDuckGo HTML: mux.HandleFunc("/html/", ...) serves HTML with `<a class="result__a" href="https://example.com/ddg-leak">result</a>`
Yandex XML: mux.HandleFunc("/search/xml", ...) serves XML `<yandexsearch><response><results><grouping><group><doc><url>https://example.com/yandex-leak</url></doc></group></grouping></results></response></yandexsearch>`
Brave Search: mux.HandleFunc("/res/v1/web/search", ...) serves JSON `{"web":{"results":[{"url":"https://example.com/brave-leak","title":"leak"}]}}`
Pastebin search + raw: mux.HandleFunc("/pastebin-search", ...) serves HTML with paste links; mux.HandleFunc("/pastebin-raw/", ...) serves raw content with "sk-proj-ABC"
GistPaste search + raw: mux.HandleFunc("/gistpaste-search", ...) serves HTML with gist links; mux.HandleFunc("/gistpaste-raw/", ...) serves raw content with keyword
PasteSites: mux.HandleFunc("/pastesites-search", ...) + mux.HandleFunc("/pastesites-raw/", ...) similar pattern
Register all 18 sources on the engine with BaseURL=srv.URL, appropriate credentials for API sources (fake tokens). Then call eng.SweepAll and assert byType map has all 18 SourceType keys.
Update wantTypes to include: "recon:google", "recon:bing", "recon:duckduckgo", "recon:yandex", "recon:brave", "recon:pastebin", "recon:gistpaste", "recon:pastesites"
Keep the existing 10 Phase 10 source fixtures and registrations intact.
</action>
<verify>
<automated>cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run "TestIntegration_AllSources" -v -count=1 -timeout=60s</automated>
</verify>
<done>Integration test proves SweepAll emits findings from all 18 sources. Full Phase 11 wiring confirmed end-to-end.</done>
</task>
</tasks>
<verification>
Full Phase 11 verification:
```bash
cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -v -count=1 -timeout=120s && go build ./cmd/...
```
</verification>
<success_criteria>
- RegisterAll registers 18 sources (10 Phase 10 + 8 Phase 11)
- cmd/recon.go compiles with all credential wiring
- Integration test passes with all 18 SourceTypes emitting findings
- `go build ./cmd/...` succeeds
- Guardrail test asserts exact 18-source name list
</success_criteria>
<output>
After completion, create `.planning/phases/11-osint_search_paste/11-03-SUMMARY.md`
</output>

View File

@@ -0,0 +1,99 @@
---
phase: 11-osint-search-paste
plan: 03
subsystem: recon
tags: [register-all, wiring, integration-test, credentials, search-engines, paste-sites]
requires:
- phase: 11-osint-search-paste
provides: GoogleDorkSource, BingDorkSource, DuckDuckGoSource, YandexSource, BraveSource (Plan 01)
- phase: 11-osint-search-paste
provides: PastebinSource, GistPasteSource, PasteSitesSource (Plan 02)
- phase: 10-osint-code-hosting
provides: RegisterAll, SourcesConfig, buildReconEngine, 10 Phase 10 sources
provides:
- RegisterAll extended to wire all 18 sources (Phase 10 + Phase 11)
- SourcesConfig with Google/Bing/Yandex/Brave credential fields
- cmd/recon.go credential wiring from env vars and viper config
- Integration test proving SweepAll across all 18 sources
affects: [12-osint-iot-cloud-storage, recon-registration, recon-engine]
tech-stack:
added: []
patterns: [per-source BaseURL prefix in integration tests to avoid path collisions]
key-files:
created: []
modified:
- pkg/recon/sources/register.go
- pkg/recon/sources/register_test.go
- pkg/recon/sources/integration_test.go
- cmd/recon.go
key-decisions:
- "Paste sources use BaseURL prefix (/pb/, /gp/) in integration test to avoid /search path collision with Replit/CodeSandbox"
- "PasteSites uses injected test platform in integration test, same pattern as SandboxesSource"
patterns-established:
- "Integration test BaseURL prefix pattern for sources sharing HTTP paths"
requirements-completed: [RECON-DORK-01, RECON-DORK-02, RECON-DORK-03, RECON-PASTE-01]
duration: 6min
completed: 2026-04-06
---
# Phase 11 Plan 03: RegisterAll Wiring + Integration Test Summary
**RegisterAll extended to 18 sources with search engine credential wiring and full SweepAll integration test**
## Performance
- **Duration:** 6 min
- **Started:** 2026-04-06T09:00:51Z
- **Completed:** 2026-04-06T09:06:34Z
- **Tasks:** 2
- **Files modified:** 4
## Accomplishments
- Extended SourcesConfig with 6 new credential fields (GoogleAPIKey, GoogleCX, BingAPIKey, YandexUser, YandexAPIKey, BraveAPIKey)
- RegisterAll now registers all 18 sources (10 Phase 10 + 8 Phase 11) unconditionally
- cmd/recon.go reads search engine API keys from env vars with viper config fallback
- Integration test proves SweepAll emits findings from all 18 SourceTypes via httptest fixtures
## Task Commits
Each task was committed atomically:
1. **Task 1: Extend SourcesConfig + RegisterAll + cmd/recon.go credential wiring** - `3250408` (feat)
2. **Task 2: Integration test -- SweepAll across all 18 sources** - `bebc3e7` (test)
## Files Created/Modified
- `pkg/recon/sources/register.go` - Extended SourcesConfig and RegisterAll with Phase 11 sources
- `pkg/recon/sources/register_test.go` - Guardrail tests updated to assert 18 sources
- `pkg/recon/sources/integration_test.go` - SweepAll integration test covering all 18 sources
- `cmd/recon.go` - Credential wiring for Google/Bing/Yandex/Brave API keys
## Decisions Made
- Paste sources use BaseURL prefix in integration test to avoid /search path collision with existing Replit/CodeSandbox handlers
- PasteSites uses injected test platform (same pattern as SandboxesSource) rather than default production platforms
## Deviations from Plan
None - plan executed exactly as written.
## Issues Encountered
None
## User Setup Required
None - no external service configuration required.
## Next Phase Readiness
- Phase 11 complete: all 18 OSINT sources (10 code-hosting + 5 search engine + 3 paste site) wired and tested
- Ready for Phase 12 (IoT/cloud storage sources) which will extend RegisterAll further
---
*Phase: 11-osint-search-paste*
*Completed: 2026-04-06*

View File

@@ -0,0 +1,42 @@
# Phase 11: OSINT Search Engines & Paste Sites - Context
**Gathered:** 2026-04-06
**Status:** Ready for planning
**Mode:** Auto-generated
<domain>
## Phase Boundary
Adds ReconSource implementations for public search engine dorking (Google, Bing, DuckDuckGo, Yandex, Brave) and paste site scraping (Pastebin, GitHub Gist, Ghostbin, Rentry, ControlC) to detect leaked API keys across indexed web pages and public pastes.
</domain>
<decisions>
## Implementation Decisions
### Claude's Discretion
All implementation choices are at Claude's discretion. Follow the established Phase 10 pattern: each source implements recon.ReconSource, uses pkg/recon/sources/httpclient.go for HTTP, uses httptest for tests. Each source goes in its own file.
</decisions>
<code_context>
## Existing Code Insights
### Reusable Assets
- pkg/recon/sources/ — established source implementation pattern from Phase 10
- pkg/recon/sources/httpclient.go — shared retry HTTP client
- pkg/recon/sources/register.go — RegisterAll (extend per phase)
- pkg/recon/source.go — ReconSource interface
</code_context>
<specifics>
## Specific Ideas
- GoogleDorkSource — search engine dorking via Google search
- BingDorkSource — search engine dorking via Bing search
- DuckDuckGoSource — search via DuckDuckGo
- YandexSource — search via Yandex
- BraveSource — search via Brave Search API
- PastebinSource — scrape/search Pastebin for leaked keys
- GistSource — GitHub Gist paste aggregator for public gists
- GhostbinSource / RentrySource / ControlCSource — alternative paste site scrapers
</specifics>
<deferred>
## Deferred Ideas
None — straightforward source implementations.
</deferred>

View File

@@ -0,0 +1,44 @@
# Phase 12: OSINT IoT/Device Search & Cloud Storage - Context
**Gathered:** 2026-04-06
**Status:** Ready for planning
**Mode:** Auto-generated
<domain>
## Phase Boundary
Adds ReconSource implementations for internet-facing device search engines (Shodan, Censys, ZoomEye, FOFA, Netlas, BinaryEdge) and public cloud storage bucket scanners (AWS S3, GCS, Azure Blob, DigitalOcean Spaces) to find API keys exposed in device banners, configs, and misconfigured storage buckets.
</domain>
<decisions>
## Implementation Decisions
### Claude's Discretion
All implementation choices are at Claude's discretion. Follow the established Phase 10 pattern: each source implements recon.ReconSource, uses pkg/recon/sources/httpclient.go for HTTP, uses httptest for tests. Each source goes in its own file.
</decisions>
<code_context>
## Existing Code Insights
### Reusable Assets
- pkg/recon/sources/ — established source implementation pattern from Phase 10
- pkg/recon/sources/httpclient.go — shared retry HTTP client
- pkg/recon/sources/register.go — RegisterAll (extend per phase)
- pkg/recon/source.go — ReconSource interface
</code_context>
<specifics>
## Specific Ideas
- ShodanSource — search Shodan for exposed API keys in banners/configs
- CensysSource — search Censys for exposed services leaking keys
- ZoomEyeSource — search ZoomEye for device/service key exposure
- FOFASource — search FOFA for exposed endpoints with keys
- NetlasSource — search Netlas for internet-wide scan results
- BinaryEdgeSource — search BinaryEdge for exposed services
- S3Scanner — scan publicly accessible AWS S3 buckets for key files
- GCSScanner — scan publicly accessible Google Cloud Storage buckets
- AzureBlobScanner — scan publicly accessible Azure Blob containers
- DigitalOceanSpaces — scan publicly accessible DO Spaces
</specifics>
<deferred>
## Deferred Ideas
None — straightforward source implementations.
</deferred>

View File

@@ -0,0 +1,45 @@
# Phase 13: OSINT Package Registries, Containers & IaC - Context
**Gathered:** 2026-04-06
**Status:** Ready for planning
**Mode:** Auto-generated
<domain>
## Phase Boundary
Adds ReconSource implementations for package registry searches (npm, PyPI, Crates.io, RubyGems, Maven, NuGet, Go Proxy), container image inspection (Docker Hub, Docker Compose files), and infrastructure-as-code sources (Kubernetes configs, Terraform Registry) to detect API keys embedded in published packages, images, and IaC definitions.
</domain>
<decisions>
## Implementation Decisions
### Claude's Discretion
All implementation choices are at Claude's discretion. Follow the established Phase 10 pattern: each source implements recon.ReconSource, uses pkg/recon/sources/httpclient.go for HTTP, uses httptest for tests. Each source goes in its own file.
</decisions>
<code_context>
## Existing Code Insights
### Reusable Assets
- pkg/recon/sources/ — established source implementation pattern from Phase 10
- pkg/recon/sources/httpclient.go — shared retry HTTP client
- pkg/recon/sources/register.go — RegisterAll (extend per phase)
- pkg/recon/source.go — ReconSource interface
</code_context>
<specifics>
## Specific Ideas
- NpmSource — search npm registry for packages leaking API keys
- PyPISource — search PyPI for packages with embedded keys
- CratesIOSource — search Crates.io for Rust packages with key leaks
- RubyGemsSource — search RubyGems for gems with exposed keys
- MavenSource — search Maven Central for Java artifacts with keys
- NuGetSource — search NuGet for .NET packages with key exposure
- GoProxySource — search Go module proxy for modules with keys
- ComposeSource — scan Docker Compose files for hardcoded keys
- DockerHubSource — inspect public Docker Hub images for embedded keys
- KubernetesConfigSource — scan public Kubernetes configs/manifests for secrets
- TerraformRegistrySource — search Terraform Registry modules for leaked keys
</specifics>
<deferred>
## Deferred Ideas
None — straightforward source implementations.
</deferred>

View File

@@ -0,0 +1,45 @@
# Phase 14: OSINT CI/CD Logs, Web Archives & Frontend Leaks - Context
**Gathered:** 2026-04-06
**Status:** Ready for planning
**Mode:** Auto-generated
<domain>
## Phase Boundary
Adds ReconSource implementations for CI/CD build log scraping (Travis CI, GitHub Actions, CircleCI, Jenkins), web archive searching (Wayback Machine, Common Crawl), and frontend asset analysis (JS bundles, source maps, env file leaks, Webpack/Next.js builds) to detect API keys leaked in build outputs, archived pages, and client-side code.
</domain>
<decisions>
## Implementation Decisions
### Claude's Discretion
All implementation choices are at Claude's discretion. Follow the established Phase 10 pattern: each source implements recon.ReconSource, uses pkg/recon/sources/httpclient.go for HTTP, uses httptest for tests. Each source goes in its own file.
</decisions>
<code_context>
## Existing Code Insights
### Reusable Assets
- pkg/recon/sources/ — established source implementation pattern from Phase 10
- pkg/recon/sources/httpclient.go — shared retry HTTP client
- pkg/recon/sources/register.go — RegisterAll (extend per phase)
- pkg/recon/source.go — ReconSource interface
</code_context>
<specifics>
## Specific Ideas
- TravisCISource — scrape public Travis CI build logs for leaked keys
- GitHubActionsSource — search GitHub Actions workflow logs for key exposure
- CircleCISource — scrape public CircleCI build logs
- JenkinsSource — scrape publicly accessible Jenkins build consoles
- WaybackMachineSource — search Wayback Machine snapshots for historical key leaks
- CommonCrawlSource — search Common Crawl index for pages containing keys
- JSBundleSource — analyze public JavaScript bundles for embedded API keys
- SourceMapSource — parse source maps to recover original source with keys
- EnvLeakSource — detect publicly accessible .env files on web servers
- WebpackSource — analyze Webpack chunk manifests for key exposure
- NextJSSource — analyze Next.js build artifacts for leaked server-side keys
</specifics>
<deferred>
## Deferred Ideas
None — straightforward source implementations.
</deferred>

View File

@@ -0,0 +1,47 @@
# Phase 15: OSINT Forums, Collaboration Tools & Log Aggregators - Context
**Gathered:** 2026-04-06
**Status:** Ready for planning
**Mode:** Auto-generated
<domain>
## Phase Boundary
Adds ReconSource implementations for developer forums (Stack Overflow, Reddit, Hacker News), collaboration platforms (Discord, Slack, Trello, Notion, Confluence), and log/monitoring aggregators (Elasticsearch, Grafana, Sentry, Kibana, Splunk) to detect API keys shared in public discussions, workspace leaks, and exposed logging dashboards.
</domain>
<decisions>
## Implementation Decisions
### Claude's Discretion
All implementation choices are at Claude's discretion. Follow the established Phase 10 pattern: each source implements recon.ReconSource, uses pkg/recon/sources/httpclient.go for HTTP, uses httptest for tests. Each source goes in its own file.
</decisions>
<code_context>
## Existing Code Insights
### Reusable Assets
- pkg/recon/sources/ — established source implementation pattern from Phase 10
- pkg/recon/sources/httpclient.go — shared retry HTTP client
- pkg/recon/sources/register.go — RegisterAll (extend per phase)
- pkg/recon/source.go — ReconSource interface
</code_context>
<specifics>
## Specific Ideas
- StackOverflowSource — search Stack Overflow posts/answers for leaked keys
- RedditSource — search Reddit posts/comments for key exposure
- HackerNewsSource — search Hacker News submissions/comments for keys
- DiscordSource — search public Discord servers/channels for leaked keys
- SlackSource — search publicly indexed Slack messages for keys
- TrelloSource — search public Trello boards for exposed credentials
- NotionSource — search publicly shared Notion pages for keys
- ConfluenceSource — search publicly accessible Confluence wikis for keys
- ElasticsearchSource — search exposed Elasticsearch instances for key data
- GrafanaSource — search publicly accessible Grafana dashboards for keys
- SentrySource — search exposed Sentry instances for leaked keys in error reports
- KibanaSource — search publicly accessible Kibana dashboards for key data
- SplunkSource — search exposed Splunk instances for key leaks in logs
</specifics>
<deferred>
## Deferred Ideas
None — straightforward source implementations.
</deferred>

View File

@@ -0,0 +1,43 @@
# Phase 16: OSINT Threat Intel, Mobile, DNS & API Marketplaces - Context
**Gathered:** 2026-04-06
**Status:** Ready for planning
**Mode:** Auto-generated
<domain>
## Phase Boundary
Adds ReconSource implementations for threat intelligence platforms (VirusTotal, IntelligenceX, URLScan), mobile app analysis (APKMirror), DNS/certificate transparency (crt.sh, SecurityTrails), and API marketplaces/documentation hubs (Postman, SwaggerHub, RapidAPI) to detect API keys exposed in threat feeds, mobile binaries, certificate records, and public API collections.
</domain>
<decisions>
## Implementation Decisions
### Claude's Discretion
All implementation choices are at Claude's discretion. Follow the established Phase 10 pattern: each source implements recon.ReconSource, uses pkg/recon/sources/httpclient.go for HTTP, uses httptest for tests. Each source goes in its own file.
</decisions>
<code_context>
## Existing Code Insights
### Reusable Assets
- pkg/recon/sources/ — established source implementation pattern from Phase 10
- pkg/recon/sources/httpclient.go — shared retry HTTP client
- pkg/recon/sources/register.go — RegisterAll (extend per phase)
- pkg/recon/source.go — ReconSource interface
</code_context>
<specifics>
## Specific Ideas
- VirusTotalSource — search VirusTotal for samples/URLs containing API keys
- IntelligenceXSource — search IntelligenceX archives for leaked credentials
- URLScanSource — search urlscan.io scan results for exposed keys
- APKMirrorSource — download and analyze APK files for embedded API keys
- CrtShSource — search crt.sh certificate transparency logs for key-related domains
- SecurityTrailsSource — search SecurityTrails DNS/historical data for key exposure
- PostmanSource — search public Postman collections/workspaces for API keys
- SwaggerHubSource — search public SwaggerHub API definitions for embedded keys
- RapidAPISource — search RapidAPI public listings for exposed credentials
</specifics>
<deferred>
## Deferred Ideas
None — straightforward source implementations.
</deferred>

View File

@@ -26,7 +26,7 @@ var (
var reconCmd = &cobra.Command{
Use: "recon",
Short: "Run OSINT recon across internet sources",
Long: "Run OSINT recon sweeps across registered sources. Phase 10 adds ten code-hosting sources (GitHub/GitLab/Bitbucket/Gist/Codeberg/HuggingFace/Replit/CodeSandbox/Sandboxes/Kaggle). Further phases add pastebins, search engines, etc.",
Long: "Run OSINT recon sweeps across registered sources. Phase 10 adds ten code-hosting sources (GitHub/GitLab/Bitbucket/Gist/Codeberg/HuggingFace/Replit/CodeSandbox/Sandboxes/Kaggle). Phase 11 adds search engine dorking (Google/Bing/DuckDuckGo/Yandex/Brave) and paste site scanning (Pastebin/GistPaste/PasteSites).",
}
var reconFullCmd = &cobra.Command{
@@ -153,6 +153,12 @@ func buildReconEngine() *recon.Engine {
HuggingFaceToken: firstNonEmpty(os.Getenv("HUGGINGFACE_TOKEN"), viper.GetString("recon.huggingface.token")),
KaggleUser: firstNonEmpty(os.Getenv("KAGGLE_USERNAME"), viper.GetString("recon.kaggle.username")),
KaggleKey: firstNonEmpty(os.Getenv("KAGGLE_KEY"), viper.GetString("recon.kaggle.key")),
GoogleAPIKey: firstNonEmpty(os.Getenv("GOOGLE_API_KEY"), viper.GetString("recon.google.api_key")),
GoogleCX: firstNonEmpty(os.Getenv("GOOGLE_CX"), viper.GetString("recon.google.cx")),
BingAPIKey: firstNonEmpty(os.Getenv("BING_API_KEY"), viper.GetString("recon.bing.api_key")),
YandexUser: firstNonEmpty(os.Getenv("YANDEX_USER"), viper.GetString("recon.yandex.user")),
YandexAPIKey: firstNonEmpty(os.Getenv("YANDEX_API_KEY"), viper.GetString("recon.yandex.api_key")),
BraveAPIKey: firstNonEmpty(os.Getenv("BRAVE_API_KEY"), viper.GetString("recon.brave.api_key")),
}
sources.RegisterAll(e, cfg)
return e

155
pkg/recon/sources/bing.go Normal file
View File

@@ -0,0 +1,155 @@
package sources
import (
"context"
"encoding/json"
"errors"
"fmt"
"net/http"
"net/url"
"strings"
"time"
"golang.org/x/time/rate"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
// BingDorkSource implements recon.ReconSource against the Bing Web Search
// API v7. It iterates provider keyword queries and emits a Finding per result.
//
// A missing API key disables the source without error.
type BingDorkSource struct {
APIKey string
BaseURL string
Registry *providers.Registry
Limiters *recon.LimiterRegistry
client *Client
}
// Compile-time assertion.
var _ recon.ReconSource = (*BingDorkSource)(nil)
// NewBingDorkSource constructs a BingDorkSource with the shared retry client.
func NewBingDorkSource(apiKey string, reg *providers.Registry, lim *recon.LimiterRegistry) *BingDorkSource {
return &BingDorkSource{
APIKey: apiKey,
BaseURL: "https://api.bing.microsoft.com",
Registry: reg,
Limiters: lim,
client: NewClient(),
}
}
func (s *BingDorkSource) Name() string { return "bing" }
func (s *BingDorkSource) RateLimit() rate.Limit { return rate.Every(500 * time.Millisecond) }
func (s *BingDorkSource) Burst() int { return 2 }
func (s *BingDorkSource) RespectsRobots() bool { return false }
// Enabled returns true only when APIKey is configured.
func (s *BingDorkSource) Enabled(_ recon.Config) bool { return s.APIKey != "" }
// Sweep issues one Bing Web Search request per provider keyword and emits a
// Finding for every webPages.value result.
func (s *BingDorkSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
if s.APIKey == "" {
return nil
}
base := s.BaseURL
if base == "" {
base = "https://api.bing.microsoft.com"
}
queries := BuildQueries(s.Registry, "bing")
kwIndex := bingKeywordIndex(s.Registry)
for _, q := range queries {
if err := ctx.Err(); err != nil {
return err
}
if s.Limiters != nil {
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
return err
}
}
endpoint := fmt.Sprintf("%s/v7.0/search?q=%s&count=50", base, url.QueryEscape(q))
req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
if err != nil {
return fmt.Errorf("bing: build request: %w", err)
}
req.Header.Set("Ocp-Apim-Subscription-Key", s.APIKey)
req.Header.Set("Accept", "application/json")
req.Header.Set("User-Agent", "keyhunter-recon")
resp, err := s.client.Do(ctx, req)
if err != nil {
if errors.Is(err, ErrUnauthorized) {
return err
}
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
return err
}
continue
}
var parsed bingSearchResponse
decErr := json.NewDecoder(resp.Body).Decode(&parsed)
_ = resp.Body.Close()
if decErr != nil {
continue
}
provName := kwIndex[strings.ToLower(extractGoogleKeyword(q))]
for _, it := range parsed.WebPages.Value {
f := recon.Finding{
ProviderName: provName,
Confidence: "low",
Source: it.URL,
SourceType: "recon:bing",
DetectedAt: time.Now(),
}
select {
case out <- f:
case <-ctx.Done():
return ctx.Err()
}
}
}
return nil
}
type bingSearchResponse struct {
WebPages bingWebPages `json:"webPages"`
}
type bingWebPages struct {
Value []bingWebResult `json:"value"`
}
type bingWebResult struct {
Name string `json:"name"`
URL string `json:"url"`
Snippet string `json:"snippet"`
}
// bingKeywordIndex maps lowercased keywords to provider names.
func bingKeywordIndex(reg *providers.Registry) map[string]string {
m := make(map[string]string)
if reg == nil {
return m
}
for _, p := range reg.List() {
for _, k := range p.Keywords {
kl := strings.ToLower(strings.TrimSpace(k))
if kl == "" {
continue
}
if _, exists := m[kl]; !exists {
m[kl] = p.Name
}
}
}
return m
}

View File

@@ -0,0 +1,146 @@
package sources
import (
"context"
"encoding/json"
"errors"
"net/http"
"net/http/httptest"
"strings"
"sync/atomic"
"testing"
"time"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
func bingStubHandler(t *testing.T, calls *int32) http.HandlerFunc {
t.Helper()
return func(w http.ResponseWriter, r *http.Request) {
atomic.AddInt32(calls, 1)
if !strings.HasPrefix(r.URL.Path, "/v7.0/search") {
t.Errorf("unexpected path: %s", r.URL.Path)
}
if got := r.Header.Get("Ocp-Apim-Subscription-Key"); got != "testkey" {
t.Errorf("missing subscription key header: %q", got)
}
body := map[string]any{
"webPages": map[string]any{
"value": []map[string]any{
{"name": "result1", "url": "https://pastebin.com/xyz789", "snippet": "found"},
{"name": "result2", "url": "https://github.com/user/repo/blob/main/.env", "snippet": "key"},
{"name": "result3", "url": "https://example.com/leak", "snippet": "data"},
},
},
}
w.Header().Set("Content-Type", "application/json")
_ = json.NewEncoder(w).Encode(body)
}
}
func TestBingDorkSource_EnabledRequiresAPIKey(t *testing.T) {
reg := syntheticRegistry()
lim := recon.NewLimiterRegistry()
if s := NewBingDorkSource("", reg, lim); s.Enabled(recon.Config{}) {
t.Error("expected Enabled=false with empty key")
}
if s := NewBingDorkSource("key", reg, lim); !s.Enabled(recon.Config{}) {
t.Error("expected Enabled=true with key")
}
}
func TestBingDorkSource_SweepEmptyKeyReturnsNil(t *testing.T) {
reg := syntheticRegistry()
lim := recon.NewLimiterRegistry()
s := NewBingDorkSource("", reg, lim)
out := make(chan recon.Finding, 10)
if err := s.Sweep(context.Background(), "", out); err != nil {
t.Fatalf("expected nil, got %v", err)
}
close(out)
if n := countFindings(out); n != 0 {
t.Fatalf("expected 0 findings, got %d", n)
}
}
func TestBingDorkSource_SweepEmitsFindings(t *testing.T) {
reg := syntheticRegistry()
lim := recon.NewLimiterRegistry()
_ = lim.For("bing", 1000, 100)
var calls int32
srv := httptest.NewServer(bingStubHandler(t, &calls))
defer srv.Close()
s := NewBingDorkSource("testkey", reg, lim)
s.BaseURL = srv.URL
out := make(chan recon.Finding, 32)
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
done := make(chan error, 1)
go func() { done <- s.Sweep(ctx, "", out); close(out) }()
var findings []recon.Finding
for f := range out {
findings = append(findings, f)
}
if err := <-done; err != nil {
t.Fatalf("Sweep error: %v", err)
}
// 2 keywords * 3 items = 6 findings
if len(findings) != 6 {
t.Fatalf("expected 6 findings, got %d", len(findings))
}
for _, f := range findings {
if f.SourceType != "recon:bing" {
t.Errorf("SourceType=%q want recon:bing", f.SourceType)
}
}
if got := atomic.LoadInt32(&calls); got != 2 {
t.Errorf("expected 2 calls, got %d", got)
}
}
func TestBingDorkSource_CtxCancelled(t *testing.T) {
reg := syntheticRegistry()
lim := recon.NewLimiterRegistry()
_ = lim.For("bing", 1000, 100)
s := NewBingDorkSource("key", reg, lim)
s.BaseURL = "http://127.0.0.1:1"
ctx, cancel := context.WithCancel(context.Background())
cancel()
out := make(chan recon.Finding, 1)
err := s.Sweep(ctx, "", out)
if !errors.Is(err, context.Canceled) {
t.Fatalf("expected context.Canceled, got %v", err)
}
}
func TestBingDorkSource_Unauthorized(t *testing.T) {
reg := syntheticRegistry()
lim := recon.NewLimiterRegistry()
_ = lim.For("bing", 1000, 100)
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusUnauthorized)
_, _ = w.Write([]byte("invalid key"))
}))
defer srv.Close()
s := NewBingDorkSource("key", reg, lim)
s.BaseURL = srv.URL
out := make(chan recon.Finding, 1)
err := s.Sweep(context.Background(), "", out)
if !errors.Is(err, ErrUnauthorized) {
t.Fatalf("expected ErrUnauthorized, got %v", err)
}
}

153
pkg/recon/sources/brave.go Normal file
View File

@@ -0,0 +1,153 @@
package sources
import (
"context"
"encoding/json"
"errors"
"fmt"
"net/http"
"net/url"
"strings"
"time"
"golang.org/x/time/rate"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
// BraveSource implements recon.ReconSource against the Brave Search API.
// It requires an API key (X-Subscription-Token) to be enabled.
type BraveSource struct {
APIKey string
BaseURL string
Registry *providers.Registry
Limiters *recon.LimiterRegistry
client *Client
}
// Compile-time assertion.
var _ recon.ReconSource = (*BraveSource)(nil)
// NewBraveSource constructs a BraveSource with the shared retry client.
func NewBraveSource(apiKey string, reg *providers.Registry, lim *recon.LimiterRegistry) *BraveSource {
return &BraveSource{
APIKey: apiKey,
BaseURL: "https://api.search.brave.com",
Registry: reg,
Limiters: lim,
client: NewClient(),
}
}
func (s *BraveSource) Name() string { return "brave" }
func (s *BraveSource) RateLimit() rate.Limit { return rate.Every(1 * time.Second) }
func (s *BraveSource) Burst() int { return 1 }
func (s *BraveSource) RespectsRobots() bool { return false }
// Enabled returns true only when APIKey is configured.
func (s *BraveSource) Enabled(_ recon.Config) bool { return s.APIKey != "" }
// Sweep issues one Brave Search request per provider keyword and emits a
// Finding for every web result.
func (s *BraveSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
if s.APIKey == "" {
return nil
}
base := s.BaseURL
if base == "" {
base = "https://api.search.brave.com"
}
queries := BuildQueries(s.Registry, "brave")
kwIndex := braveKeywordIndex(s.Registry)
for _, q := range queries {
if err := ctx.Err(); err != nil {
return err
}
if s.Limiters != nil {
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
return err
}
}
endpoint := fmt.Sprintf("%s/res/v1/web/search?q=%s&count=20", base, url.QueryEscape(q))
req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
if err != nil {
return fmt.Errorf("brave: build request: %w", err)
}
req.Header.Set("X-Subscription-Token", s.APIKey)
req.Header.Set("Accept", "application/json")
req.Header.Set("User-Agent", "keyhunter-recon")
resp, err := s.client.Do(ctx, req)
if err != nil {
if errors.Is(err, ErrUnauthorized) {
return err
}
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
return err
}
continue
}
var parsed braveSearchResponse
decErr := json.NewDecoder(resp.Body).Decode(&parsed)
_ = resp.Body.Close()
if decErr != nil {
continue
}
provName := kwIndex[strings.ToLower(extractGoogleKeyword(q))]
for _, it := range parsed.Web.Results {
f := recon.Finding{
ProviderName: provName,
Confidence: "low",
Source: it.URL,
SourceType: "recon:brave",
DetectedAt: time.Now(),
}
select {
case out <- f:
case <-ctx.Done():
return ctx.Err()
}
}
}
return nil
}
type braveSearchResponse struct {
Web braveWebResults `json:"web"`
}
type braveWebResults struct {
Results []braveWebItem `json:"results"`
}
type braveWebItem struct {
URL string `json:"url"`
Title string `json:"title"`
Description string `json:"description"`
}
// braveKeywordIndex maps lowercased keywords to provider names.
func braveKeywordIndex(reg *providers.Registry) map[string]string {
m := make(map[string]string)
if reg == nil {
return m
}
for _, p := range reg.List() {
for _, k := range p.Keywords {
kl := strings.ToLower(strings.TrimSpace(k))
if kl == "" {
continue
}
if _, exists := m[kl]; !exists {
m[kl] = p.Name
}
}
}
return m
}

View File

@@ -0,0 +1,145 @@
package sources
import (
"context"
"encoding/json"
"errors"
"net/http"
"net/http/httptest"
"strings"
"sync/atomic"
"testing"
"time"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
func braveStubHandler(t *testing.T, calls *int32) http.HandlerFunc {
t.Helper()
return func(w http.ResponseWriter, r *http.Request) {
atomic.AddInt32(calls, 1)
if !strings.HasPrefix(r.URL.Path, "/res/v1/web/search") {
t.Errorf("unexpected path: %s", r.URL.Path)
}
if got := r.Header.Get("X-Subscription-Token"); got != "testtoken" {
t.Errorf("missing subscription token: %q", got)
}
body := map[string]any{
"web": map[string]any{
"results": []map[string]any{
{"url": "https://pastebin.com/brave1", "title": "Brave Result 1", "description": "found key"},
{"url": "https://github.com/org/repo/blob/main/config.env", "title": "Brave Result 2", "description": "leaked"},
},
},
}
w.Header().Set("Content-Type", "application/json")
_ = json.NewEncoder(w).Encode(body)
}
}
func TestBraveSource_EnabledRequiresAPIKey(t *testing.T) {
reg := syntheticRegistry()
lim := recon.NewLimiterRegistry()
if s := NewBraveSource("", reg, lim); s.Enabled(recon.Config{}) {
t.Error("expected Enabled=false with empty key")
}
if s := NewBraveSource("key", reg, lim); !s.Enabled(recon.Config{}) {
t.Error("expected Enabled=true with key")
}
}
func TestBraveSource_SweepEmptyKeyReturnsNil(t *testing.T) {
reg := syntheticRegistry()
lim := recon.NewLimiterRegistry()
s := NewBraveSource("", reg, lim)
out := make(chan recon.Finding, 10)
if err := s.Sweep(context.Background(), "", out); err != nil {
t.Fatalf("expected nil, got %v", err)
}
close(out)
if n := countFindings(out); n != 0 {
t.Fatalf("expected 0 findings, got %d", n)
}
}
func TestBraveSource_SweepEmitsFindings(t *testing.T) {
reg := syntheticRegistry()
lim := recon.NewLimiterRegistry()
_ = lim.For("brave", 1000, 100)
var calls int32
srv := httptest.NewServer(braveStubHandler(t, &calls))
defer srv.Close()
s := NewBraveSource("testtoken", reg, lim)
s.BaseURL = srv.URL
out := make(chan recon.Finding, 32)
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
done := make(chan error, 1)
go func() { done <- s.Sweep(ctx, "", out); close(out) }()
var findings []recon.Finding
for f := range out {
findings = append(findings, f)
}
if err := <-done; err != nil {
t.Fatalf("Sweep error: %v", err)
}
// 2 keywords * 2 items = 4 findings
if len(findings) != 4 {
t.Fatalf("expected 4 findings, got %d", len(findings))
}
for _, f := range findings {
if f.SourceType != "recon:brave" {
t.Errorf("SourceType=%q want recon:brave", f.SourceType)
}
}
if got := atomic.LoadInt32(&calls); got != 2 {
t.Errorf("expected 2 calls, got %d", got)
}
}
func TestBraveSource_CtxCancelled(t *testing.T) {
reg := syntheticRegistry()
lim := recon.NewLimiterRegistry()
_ = lim.For("brave", 1000, 100)
s := NewBraveSource("key", reg, lim)
s.BaseURL = "http://127.0.0.1:1"
ctx, cancel := context.WithCancel(context.Background())
cancel()
out := make(chan recon.Finding, 1)
err := s.Sweep(ctx, "", out)
if !errors.Is(err, context.Canceled) {
t.Fatalf("expected context.Canceled, got %v", err)
}
}
func TestBraveSource_Unauthorized(t *testing.T) {
reg := syntheticRegistry()
lim := recon.NewLimiterRegistry()
_ = lim.For("brave", 1000, 100)
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusUnauthorized)
_, _ = w.Write([]byte("bad token"))
}))
defer srv.Close()
s := NewBraveSource("key", reg, lim)
s.BaseURL = srv.URL
out := make(chan recon.Finding, 1)
err := s.Sweep(context.Background(), "", out)
if !errors.Is(err, ErrUnauthorized) {
t.Fatalf("expected ErrUnauthorized, got %v", err)
}
}

View File

@@ -0,0 +1,116 @@
package sources
import (
"context"
"fmt"
"net/http"
"net/url"
"regexp"
"time"
"golang.org/x/time/rate"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
// DuckDuckGoSource implements recon.ReconSource by scraping DuckDuckGo's HTML
// search endpoint. No API key is required -- this source is always enabled.
//
// It operates conservatively (2s per request) and declares RespectsRobots=true.
type DuckDuckGoSource struct {
BaseURL string
Registry *providers.Registry
Limiters *recon.LimiterRegistry
client *Client
}
// Compile-time assertion.
var _ recon.ReconSource = (*DuckDuckGoSource)(nil)
// ddgResultRE matches DuckDuckGo HTML result links. The HTML search page uses
// <a class="result__a" href="..."> anchors for organic results.
var ddgResultRE = regexp.MustCompile(`^https?://`)
// NewDuckDuckGoSource constructs a DuckDuckGoSource with the shared retry client.
func NewDuckDuckGoSource(reg *providers.Registry, lim *recon.LimiterRegistry) *DuckDuckGoSource {
return &DuckDuckGoSource{
BaseURL: "https://html.duckduckgo.com",
Registry: reg,
Limiters: lim,
client: NewClient(),
}
}
func (s *DuckDuckGoSource) Name() string { return "duckduckgo" }
func (s *DuckDuckGoSource) RateLimit() rate.Limit { return rate.Every(2 * time.Second) }
func (s *DuckDuckGoSource) Burst() int { return 1 }
func (s *DuckDuckGoSource) RespectsRobots() bool { return true }
// Enabled always returns true -- DuckDuckGo HTML scraping requires no credentials.
func (s *DuckDuckGoSource) Enabled(_ recon.Config) bool { return true }
// Sweep iterates provider keywords, scrapes DuckDuckGo HTML search, and emits
// a Finding per result link.
func (s *DuckDuckGoSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
base := s.BaseURL
if base == "" {
base = "https://html.duckduckgo.com"
}
client := s.client
if client == nil {
client = NewClient()
}
queries := BuildQueries(s.Registry, "duckduckgo")
if len(queries) == 0 {
return nil
}
for _, q := range queries {
if err := ctx.Err(); err != nil {
return err
}
if s.Limiters != nil {
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
return err
}
}
searchURL := fmt.Sprintf("%s/html/?q=%s", base, url.QueryEscape(q))
req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
if err != nil {
return fmt.Errorf("duckduckgo: build req: %w", err)
}
req.Header.Set("User-Agent", "keyhunter-recon")
resp, err := client.Do(ctx, req)
if err != nil {
// Transient failures: continue to next query.
continue
}
links, parseErr := extractAnchorHrefs(resp.Body, ddgResultRE)
_ = resp.Body.Close()
if parseErr != nil {
continue
}
for _, href := range links {
if err := ctx.Err(); err != nil {
return err
}
f := recon.Finding{
Source: href,
SourceType: "recon:duckduckgo",
Confidence: "low",
DetectedAt: time.Now(),
}
select {
case out <- f:
case <-ctx.Done():
return ctx.Err()
}
}
}
return nil
}

View File

@@ -0,0 +1,134 @@
package sources
import (
"context"
"errors"
"net/http"
"net/http/httptest"
"sync/atomic"
"testing"
"time"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
const ddgHTMLFixture = `<!DOCTYPE html>
<html>
<body>
<div class="results">
<div class="result">
<a class="result__a" href="https://pastebin.com/abc123">Pastebin Result</a>
</div>
<div class="result">
<a class="result__a" href="https://github.com/user/repo/blob/main/.env">GitHub Result</a>
</div>
<div class="result">
<a class="result__a" href="https://example.com/page">Example</a>
</div>
</div>
</body>
</html>`
func ddgStubHandler(t *testing.T, calls *int32) http.HandlerFunc {
t.Helper()
return func(w http.ResponseWriter, r *http.Request) {
atomic.AddInt32(calls, 1)
if r.URL.Path != "/html/" {
t.Errorf("unexpected path: %s", r.URL.Path)
}
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write([]byte(ddgHTMLFixture))
}
}
func TestDuckDuckGoSource_AlwaysEnabled(t *testing.T) {
s := NewDuckDuckGoSource(syntheticRegistry(), recon.NewLimiterRegistry())
if !s.Enabled(recon.Config{}) {
t.Error("expected Enabled=true always")
}
}
func TestDuckDuckGoSource_RespectsRobots(t *testing.T) {
s := NewDuckDuckGoSource(syntheticRegistry(), recon.NewLimiterRegistry())
if !s.RespectsRobots() {
t.Error("expected RespectsRobots=true")
}
}
func TestDuckDuckGoSource_SweepEmitsFindings(t *testing.T) {
reg := syntheticRegistry()
lim := recon.NewLimiterRegistry()
_ = lim.For("duckduckgo", 1000, 100)
var calls int32
srv := httptest.NewServer(ddgStubHandler(t, &calls))
defer srv.Close()
s := NewDuckDuckGoSource(reg, lim)
s.BaseURL = srv.URL
out := make(chan recon.Finding, 32)
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
done := make(chan error, 1)
go func() { done <- s.Sweep(ctx, "", out); close(out) }()
var findings []recon.Finding
for f := range out {
findings = append(findings, f)
}
if err := <-done; err != nil {
t.Fatalf("Sweep error: %v", err)
}
// 2 keywords * 3 links = 6 findings
if len(findings) != 6 {
t.Fatalf("expected 6 findings, got %d", len(findings))
}
for _, f := range findings {
if f.SourceType != "recon:duckduckgo" {
t.Errorf("SourceType=%q want recon:duckduckgo", f.SourceType)
}
if f.Confidence != "low" {
t.Errorf("Confidence=%q want low", f.Confidence)
}
}
if got := atomic.LoadInt32(&calls); got != 2 {
t.Errorf("expected 2 DDG calls, got %d", got)
}
}
func TestDuckDuckGoSource_CtxCancelled(t *testing.T) {
reg := syntheticRegistry()
lim := recon.NewLimiterRegistry()
_ = lim.For("duckduckgo", 1000, 100)
s := NewDuckDuckGoSource(reg, lim)
s.BaseURL = "http://127.0.0.1:1"
ctx, cancel := context.WithCancel(context.Background())
cancel()
out := make(chan recon.Finding, 1)
err := s.Sweep(ctx, "", out)
if !errors.Is(err, context.Canceled) {
t.Fatalf("expected context.Canceled, got %v", err)
}
}
func TestDuckDuckGoSource_EmptyRegistryNoError(t *testing.T) {
lim := recon.NewLimiterRegistry()
s := NewDuckDuckGoSource(nil, lim)
out := make(chan recon.Finding, 1)
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
t.Fatal("should not be called with nil registry")
}))
defer srv.Close()
s.BaseURL = srv.URL
if err := s.Sweep(context.Background(), "", out); err != nil {
t.Fatalf("expected nil, got %v", err)
}
}

View File

@@ -0,0 +1,152 @@
package sources
import (
"context"
"fmt"
"io"
"net/http"
"net/url"
"regexp"
"strings"
"time"
"golang.org/x/time/rate"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
// GistPasteSource scrapes gist.github.com's public search (no auth required)
// for API key leaks. This is distinct from Phase 10's GistSource which uses
// the authenticated GitHub API.
//
// Auth: none. Rate: Every(3s), Burst 1.
type GistPasteSource struct {
BaseURL string
Registry *providers.Registry
Limiters *recon.LimiterRegistry
Client *Client
}
// gistPasteLinkRE matches gist links: /<user>/<hex-hash>
var gistPasteLinkRE = regexp.MustCompile(`^/[^/]+/[a-f0-9]+$`)
// Compile-time assertion.
var _ recon.ReconSource = (*GistPasteSource)(nil)
func (s *GistPasteSource) Name() string { return "gistpaste" }
func (s *GistPasteSource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) }
func (s *GistPasteSource) Burst() int { return 1 }
func (s *GistPasteSource) RespectsRobots() bool { return true }
// Enabled always returns true: gist search scraping requires no credentials.
func (s *GistPasteSource) Enabled(_ recon.Config) bool { return true }
// Sweep searches gist.github.com for each provider keyword, fetches raw gist
// content, and emits Findings for keyword matches.
func (s *GistPasteSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
base := s.BaseURL
if base == "" {
base = "https://gist.github.com"
}
client := s.Client
if client == nil {
client = NewClient()
}
queries := BuildQueries(s.Registry, "gistpaste")
if len(queries) == 0 {
return nil
}
keywords := gistPasteKeywordSet(s.Registry)
if len(keywords) == 0 {
return nil
}
for _, q := range queries {
if err := ctx.Err(); err != nil {
return err
}
if s.Limiters != nil {
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
return err
}
}
searchURL := fmt.Sprintf("%s/search?q=%s", base, url.QueryEscape(q))
req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
if err != nil {
return fmt.Errorf("gistpaste: build search req: %w", err)
}
resp, err := client.Do(ctx, req)
if err != nil {
return fmt.Errorf("gistpaste: search fetch: %w", err)
}
links, err := extractAnchorHrefs(resp.Body, gistPasteLinkRE)
_ = resp.Body.Close()
if err != nil {
return fmt.Errorf("gistpaste: parse search html: %w", err)
}
for _, gistPath := range links {
if err := ctx.Err(); err != nil {
return err
}
if s.Limiters != nil {
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
return err
}
}
rawURL := fmt.Sprintf("%s%s/raw", base, gistPath)
rawReq, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
if err != nil {
return fmt.Errorf("gistpaste: build raw req: %w", err)
}
rawResp, err := client.Do(ctx, rawReq)
if err != nil {
continue // skip this gist on error
}
body, readErr := io.ReadAll(io.LimitReader(rawResp.Body, 256*1024))
_ = rawResp.Body.Close()
if readErr != nil {
continue
}
content := string(body)
for kw, provName := range keywords {
if strings.Contains(content, kw) {
out <- recon.Finding{
ProviderName: provName,
Source: fmt.Sprintf("%s%s", base, gistPath),
SourceType: "recon:gistpaste",
Confidence: "low",
DetectedAt: time.Now(),
}
break // one finding per gist
}
}
}
}
return nil
}
// gistPasteKeywordSet builds keyword->providerName map from registry.
func gistPasteKeywordSet(reg *providers.Registry) map[string]string {
out := make(map[string]string)
if reg == nil {
return out
}
for _, p := range reg.List() {
for _, k := range p.Keywords {
if k == "" {
continue
}
if _, ok := out[k]; !ok {
out[k] = p.Name
}
}
}
return out
}

View File

@@ -0,0 +1,119 @@
package sources
import (
"context"
"net/http"
"net/http/httptest"
"testing"
"time"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
func gistPasteTestRegistry() *providers.Registry {
return providers.NewRegistryFromProviders([]providers.Provider{
{Name: "anthropic", Keywords: []string{"sk-ant-"}},
})
}
const gistPasteSearchHTML = `<!doctype html>
<html><body>
<a href="/alice/abc123def456">gist one</a>
<a href="/bob/789aaa000bbb">gist two</a>
<a href="/about">nope</a>
<a href="/trending">nope</a>
</body></html>`
const gistPasteRaw1 = `config with sk-ant-XYZKEY123 inside`
const gistPasteRaw2 = `nothing here`
func TestGistPaste_Sweep_ExtractsFindings(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch {
case r.URL.Path == "/search":
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write([]byte(gistPasteSearchHTML))
case r.URL.Path == "/alice/abc123def456/raw":
_, _ = w.Write([]byte(gistPasteRaw1))
case r.URL.Path == "/bob/789aaa000bbb/raw":
_, _ = w.Write([]byte(gistPasteRaw2))
default:
http.NotFound(w, r)
}
}))
defer srv.Close()
src := &GistPasteSource{
BaseURL: srv.URL,
Registry: gistPasteTestRegistry(),
Limiters: recon.NewLimiterRegistry(),
Client: NewClient(),
}
out := make(chan recon.Finding, 16)
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
if err := src.Sweep(ctx, "", out); err != nil {
t.Fatalf("Sweep err: %v", err)
}
close(out)
var findings []recon.Finding
for f := range out {
findings = append(findings, f)
}
if len(findings) != 1 {
t.Fatalf("expected 1 finding, got %d", len(findings))
}
f := findings[0]
if f.SourceType != "recon:gistpaste" {
t.Errorf("SourceType=%s, want recon:gistpaste", f.SourceType)
}
if f.ProviderName != "anthropic" {
t.Errorf("ProviderName=%s, want anthropic", f.ProviderName)
}
wantSource := srv.URL + "/alice/abc123def456"
if f.Source != wantSource {
t.Errorf("Source=%s, want %s", f.Source, wantSource)
}
}
func TestGistPaste_NameAndRate(t *testing.T) {
s := &GistPasteSource{}
if s.Name() != "gistpaste" {
t.Errorf("Name=%s", s.Name())
}
if s.Burst() != 1 {
t.Errorf("Burst=%d", s.Burst())
}
if !s.RespectsRobots() {
t.Error("expected RespectsRobots=true")
}
if !s.Enabled(recon.Config{}) {
t.Error("expected Enabled=true")
}
}
func TestGistPaste_Sweep_CtxCancelled(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
time.Sleep(500 * time.Millisecond)
_, _ = w.Write([]byte(gistPasteSearchHTML))
}))
defer srv.Close()
src := &GistPasteSource{
BaseURL: srv.URL,
Registry: gistPasteTestRegistry(),
Limiters: recon.NewLimiterRegistry(),
Client: NewClient(),
}
ctx, cancel := context.WithCancel(context.Background())
cancel()
out := make(chan recon.Finding, 4)
if err := src.Sweep(ctx, "", out); err == nil {
t.Fatal("expected ctx error")
}
}

172
pkg/recon/sources/google.go Normal file
View File

@@ -0,0 +1,172 @@
package sources
import (
"context"
"encoding/json"
"errors"
"fmt"
"net/http"
"net/url"
"strings"
"time"
"golang.org/x/time/rate"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
// GoogleDorkSource implements recon.ReconSource against the Google Custom
// Search JSON API. It iterates provider keyword queries (via BuildQueries)
// and emits a recon.Finding for every search result item returned.
//
// Both APIKey and CX (custom search engine ID) must be set for the source to
// be enabled. Missing credentials disable the source without error.
type GoogleDorkSource struct {
APIKey string
CX string
BaseURL string
Registry *providers.Registry
Limiters *recon.LimiterRegistry
client *Client
}
// Compile-time assertion.
var _ recon.ReconSource = (*GoogleDorkSource)(nil)
// NewGoogleDorkSource constructs a GoogleDorkSource with the shared retry client.
func NewGoogleDorkSource(apiKey, cx string, reg *providers.Registry, lim *recon.LimiterRegistry) *GoogleDorkSource {
return &GoogleDorkSource{
APIKey: apiKey,
CX: cx,
BaseURL: "https://www.googleapis.com",
Registry: reg,
Limiters: lim,
client: NewClient(),
}
}
func (s *GoogleDorkSource) Name() string { return "google" }
func (s *GoogleDorkSource) RateLimit() rate.Limit { return rate.Every(1 * time.Second) }
func (s *GoogleDorkSource) Burst() int { return 1 }
func (s *GoogleDorkSource) RespectsRobots() bool { return false }
// Enabled returns true only when both APIKey and CX are configured.
func (s *GoogleDorkSource) Enabled(_ recon.Config) bool {
return s.APIKey != "" && s.CX != ""
}
// Sweep issues one Custom Search request per provider keyword and emits a
// Finding for every result item.
func (s *GoogleDorkSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
if s.APIKey == "" || s.CX == "" {
return nil
}
base := s.BaseURL
if base == "" {
base = "https://www.googleapis.com"
}
queries := BuildQueries(s.Registry, "google")
kwIndex := googleKeywordIndex(s.Registry)
for _, q := range queries {
if err := ctx.Err(); err != nil {
return err
}
if s.Limiters != nil {
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
return err
}
}
endpoint := fmt.Sprintf("%s/customsearch/v1?key=%s&cx=%s&q=%s&num=10",
base, url.QueryEscape(s.APIKey), url.QueryEscape(s.CX), url.QueryEscape(q))
req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
if err != nil {
return fmt.Errorf("google: build request: %w", err)
}
req.Header.Set("Accept", "application/json")
req.Header.Set("User-Agent", "keyhunter-recon")
resp, err := s.client.Do(ctx, req)
if err != nil {
if errors.Is(err, ErrUnauthorized) {
return err
}
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
return err
}
continue
}
var parsed googleSearchResponse
decErr := json.NewDecoder(resp.Body).Decode(&parsed)
_ = resp.Body.Close()
if decErr != nil {
continue
}
provName := kwIndex[strings.ToLower(extractGoogleKeyword(q))]
for _, it := range parsed.Items {
f := recon.Finding{
ProviderName: provName,
Confidence: "low",
Source: it.Link,
SourceType: "recon:google",
DetectedAt: time.Now(),
}
select {
case out <- f:
case <-ctx.Done():
return ctx.Err()
}
}
}
return nil
}
type googleSearchResponse struct {
Items []googleSearchItem `json:"items"`
}
type googleSearchItem struct {
Title string `json:"title"`
Link string `json:"link"`
Snippet string `json:"snippet"`
}
// googleKeywordIndex maps lowercased keywords to provider names.
func googleKeywordIndex(reg *providers.Registry) map[string]string {
m := make(map[string]string)
if reg == nil {
return m
}
for _, p := range reg.List() {
for _, k := range p.Keywords {
kl := strings.ToLower(strings.TrimSpace(k))
if kl == "" {
continue
}
if _, exists := m[kl]; !exists {
m[kl] = p.Name
}
}
}
return m
}
// extractGoogleKeyword reverses the dork query format to recover the keyword.
func extractGoogleKeyword(q string) string {
// Format: site:pastebin.com OR site:github.com "keyword"
idx := strings.LastIndex(q, `"`)
if idx <= 0 {
return q
}
inner := q[:idx]
start := strings.LastIndex(inner, `"`)
if start < 0 {
return q
}
return inner[start+1:]
}

View File

@@ -0,0 +1,158 @@
package sources
import (
"context"
"encoding/json"
"errors"
"net/http"
"net/http/httptest"
"strings"
"sync/atomic"
"testing"
"time"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
func googleStubHandler(t *testing.T, calls *int32) http.HandlerFunc {
t.Helper()
return func(w http.ResponseWriter, r *http.Request) {
atomic.AddInt32(calls, 1)
if !strings.HasPrefix(r.URL.Path, "/customsearch/v1") {
t.Errorf("unexpected path: %s", r.URL.Path)
}
if r.URL.Query().Get("key") != "testkey" {
t.Errorf("missing api key in query")
}
if r.URL.Query().Get("cx") != "testcx" {
t.Errorf("missing cx in query")
}
body := map[string]any{
"items": []map[string]any{
{"title": "result1", "link": "https://pastebin.com/abc123", "snippet": "found key"},
{"title": "result2", "link": "https://github.com/org/repo/blob/main/env", "snippet": "another"},
},
}
w.Header().Set("Content-Type", "application/json")
_ = json.NewEncoder(w).Encode(body)
}
}
func TestGoogleDorkSource_EnabledRequiresBothKeys(t *testing.T) {
reg := syntheticRegistry()
lim := recon.NewLimiterRegistry()
tests := []struct {
apiKey, cx string
want bool
}{
{"", "", false},
{"key", "", false},
{"", "cx", false},
{"key", "cx", true},
}
for _, tt := range tests {
s := NewGoogleDorkSource(tt.apiKey, tt.cx, reg, lim)
if got := s.Enabled(recon.Config{}); got != tt.want {
t.Errorf("Enabled(apiKey=%q, cx=%q) = %v, want %v", tt.apiKey, tt.cx, got, tt.want)
}
}
}
func TestGoogleDorkSource_SweepEmptyCredsReturnsNil(t *testing.T) {
reg := syntheticRegistry()
lim := recon.NewLimiterRegistry()
s := NewGoogleDorkSource("", "", reg, lim)
out := make(chan recon.Finding, 10)
if err := s.Sweep(context.Background(), "", out); err != nil {
t.Fatalf("expected nil err, got %v", err)
}
close(out)
if n := countFindings(out); n != 0 {
t.Fatalf("expected 0 findings, got %d", n)
}
}
func TestGoogleDorkSource_SweepEmitsFindings(t *testing.T) {
reg := syntheticRegistry()
lim := recon.NewLimiterRegistry()
_ = lim.For("google", 1000, 100)
var calls int32
srv := httptest.NewServer(googleStubHandler(t, &calls))
defer srv.Close()
s := NewGoogleDorkSource("testkey", "testcx", reg, lim)
s.BaseURL = srv.URL
out := make(chan recon.Finding, 32)
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
done := make(chan error, 1)
go func() { done <- s.Sweep(ctx, "", out); close(out) }()
var findings []recon.Finding
for f := range out {
findings = append(findings, f)
}
if err := <-done; err != nil {
t.Fatalf("Sweep error: %v", err)
}
// 2 keywords * 2 items = 4 findings
if len(findings) != 4 {
t.Fatalf("expected 4 findings, got %d", len(findings))
}
for _, f := range findings {
if f.SourceType != "recon:google" {
t.Errorf("SourceType=%q want recon:google", f.SourceType)
}
if f.Confidence != "low" {
t.Errorf("Confidence=%q want low", f.Confidence)
}
}
if got := atomic.LoadInt32(&calls); got != 2 {
t.Errorf("expected 2 API calls, got %d", got)
}
}
func TestGoogleDorkSource_CtxCancelled(t *testing.T) {
reg := syntheticRegistry()
lim := recon.NewLimiterRegistry()
_ = lim.For("google", 1000, 100)
s := NewGoogleDorkSource("key", "cx", reg, lim)
s.BaseURL = "http://127.0.0.1:1"
ctx, cancel := context.WithCancel(context.Background())
cancel()
out := make(chan recon.Finding, 1)
err := s.Sweep(ctx, "", out)
if !errors.Is(err, context.Canceled) {
t.Fatalf("expected context.Canceled, got %v", err)
}
}
func TestGoogleDorkSource_Unauthorized(t *testing.T) {
reg := syntheticRegistry()
lim := recon.NewLimiterRegistry()
_ = lim.For("google", 1000, 100)
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusUnauthorized)
_, _ = w.Write([]byte("bad key"))
}))
defer srv.Close()
s := NewGoogleDorkSource("key", "cx", reg, lim)
s.BaseURL = srv.URL
out := make(chan recon.Finding, 1)
err := s.Sweep(context.Background(), "", out)
if !errors.Is(err, ErrUnauthorized) {
t.Fatalf("expected ErrUnauthorized, got %v", err)
}
}

View File

@@ -14,10 +14,11 @@ import (
)
// TestIntegration_AllSources_SweepAll spins up a single multiplexed httptest
// server that serves canned fixtures for every Phase 10 code-hosting source,
// registers the sources (with BaseURL overrides pointing at the test server)
// onto a fresh recon.Engine, runs SweepAll, and asserts at least one Finding
// was emitted per SourceType across all ten sources.
// server that serves canned fixtures for every Phase 10 code-hosting source
// and Phase 11 search engine / paste site source, registers the sources (with
// BaseURL overrides pointing at the test server) onto a fresh recon.Engine,
// runs SweepAll, and asserts at least one Finding was emitted per SourceType
// across all 18 sources.
//
// RegisterAll cannot be used directly because it wires production URLs; the
// test exercises the same code paths by constructing each source identically
@@ -108,6 +109,64 @@ func TestIntegration_AllSources_SweepAll(t *testing.T) {
_, _ = w.Write([]byte(`[{"ref":"alice/leaky-notebook"}]`))
})
// ---- Phase 11: Google Custom Search /customsearch/v1 ----
mux.HandleFunc("/customsearch/v1", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`{"items":[{"link":"https://pastebin.com/abc123","title":"leak","snippet":"sk-proj-xxx"}]}`))
})
// ---- Phase 11: Bing /v7.0/search ----
mux.HandleFunc("/v7.0/search", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`{"webPages":{"value":[{"url":"https://example.com/bing-leak","name":"leak"}]}}`))
})
// ---- Phase 11: DuckDuckGo /html/ ----
mux.HandleFunc("/html/", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write([]byte(`<html><body><a class="result__a" href="https://example.com/ddg-leak">result</a></body></html>`))
})
// ---- Phase 11: Yandex /search/xml ----
mux.HandleFunc("/search/xml", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/xml")
_, _ = w.Write([]byte(`<?xml version="1.0" encoding="utf-8"?>
<yandexsearch><response><results><grouping><group><doc><url>https://example.com/yandex-leak</url></doc></group></grouping></results></response></yandexsearch>`))
})
// ---- Phase 11: Brave /res/v1/web/search ----
mux.HandleFunc("/res/v1/web/search", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`{"web":{"results":[{"url":"https://example.com/brave-leak","title":"leak"}]}}`))
})
// ---- Phase 11: Pastebin (routed under /pb/ prefix) ----
mux.HandleFunc("/pb/search", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write([]byte(`<html><body><a href="/AbCdEf12">paste1</a></body></html>`))
})
mux.HandleFunc("/pb/raw/AbCdEf12", func(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write([]byte("leaked key: sk-proj-PASTEBIN123"))
})
// ---- Phase 11: GistPaste (routed under /gp/ prefix) ----
mux.HandleFunc("/gp/search", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write([]byte(`<html><body><a href="/alice/deadbeef01">gist1</a></body></html>`))
})
mux.HandleFunc("/gp/alice/deadbeef01/raw", func(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write([]byte("leaked: sk-proj-GISTPASTE456"))
})
// ---- Phase 11: PasteSites sub-platforms ----
mux.HandleFunc("/paste-search", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write([]byte(`<html><body><a href="/aB3xZ9">paste</a></body></html>`))
})
mux.HandleFunc("/paste-raw/aB3xZ9", func(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write([]byte("secret: sk-proj-PASTESITES789"))
})
srv := httptest.NewServer(mux)
defer srv.Close()
@@ -118,7 +177,9 @@ func TestIntegration_AllSources_SweepAll(t *testing.T) {
eng := recon.NewEngine()
// GitHub — token + BaseURL override. Use the real constructor so `client`
// --- Phase 10 sources ---
// GitHub -- token + BaseURL override. Use the real constructor so `client`
// is initialized, then retarget BaseURL at the test server.
ghs := NewGitHubSource("ghp-test", reg, lim)
ghs.BaseURL = srv.URL
@@ -138,7 +199,7 @@ func TestIntegration_AllSources_SweepAll(t *testing.T) {
Registry: reg,
Limiters: lim,
})
// Gist uses same BaseURL for /gists/public; raw URLs are absolute in fixture.
// Gist -- uses same BaseURL for /gists/public; raw URLs are absolute in fixture.
eng.Register(&GistSource{
Token: "ghp-test",
BaseURL: srv.URL,
@@ -169,7 +230,7 @@ func TestIntegration_AllSources_SweepAll(t *testing.T) {
Registry: reg,
Limiters: lim,
})
// Sandboxes inject test sub-platforms that hit srv.URL.
// Sandboxes -- inject test sub-platforms that hit srv.URL.
eng.Register(&SandboxesSource{
Platforms: []subPlatform{
{Name: "codepen", SearchPath: "/codepen-search?q=%s", ResultLinkRegex: `^/[^/]+/pen/[a-zA-Z0-9]+$`, IsJSON: false},
@@ -191,12 +252,64 @@ func TestIntegration_AllSources_SweepAll(t *testing.T) {
client: NewClient(),
})
// Sanity: all 10 sources registered.
if n := len(eng.List()); n != 10 {
t.Fatalf("expected 10 sources on engine, got %d: %v", n, eng.List())
// --- Phase 11 sources ---
// Google Custom Search
gs := NewGoogleDorkSource("test-api-key", "test-cx", reg, lim)
gs.BaseURL = srv.URL
eng.Register(gs)
// Bing
bs := NewBingDorkSource("test-bing-key", reg, lim)
bs.BaseURL = srv.URL
eng.Register(bs)
// DuckDuckGo
ddg := NewDuckDuckGoSource(reg, lim)
ddg.BaseURL = srv.URL
eng.Register(ddg)
// Yandex
ys := NewYandexSource("test-user", "test-key", reg, lim)
ys.BaseURL = srv.URL
eng.Register(ys)
// Brave
brs := NewBraveSource("test-brave-key", reg, lim)
brs.BaseURL = srv.URL
eng.Register(brs)
// Pastebin -- uses /pb/ prefix to avoid /search collision
eng.Register(&PastebinSource{
BaseURL: srv.URL + "/pb",
Registry: reg,
Limiters: lim,
Client: NewClient(),
})
// GistPaste -- uses /gp/ prefix
eng.Register(&GistPasteSource{
BaseURL: srv.URL + "/gp",
Registry: reg,
Limiters: lim,
Client: NewClient(),
})
// PasteSites -- inject test sub-platform
eng.Register(&PasteSitesSource{
Platforms: []pastePlatform{
{
Name: "testpaste",
SearchPath: "/paste-search?q=%s",
ResultLinkRegex: `^/[a-zA-Z0-9]+$`,
RawPathTemplate: "/paste-raw%s",
},
},
Registry: reg,
Limiters: lim,
Client: NewClient(),
BaseURL: srv.URL,
})
// Sanity: all 18 sources registered.
if n := len(eng.List()); n != 18 {
t.Fatalf("expected 18 sources on engine, got %d: %v", n, eng.List())
}
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
defer cancel()
findings, err := eng.SweepAll(ctx, recon.Config{Query: "ignored"})
@@ -211,6 +324,7 @@ func TestIntegration_AllSources_SweepAll(t *testing.T) {
}
wantTypes := []string{
// Phase 10
"recon:github",
"recon:gitlab",
"recon:bitbucket",
@@ -221,6 +335,15 @@ func TestIntegration_AllSources_SweepAll(t *testing.T) {
"recon:codesandbox",
"recon:sandboxes",
"recon:kaggle",
// Phase 11
"recon:google",
"recon:bing",
"recon:duckduckgo",
"recon:yandex",
"recon:brave",
"recon:pastebin",
"recon:gistpaste",
"recon:pastesites",
}
for _, st := range wantTypes {
if byType[st] == 0 {

View File

@@ -0,0 +1,156 @@
package sources
import (
"context"
"fmt"
"io"
"net/http"
"net/url"
"regexp"
"strings"
"time"
"golang.org/x/time/rate"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
// PastebinSource scrapes pastebin.com search results for API key leaks.
//
// Two-phase approach per keyword:
// - Phase A: search pastebin for keyword, extract paste IDs from result links
// - Phase B: fetch raw paste content, keyword-match against provider registry
//
// Auth: none (credential-free). Rate: Every(3s), Burst 1 (conservative scraping).
type PastebinSource struct {
BaseURL string
Registry *providers.Registry
Limiters *recon.LimiterRegistry
Client *Client
}
// pastebinIDRE matches Pastebin paste links: /XXXXXXXX (8 alphanumeric chars).
var pastebinIDRE = regexp.MustCompile(`^/[A-Za-z0-9]{8}$`)
// Compile-time assertion.
var _ recon.ReconSource = (*PastebinSource)(nil)
func (s *PastebinSource) Name() string { return "pastebin" }
func (s *PastebinSource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) }
func (s *PastebinSource) Burst() int { return 1 }
func (s *PastebinSource) RespectsRobots() bool { return true }
// Enabled always returns true: Pastebin scraping requires no credentials.
func (s *PastebinSource) Enabled(_ recon.Config) bool { return true }
// Sweep searches Pastebin for each provider keyword and scans raw paste content.
func (s *PastebinSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
base := s.BaseURL
if base == "" {
base = "https://pastebin.com"
}
client := s.Client
if client == nil {
client = NewClient()
}
queries := BuildQueries(s.Registry, "pastebin")
if len(queries) == 0 {
return nil
}
keywords := pastebinKeywordSet(s.Registry)
if len(keywords) == 0 {
return nil
}
for _, q := range queries {
if err := ctx.Err(); err != nil {
return err
}
if s.Limiters != nil {
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
return err
}
}
// Phase A: search for paste links.
searchURL := fmt.Sprintf("%s/search?q=%s", base, url.QueryEscape(q))
req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
if err != nil {
return fmt.Errorf("pastebin: build search req: %w", err)
}
resp, err := client.Do(ctx, req)
if err != nil {
return fmt.Errorf("pastebin: search fetch: %w", err)
}
ids, err := extractAnchorHrefs(resp.Body, pastebinIDRE)
_ = resp.Body.Close()
if err != nil {
return fmt.Errorf("pastebin: parse search html: %w", err)
}
// Phase B: fetch raw content and keyword-match.
for _, idPath := range ids {
if err := ctx.Err(); err != nil {
return err
}
if s.Limiters != nil {
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
return err
}
}
rawURL := fmt.Sprintf("%s/raw%s", base, idPath)
rawReq, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
if err != nil {
return fmt.Errorf("pastebin: build raw req: %w", err)
}
rawResp, err := client.Do(ctx, rawReq)
if err != nil {
// Skip this paste on fetch error, continue to next.
continue
}
body, readErr := io.ReadAll(io.LimitReader(rawResp.Body, 256*1024))
_ = rawResp.Body.Close()
if readErr != nil {
continue
}
content := string(body)
for kw, provName := range keywords {
if strings.Contains(content, kw) {
out <- recon.Finding{
ProviderName: provName,
Source: fmt.Sprintf("%s%s", base, idPath),
SourceType: "recon:pastebin",
Confidence: "low",
DetectedAt: time.Now(),
}
break // one finding per paste
}
}
}
}
return nil
}
// pastebinKeywordSet builds keyword->providerName map from registry.
func pastebinKeywordSet(reg *providers.Registry) map[string]string {
out := make(map[string]string)
if reg == nil {
return out
}
for _, p := range reg.List() {
for _, k := range p.Keywords {
if k == "" {
continue
}
if _, ok := out[k]; !ok {
out[k] = p.Name
}
}
}
return out
}

View File

@@ -0,0 +1,120 @@
package sources
import (
"context"
"net/http"
"net/http/httptest"
"testing"
"time"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
func pastebinTestRegistry() *providers.Registry {
return providers.NewRegistryFromProviders([]providers.Provider{
{Name: "openai", Keywords: []string{"sk-proj-"}},
})
}
const pastebinSearchHTML = `<!doctype html>
<html><body>
<a href="/Ab12Cd34">paste one</a>
<a href="/Ef56Gh78">paste two</a>
<a href="/about">nope</a>
<a href="/toolong123">nine chars nope</a>
</body></html>`
const pastebinRawContent1 = `some text with sk-proj-AAAA1234 leaked here`
const pastebinRawContent2 = `nothing interesting in this paste`
func TestPastebin_Sweep_ExtractsFindings(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch {
case r.URL.Path == "/search":
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write([]byte(pastebinSearchHTML))
case r.URL.Path == "/raw/Ab12Cd34":
_, _ = w.Write([]byte(pastebinRawContent1))
case r.URL.Path == "/raw/Ef56Gh78":
_, _ = w.Write([]byte(pastebinRawContent2))
default:
http.NotFound(w, r)
}
}))
defer srv.Close()
src := &PastebinSource{
BaseURL: srv.URL,
Registry: pastebinTestRegistry(),
Limiters: recon.NewLimiterRegistry(),
Client: NewClient(),
}
out := make(chan recon.Finding, 16)
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
if err := src.Sweep(ctx, "", out); err != nil {
t.Fatalf("Sweep err: %v", err)
}
close(out)
var findings []recon.Finding
for f := range out {
findings = append(findings, f)
}
// Only paste one has "sk-proj-", paste two doesn't match.
if len(findings) != 1 {
t.Fatalf("expected 1 finding, got %d", len(findings))
}
f := findings[0]
if f.SourceType != "recon:pastebin" {
t.Errorf("SourceType=%s, want recon:pastebin", f.SourceType)
}
if f.ProviderName != "openai" {
t.Errorf("ProviderName=%s, want openai", f.ProviderName)
}
wantSource := srv.URL + "/Ab12Cd34"
if f.Source != wantSource {
t.Errorf("Source=%s, want %s", f.Source, wantSource)
}
}
func TestPastebin_NameAndRate(t *testing.T) {
s := &PastebinSource{}
if s.Name() != "pastebin" {
t.Errorf("Name=%s", s.Name())
}
if s.Burst() != 1 {
t.Errorf("Burst=%d", s.Burst())
}
if !s.RespectsRobots() {
t.Error("expected RespectsRobots=true")
}
if !s.Enabled(recon.Config{}) {
t.Error("expected Enabled=true")
}
}
func TestPastebin_Sweep_CtxCancelled(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
time.Sleep(500 * time.Millisecond)
_, _ = w.Write([]byte(pastebinSearchHTML))
}))
defer srv.Close()
src := &PastebinSource{
BaseURL: srv.URL,
Registry: pastebinTestRegistry(),
Limiters: recon.NewLimiterRegistry(),
Client: NewClient(),
}
ctx, cancel := context.WithCancel(context.Background())
cancel()
out := make(chan recon.Finding, 4)
if err := src.Sweep(ctx, "", out); err == nil {
t.Fatal("expected ctx error")
}
}

View File

@@ -0,0 +1,242 @@
package sources
import (
"context"
"errors"
"fmt"
"io"
"log"
"net/http"
"net/url"
"regexp"
"strings"
"time"
"golang.org/x/time/rate"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
// pastePlatform describes one paste site sub-source aggregated under the
// "pastesites" umbrella. Follows the same multi-platform pattern as
// SandboxesSource.
//
// SearchPath is a printf format string with one %s for the URL-escaped query.
// RawPathTemplate, if non-empty, converts a matched link path into the raw
// content endpoint (e.g. "/raw%s" prepends /raw to the paste path).
type pastePlatform struct {
Name string
SearchPath string
ResultLinkRegex string
RawPathTemplate string // fmt with %s for matched path or extracted ID
}
// defaultPastePlatforms returns the production paste site list.
func defaultPastePlatforms() []pastePlatform {
return []pastePlatform{
{
Name: "dpaste",
SearchPath: "https://dpaste.org/search/?q=%s",
ResultLinkRegex: `^/[A-Za-z0-9]+$`,
RawPathTemplate: "%s/raw",
},
{
Name: "paste.ee",
SearchPath: "https://paste.ee/search?q=%s",
ResultLinkRegex: `^/p/[A-Za-z0-9]+$`,
RawPathTemplate: "/r%s", // /p/ID -> /r/p/ID ... actually /r/ID
},
{
Name: "rentry",
SearchPath: "https://rentry.co/search?q=%s",
ResultLinkRegex: `^/[a-z0-9-]+$`,
RawPathTemplate: "%s/raw",
},
{
Name: "hastebin",
SearchPath: "https://hastebin.com/search?q=%s",
ResultLinkRegex: `^/[a-z]+$`,
RawPathTemplate: "/raw%s",
},
}
}
// PasteSitesSource aggregates several paste sites into a single ReconSource.
// Each sub-platform is scraped independently; failures in one are logged and
// skipped without aborting the others.
//
// Every emitted Finding carries SourceType="recon:pastesites" and encodes the
// originating sub-platform in KeyMasked as "platform=<name>".
type PasteSitesSource struct {
Platforms []pastePlatform
Registry *providers.Registry
Limiters *recon.LimiterRegistry
Client *Client
// BaseURL, when non-empty, is prefixed to any relative SearchPath (tests).
BaseURL string
}
// Compile-time assertion.
var _ recon.ReconSource = (*PasteSitesSource)(nil)
func (s *PasteSitesSource) Name() string { return "pastesites" }
func (s *PasteSitesSource) RateLimit() rate.Limit { return rate.Every(3 * time.Second) }
func (s *PasteSitesSource) Burst() int { return 1 }
func (s *PasteSitesSource) RespectsRobots() bool { return true }
// Enabled always returns true: all paste site scraping is credential-free.
func (s *PasteSitesSource) Enabled(_ recon.Config) bool { return true }
// Sweep iterates each paste platform across each provider keyword. Per-platform
// errors are logged and skipped so one broken sub-source does not fail the
// overall sweep.
func (s *PasteSitesSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
plats := s.Platforms
if plats == nil {
plats = defaultPastePlatforms()
}
client := s.Client
if client == nil {
client = NewClient()
}
queries := BuildQueries(s.Registry, "pastesites")
if len(queries) == 0 {
return nil
}
keywords := pasteSitesKeywordSet(s.Registry)
if len(keywords) == 0 {
return nil
}
for _, p := range plats {
if err := ctx.Err(); err != nil {
return err
}
for _, q := range queries {
if err := ctx.Err(); err != nil {
return err
}
if s.Limiters != nil {
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
return err
}
}
if err := s.sweepPastePlatform(ctx, client, p, q, keywords, out); err != nil {
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
return err
}
log.Printf("pastesites: platform %q failed (skipping): %v", p.Name, err)
break // next platform
}
}
}
return nil
}
// sweepPastePlatform performs a search on one paste platform, fetches raw
// content for each result link, and emits findings for keyword matches.
func (s *PasteSitesSource) sweepPastePlatform(
ctx context.Context,
client *Client,
p pastePlatform,
query string,
keywords map[string]string,
out chan<- recon.Finding,
) error {
rawURL := fmt.Sprintf(p.SearchPath, url.QueryEscape(query))
if s.BaseURL != "" && strings.HasPrefix(rawURL, "/") {
rawURL = s.BaseURL + rawURL
}
re, err := regexp.Compile(p.ResultLinkRegex)
if err != nil {
return fmt.Errorf("bad regex: %w", err)
}
req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
if err != nil {
return fmt.Errorf("build req: %w", err)
}
resp, err := client.Do(ctx, req)
if err != nil {
return fmt.Errorf("fetch: %w", err)
}
links, err := extractAnchorHrefs(resp.Body, re)
_ = resp.Body.Close()
if err != nil {
return fmt.Errorf("parse html: %w", err)
}
// Determine base for absolute URLs from the search URL.
searchParsed, _ := url.Parse(rawURL)
scheme := searchParsed.Scheme
host := searchParsed.Host
for _, linkPath := range links {
if err := ctx.Err(); err != nil {
return err
}
if s.Limiters != nil {
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
return err
}
}
// Build raw content URL.
rawPath := fmt.Sprintf(p.RawPathTemplate, linkPath)
fetchURL := fmt.Sprintf("%s://%s%s", scheme, host, rawPath)
rawReq, err := http.NewRequestWithContext(ctx, http.MethodGet, fetchURL, nil)
if err != nil {
continue
}
rawResp, err := client.Do(ctx, rawReq)
if err != nil {
continue // skip this paste on error
}
body, readErr := io.ReadAll(io.LimitReader(rawResp.Body, 256*1024))
_ = rawResp.Body.Close()
if readErr != nil {
continue
}
content := string(body)
for kw, provName := range keywords {
if strings.Contains(content, kw) {
pasteURL := fmt.Sprintf("%s://%s%s", scheme, host, linkPath)
out <- recon.Finding{
ProviderName: provName,
Source: pasteURL,
SourceType: "recon:pastesites",
KeyMasked: "platform=" + p.Name,
Confidence: "low",
DetectedAt: time.Now(),
}
break // one finding per paste
}
}
}
return nil
}
// pasteSitesKeywordSet builds keyword->providerName map from registry.
func pasteSitesKeywordSet(reg *providers.Registry) map[string]string {
out := make(map[string]string)
if reg == nil {
return out
}
for _, p := range reg.List() {
for _, k := range p.Keywords {
if k == "" {
continue
}
if _, ok := out[k]; !ok {
out[k] = p.Name
}
}
}
return out
}

View File

@@ -0,0 +1,190 @@
package sources
import (
"context"
"net/http"
"net/http/httptest"
"testing"
"time"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
func pasteSitesTestRegistry() *providers.Registry {
return providers.NewRegistryFromProviders([]providers.Provider{
{Name: "openai", Keywords: []string{"sk-proj-"}},
})
}
// Fixture HTML for each sub-platform search result page.
const dpasteSearchHTML = `<html><body><a href="/AbcDef12">dpaste hit</a></body></html>`
const pasteEeSearchHTML = `<html><body><a href="/p/Xyz789">paste.ee hit</a></body></html>`
const rentrySearchHTML = `<html><body><a href="/my-paste">rentry hit</a></body></html>`
const hastebinSearchHTML = `<html><body><a href="/abcdef">hastebin hit</a></body></html>`
// Raw content fixtures -- some match, some don't.
const dpasteRaw = `leaked: sk-proj-AAAA1234 oops`
const pasteEeRaw = `config sk-proj-BBBBB5678 here`
const rentryRaw = `has sk-proj-CCCC9012 inside`
const hastebinRaw = `nothing interesting`
func TestPasteSites_Sweep_ExtractsFindings(t *testing.T) {
mux := http.NewServeMux()
// dpaste routes
mux.HandleFunc("/dpaste-search/", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write([]byte(dpasteSearchHTML))
})
mux.HandleFunc("/AbcDef12/raw", func(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write([]byte(dpasteRaw))
})
// paste.ee routes
mux.HandleFunc("/pasteee-search", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write([]byte(pasteEeSearchHTML))
})
mux.HandleFunc("/r/p/Xyz789", func(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write([]byte(pasteEeRaw))
})
// rentry routes
mux.HandleFunc("/rentry-search", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write([]byte(rentrySearchHTML))
})
mux.HandleFunc("/my-paste/raw", func(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write([]byte(rentryRaw))
})
// hastebin routes
mux.HandleFunc("/hastebin-search", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write([]byte(hastebinSearchHTML))
})
mux.HandleFunc("/raw/abcdef", func(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write([]byte(hastebinRaw))
})
srv := httptest.NewServer(mux)
defer srv.Close()
// Override platforms to use test server with relative paths.
testPlats := []pastePlatform{
{
Name: "dpaste",
SearchPath: srv.URL + "/dpaste-search/?q=%s",
ResultLinkRegex: `^/[A-Za-z0-9]+$`,
RawPathTemplate: "%s/raw",
},
{
Name: "paste.ee",
SearchPath: srv.URL + "/pasteee-search?q=%s",
ResultLinkRegex: `^/p/[A-Za-z0-9]+$`,
RawPathTemplate: "/r%s",
},
{
Name: "rentry",
SearchPath: srv.URL + "/rentry-search?q=%s",
ResultLinkRegex: `^/[a-z0-9-]+$`,
RawPathTemplate: "%s/raw",
},
{
Name: "hastebin",
SearchPath: srv.URL + "/hastebin-search?q=%s",
ResultLinkRegex: `^/[a-z]+$`,
RawPathTemplate: "/raw%s",
},
}
src := &PasteSitesSource{
Platforms: testPlats,
Registry: pasteSitesTestRegistry(),
Limiters: recon.NewLimiterRegistry(),
Client: NewClient(),
}
out := make(chan recon.Finding, 32)
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
defer cancel()
if err := src.Sweep(ctx, "", out); err != nil {
t.Fatalf("Sweep err: %v", err)
}
close(out)
var findings []recon.Finding
for f := range out {
findings = append(findings, f)
}
// dpaste, paste.ee, rentry have matching content; hastebin does not.
if len(findings) < 3 {
t.Fatalf("expected at least 3 findings (dpaste+paste.ee+rentry), got %d", len(findings))
}
platforms := make(map[string]bool)
for _, f := range findings {
if f.SourceType != "recon:pastesites" {
t.Errorf("SourceType=%s, want recon:pastesites", f.SourceType)
}
// Extract platform from KeyMasked.
if len(f.KeyMasked) > len("platform=") {
platforms[f.KeyMasked[len("platform="):]] = true
}
}
for _, want := range []string{"dpaste", "paste.ee", "rentry"} {
if !platforms[want] {
t.Errorf("missing platform %q in findings; got platforms=%v", want, platforms)
}
}
}
func TestPasteSites_NameAndRate(t *testing.T) {
s := &PasteSitesSource{}
if s.Name() != "pastesites" {
t.Errorf("Name=%s", s.Name())
}
if s.Burst() != 1 {
t.Errorf("Burst=%d", s.Burst())
}
if !s.RespectsRobots() {
t.Error("expected RespectsRobots=true")
}
if !s.Enabled(recon.Config{}) {
t.Error("expected Enabled=true")
}
}
func TestPasteSites_Sweep_CtxCancelled(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
time.Sleep(500 * time.Millisecond)
_, _ = w.Write([]byte(`<html><body></body></html>`))
}))
defer srv.Close()
testPlats := []pastePlatform{
{
Name: "test",
SearchPath: srv.URL + "/search?q=%s",
ResultLinkRegex: `^/[a-z]+$`,
RawPathTemplate: "/raw%s",
},
}
src := &PasteSitesSource{
Platforms: testPlats,
Registry: pasteSitesTestRegistry(),
Limiters: recon.NewLimiterRegistry(),
Client: NewClient(),
}
ctx, cancel := context.WithCancel(context.Background())
cancel()
out := make(chan recon.Finding, 4)
if err := src.Sweep(ctx, "", out); err == nil {
t.Fatal("expected ctx error")
}
}

View File

@@ -47,6 +47,8 @@ func formatQuery(source, keyword string) string {
switch source {
case "github", "gist":
return fmt.Sprintf("%q in:file", keyword)
case "google", "bing", "duckduckgo", "yandex", "brave":
return fmt.Sprintf(`site:pastebin.com OR site:github.com "%s"`, keyword)
default:
// GitLab, Bitbucket, Codeberg, HuggingFace, Kaggle, Replit,
// CodeSandbox, sandboxes, and unknown sources use bare keywords.

View File

@@ -28,20 +28,32 @@ type SourcesConfig struct {
KaggleUser string
KaggleKey string
// Google Custom Search API key and search engine ID (CX).
GoogleAPIKey string
GoogleCX string
// Bing Web Search API subscription key.
BingAPIKey string
// Yandex XML Search user and API key.
YandexUser string
YandexAPIKey string
// Brave Search API subscription token.
BraveAPIKey string
// Registry drives query generation for every source via BuildQueries.
Registry *providers.Registry
// Limiters is the shared per-source rate-limiter registry.
Limiters *recon.LimiterRegistry
}
// RegisterAll registers every Phase 10 code-hosting source on engine.
// RegisterAll registers every Phase 10 code-hosting and Phase 11 search
// engine / paste site source on engine (18 sources total).
//
// All ten sources are registered unconditionally so that cmd/recon.go can
// surface the full catalog via `keyhunter recon list` regardless of which
// credentials are configured. Sources without required credentials return
// Enabled()==false so SweepAll skips them without erroring.
// All sources are registered unconditionally so that cmd/recon.go can surface
// the full catalog via `keyhunter recon list` regardless of which credentials
// are configured. Sources without required credentials return Enabled()==false
// so SweepAll skips them without erroring.
//
// A nil engine is treated as a no-op (not an error) callers in broken init
// A nil engine is treated as a no-op (not an error) -- callers in broken init
// paths shouldn't panic.
func RegisterAll(engine *recon.Engine, cfg SourcesConfig) {
if engine == nil {
@@ -95,4 +107,46 @@ func RegisterAll(engine *recon.Engine, cfg SourcesConfig) {
Registry: reg,
Limiters: lim,
})
// Phase 11: Search engine dorking sources.
engine.Register(&GoogleDorkSource{
APIKey: cfg.GoogleAPIKey,
CX: cfg.GoogleCX,
Registry: reg,
Limiters: lim,
})
engine.Register(&BingDorkSource{
APIKey: cfg.BingAPIKey,
Registry: reg,
Limiters: lim,
})
engine.Register(&DuckDuckGoSource{
Registry: reg,
Limiters: lim,
})
engine.Register(&YandexSource{
User: cfg.YandexUser,
APIKey: cfg.YandexAPIKey,
Registry: reg,
Limiters: lim,
})
engine.Register(&BraveSource{
APIKey: cfg.BraveAPIKey,
Registry: reg,
Limiters: lim,
})
// Phase 11: Paste site sources.
engine.Register(&PastebinSource{
Registry: reg,
Limiters: lim,
})
engine.Register(&GistPasteSource{
Registry: reg,
Limiters: lim,
})
engine.Register(&PasteSitesSource{
Registry: reg,
Limiters: lim,
})
}

View File

@@ -16,9 +16,9 @@ func registerTestRegistry() *providers.Registry {
})
}
// TestRegisterAll_WiresAllTenSources asserts that RegisterAll registers every
// Phase 10 code-hosting source by its stable name on a fresh engine.
func TestRegisterAll_WiresAllTenSources(t *testing.T) {
// TestRegisterAll_WiresAllEighteenSources asserts that RegisterAll registers
// every Phase 10 + Phase 11 source by its stable name on a fresh engine.
func TestRegisterAll_WiresAllEighteenSources(t *testing.T) {
eng := recon.NewEngine()
cfg := SourcesConfig{
Registry: registerTestRegistry(),
@@ -28,16 +28,24 @@ func TestRegisterAll_WiresAllTenSources(t *testing.T) {
got := eng.List()
want := []string{
"bing",
"bitbucket",
"brave",
"codeberg",
"codesandbox",
"duckduckgo",
"gist",
"gistpaste",
"github",
"gitlab",
"google",
"huggingface",
"kaggle",
"pastebin",
"pastesites",
"replit",
"sandboxes",
"yandex",
}
if !reflect.DeepEqual(got, want) {
t.Fatalf("RegisterAll names mismatch\n got: %v\nwant: %v", got, want)
@@ -55,8 +63,8 @@ func TestRegisterAll_MissingCredsStillRegistered(t *testing.T) {
Limiters: recon.NewLimiterRegistry(),
})
if n := len(eng.List()); n != 10 {
t.Fatalf("expected 10 sources registered, got %d: %v", n, eng.List())
if n := len(eng.List()); n != 18 {
t.Fatalf("expected 18 sources registered, got %d: %v", n, eng.List())
}
// SweepAll with an empty config should filter out cred-gated sources

177
pkg/recon/sources/yandex.go Normal file
View File

@@ -0,0 +1,177 @@
package sources
import (
"context"
"encoding/xml"
"errors"
"fmt"
"net/http"
"net/url"
"strings"
"time"
"golang.org/x/time/rate"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
// YandexSource implements recon.ReconSource against the Yandex XML Search API.
// It requires both a User and APIKey to be enabled.
type YandexSource struct {
User string
APIKey string
BaseURL string
Registry *providers.Registry
Limiters *recon.LimiterRegistry
client *Client
}
// Compile-time assertion.
var _ recon.ReconSource = (*YandexSource)(nil)
// NewYandexSource constructs a YandexSource with the shared retry client.
func NewYandexSource(user, apiKey string, reg *providers.Registry, lim *recon.LimiterRegistry) *YandexSource {
return &YandexSource{
User: user,
APIKey: apiKey,
BaseURL: "https://yandex.com",
Registry: reg,
Limiters: lim,
client: NewClient(),
}
}
func (s *YandexSource) Name() string { return "yandex" }
func (s *YandexSource) RateLimit() rate.Limit { return rate.Every(1 * time.Second) }
func (s *YandexSource) Burst() int { return 1 }
func (s *YandexSource) RespectsRobots() bool { return false }
// Enabled returns true only when both User and APIKey are configured.
func (s *YandexSource) Enabled(_ recon.Config) bool {
return s.User != "" && s.APIKey != ""
}
// Sweep issues one Yandex XML search request per provider keyword and emits a
// Finding for every <url> element in the response.
func (s *YandexSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
if s.User == "" || s.APIKey == "" {
return nil
}
base := s.BaseURL
if base == "" {
base = "https://yandex.com"
}
queries := BuildQueries(s.Registry, "yandex")
kwIndex := yandexKeywordIndex(s.Registry)
for _, q := range queries {
if err := ctx.Err(); err != nil {
return err
}
if s.Limiters != nil {
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
return err
}
}
endpoint := fmt.Sprintf("%s/search/xml?user=%s&key=%s&query=%s&l10n=en&sortby=rlv&filter=none&groupby=%s",
base,
url.QueryEscape(s.User),
url.QueryEscape(s.APIKey),
url.QueryEscape(q),
url.QueryEscape(`attr="".mode=flat.groups-on-page=50`))
req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
if err != nil {
return fmt.Errorf("yandex: build request: %w", err)
}
req.Header.Set("User-Agent", "keyhunter-recon")
resp, err := s.client.Do(ctx, req)
if err != nil {
if errors.Is(err, ErrUnauthorized) {
return err
}
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
return err
}
continue
}
var parsed yandexSearchResponse
decErr := xml.NewDecoder(resp.Body).Decode(&parsed)
_ = resp.Body.Close()
if decErr != nil {
continue
}
provName := kwIndex[strings.ToLower(extractGoogleKeyword(q))]
for _, grp := range parsed.Response.Results.Grouping.Groups {
for _, doc := range grp.Docs {
if doc.URL == "" {
continue
}
f := recon.Finding{
ProviderName: provName,
Confidence: "low",
Source: doc.URL,
SourceType: "recon:yandex",
DetectedAt: time.Now(),
}
select {
case out <- f:
case <-ctx.Done():
return ctx.Err()
}
}
}
}
return nil
}
// XML response structures for Yandex XML Search API.
type yandexSearchResponse struct {
XMLName xml.Name `xml:"yandexsearch"`
Response yandexResponse `xml:"response"`
}
type yandexResponse struct {
Results yandexResults `xml:"results"`
}
type yandexResults struct {
Grouping yandexGrouping `xml:"grouping"`
}
type yandexGrouping struct {
Groups []yandexGroup `xml:"group"`
}
type yandexGroup struct {
Docs []yandexDoc `xml:"doc"`
}
type yandexDoc struct {
URL string `xml:"url"`
}
// yandexKeywordIndex maps lowercased keywords to provider names.
func yandexKeywordIndex(reg *providers.Registry) map[string]string {
m := make(map[string]string)
if reg == nil {
return m
}
for _, p := range reg.List() {
for _, k := range p.Keywords {
kl := strings.ToLower(strings.TrimSpace(k))
if kl == "" {
continue
}
if _, exists := m[kl]; !exists {
m[kl] = p.Name
}
}
}
return m
}

View File

@@ -0,0 +1,171 @@
package sources
import (
"context"
"errors"
"net/http"
"net/http/httptest"
"strings"
"sync/atomic"
"testing"
"time"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
const yandexXMLFixture = `<?xml version="1.0" encoding="utf-8"?>
<yandexsearch>
<response>
<results>
<grouping>
<group>
<doc>
<url>https://pastebin.com/yandex1</url>
</doc>
</group>
<group>
<doc>
<url>https://github.com/user/repo/blob/main/secrets.env</url>
</doc>
<doc>
<url>https://example.com/leaked</url>
</doc>
</group>
</grouping>
</results>
</response>
</yandexsearch>`
func yandexStubHandler(t *testing.T, calls *int32) http.HandlerFunc {
t.Helper()
return func(w http.ResponseWriter, r *http.Request) {
atomic.AddInt32(calls, 1)
if !strings.HasPrefix(r.URL.Path, "/search/xml") {
t.Errorf("unexpected path: %s", r.URL.Path)
}
if r.URL.Query().Get("user") != "testuser" {
t.Errorf("missing user param")
}
if r.URL.Query().Get("key") != "testkey" {
t.Errorf("missing key param")
}
w.Header().Set("Content-Type", "application/xml")
_, _ = w.Write([]byte(yandexXMLFixture))
}
}
func TestYandexSource_EnabledRequiresBoth(t *testing.T) {
reg := syntheticRegistry()
lim := recon.NewLimiterRegistry()
tests := []struct {
user, key string
want bool
}{
{"", "", false},
{"user", "", false},
{"", "key", false},
{"user", "key", true},
}
for _, tt := range tests {
s := NewYandexSource(tt.user, tt.key, reg, lim)
if got := s.Enabled(recon.Config{}); got != tt.want {
t.Errorf("Enabled(user=%q, key=%q) = %v, want %v", tt.user, tt.key, got, tt.want)
}
}
}
func TestYandexSource_SweepEmptyCredsReturnsNil(t *testing.T) {
reg := syntheticRegistry()
lim := recon.NewLimiterRegistry()
s := NewYandexSource("", "", reg, lim)
out := make(chan recon.Finding, 10)
if err := s.Sweep(context.Background(), "", out); err != nil {
t.Fatalf("expected nil, got %v", err)
}
close(out)
if n := countFindings(out); n != 0 {
t.Fatalf("expected 0 findings, got %d", n)
}
}
func TestYandexSource_SweepEmitsFindings(t *testing.T) {
reg := syntheticRegistry()
lim := recon.NewLimiterRegistry()
_ = lim.For("yandex", 1000, 100)
var calls int32
srv := httptest.NewServer(yandexStubHandler(t, &calls))
defer srv.Close()
s := NewYandexSource("testuser", "testkey", reg, lim)
s.BaseURL = srv.URL
out := make(chan recon.Finding, 32)
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
done := make(chan error, 1)
go func() { done <- s.Sweep(ctx, "", out); close(out) }()
var findings []recon.Finding
for f := range out {
findings = append(findings, f)
}
if err := <-done; err != nil {
t.Fatalf("Sweep error: %v", err)
}
// 2 keywords * 3 URLs in XML = 6 findings
if len(findings) != 6 {
t.Fatalf("expected 6 findings, got %d", len(findings))
}
for _, f := range findings {
if f.SourceType != "recon:yandex" {
t.Errorf("SourceType=%q want recon:yandex", f.SourceType)
}
}
if got := atomic.LoadInt32(&calls); got != 2 {
t.Errorf("expected 2 calls, got %d", got)
}
}
func TestYandexSource_CtxCancelled(t *testing.T) {
reg := syntheticRegistry()
lim := recon.NewLimiterRegistry()
_ = lim.For("yandex", 1000, 100)
s := NewYandexSource("user", "key", reg, lim)
s.BaseURL = "http://127.0.0.1:1"
ctx, cancel := context.WithCancel(context.Background())
cancel()
out := make(chan recon.Finding, 1)
err := s.Sweep(ctx, "", out)
if !errors.Is(err, context.Canceled) {
t.Fatalf("expected context.Canceled, got %v", err)
}
}
func TestYandexSource_Unauthorized(t *testing.T) {
reg := syntheticRegistry()
lim := recon.NewLimiterRegistry()
_ = lim.For("yandex", 1000, 100)
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusUnauthorized)
_, _ = w.Write([]byte("bad creds"))
}))
defer srv.Close()
s := NewYandexSource("user", "key", reg, lim)
s.BaseURL = srv.URL
out := make(chan recon.Finding, 1)
err := s.Sweep(context.Background(), "", out)
if !errors.Is(err, ErrUnauthorized) {
t.Fatalf("expected ErrUnauthorized, got %v", err)
}
}