Compare commits
22 Commits
12c402ab67
...
1acbedc03a
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1acbedc03a | ||
|
|
e00fb172ab | ||
|
|
8528108613 | ||
|
|
fb3e57382e | ||
|
|
4628ccfe90 | ||
|
|
a034eeb14c | ||
|
|
a0b8f99a7f | ||
|
|
430ace9a9a | ||
|
|
91becd961f | ||
|
|
6928ca4e70 | ||
|
|
21d5551aa4 | ||
|
|
3d3c57fff2 | ||
|
|
4fafc01052 | ||
|
|
0e16e8ea4c | ||
|
|
223c23e672 | ||
|
|
cae714b488 | ||
|
|
792ac8d54b | ||
|
|
0137dc57b1 | ||
|
|
39001f208c | ||
|
|
45f8782464 | ||
|
|
d279abf449 | ||
|
|
243b7405cd |
@@ -21,7 +21,7 @@ Decimal phases appear between their surrounding integers in numeric order.
|
||||
- [ ] **Phase 7: Import Adapters & CI/CD Integration** - TruffleHog/Gitleaks import + pre-commit hooks + SARIF to GitHub Security
|
||||
- [ ] **Phase 8: Dork Engine** - YAML-based dork definitions with 150+ built-in dorks and management commands
|
||||
- [ ] **Phase 9: OSINT Infrastructure** - Per-source rate limiter architecture and recon engine framework before any sources
|
||||
- [ ] **Phase 10: OSINT Code Hosting** - GitHub, GitLab, Bitbucket, HuggingFace and 6 more code hosting sources
|
||||
- [x] **Phase 10: OSINT Code Hosting** - GitHub, GitLab, Bitbucket, HuggingFace and 6 more code hosting sources (completed 2026-04-05)
|
||||
- [ ] **Phase 11: OSINT Search & Paste** - Search engine dorking and paste site aggregation
|
||||
- [ ] **Phase 12: OSINT IoT & Cloud Storage** - Shodan/Censys/ZoomEye/FOFA and S3/GCS/Azure cloud storage scanning
|
||||
- [ ] **Phase 13: OSINT Package Registries & Container/IaC** - npm/PyPI/crates.io and Docker Hub/K8s/Terraform scanning
|
||||
@@ -219,13 +219,13 @@ Plans:
|
||||
Plans:
|
||||
- [x] 10-01-PLAN.md — Shared HTTP client + provider-query generator + RegisterAll skeleton
|
||||
- [x] 10-02-PLAN.md — GitHubSource (RECON-CODE-01)
|
||||
- [ ] 10-03-PLAN.md — GitLabSource (RECON-CODE-02)
|
||||
- [ ] 10-04-PLAN.md — BitbucketSource + GistSource (RECON-CODE-03, RECON-CODE-04)
|
||||
- [ ] 10-05-PLAN.md — CodebergSource/Gitea (RECON-CODE-05)
|
||||
- [ ] 10-06-PLAN.md — HuggingFaceSource (RECON-CODE-08)
|
||||
- [x] 10-03-PLAN.md — GitLabSource (RECON-CODE-02)
|
||||
- [x] 10-04-PLAN.md — BitbucketSource + GistSource (RECON-CODE-03, RECON-CODE-04)
|
||||
- [x] 10-05-PLAN.md — CodebergSource/Gitea (RECON-CODE-05)
|
||||
- [x] 10-06-PLAN.md — HuggingFaceSource (RECON-CODE-08)
|
||||
- [x] 10-07-PLAN.md — Replit + CodeSandbox + Sandboxes scrapers (RECON-CODE-06, RECON-CODE-07, RECON-CODE-10)
|
||||
- [ ] 10-08-PLAN.md — KaggleSource (RECON-CODE-09)
|
||||
- [ ] 10-09-PLAN.md — RegisterAll wiring + CLI integration + end-to-end test
|
||||
- [x] 10-08-PLAN.md — KaggleSource (RECON-CODE-09)
|
||||
- [x] 10-09-PLAN.md — RegisterAll wiring + CLI integration + end-to-end test
|
||||
|
||||
### Phase 11: OSINT Search & Paste
|
||||
**Goal**: Users can run automated search engine dorking against Google, Bing, DuckDuckGo, Yandex, and Brave, and scan 15+ aggregated paste sites for leaked API keys
|
||||
@@ -336,7 +336,7 @@ Phases execute in numeric order: 1 → 2 → 3 → ... → 18
|
||||
| 7. Import Adapters & CI/CD Integration | 0/? | Not started | - |
|
||||
| 8. Dork Engine | 0/? | Not started | - |
|
||||
| 9. OSINT Infrastructure | 2/6 | In Progress | |
|
||||
| 10. OSINT Code Hosting | 3/9 | In Progress | |
|
||||
| 10. OSINT Code Hosting | 9/9 | Complete | 2026-04-05 |
|
||||
| 11. OSINT Search & Paste | 0/? | Not started | - |
|
||||
| 12. OSINT IoT & Cloud Storage | 0/? | Not started | - |
|
||||
| 13. OSINT Package Registries & Container/IaC | 0/? | Not started | - |
|
||||
|
||||
@@ -3,14 +3,14 @@ gsd_state_version: 1.0
|
||||
milestone: v1.0
|
||||
milestone_name: milestone
|
||||
status: executing
|
||||
stopped_at: Completed 10-07-PLAN.md
|
||||
last_updated: "2026-04-05T22:19:41.729Z"
|
||||
stopped_at: Completed 10-09-PLAN.md
|
||||
last_updated: "2026-04-05T22:28:27.416Z"
|
||||
last_activity: 2026-04-05
|
||||
progress:
|
||||
total_phases: 18
|
||||
completed_phases: 9
|
||||
completed_phases: 10
|
||||
total_plans: 62
|
||||
completed_plans: 57
|
||||
completed_plans: 63
|
||||
percent: 20
|
||||
---
|
||||
|
||||
@@ -26,7 +26,7 @@ See: .planning/PROJECT.md (updated 2026-04-04)
|
||||
## Current Position
|
||||
|
||||
Phase: 10 (osint-code-hosting) — EXECUTING
|
||||
Plan: 3 of 9
|
||||
Plan: 4 of 9
|
||||
Status: Ready to execute
|
||||
Last activity: 2026-04-05
|
||||
|
||||
@@ -88,6 +88,7 @@ Progress: [██░░░░░░░░] 20%
|
||||
| Phase 10-osint-code-hosting P01 | 4m | 2 tasks | 7 files |
|
||||
| Phase 10-osint-code-hosting P02 | 5min | 1 task | 2 files |
|
||||
| Phase 10-osint-code-hosting P07 | 6min | 2 tasks | 6 files |
|
||||
| Phase 10 P09 | 12min | 2 tasks | 5 files |
|
||||
|
||||
## Accumulated Context
|
||||
|
||||
@@ -124,6 +125,7 @@ Recent decisions affecting current work:
|
||||
- [Phase 10-osint-code-hosting]: Client handles retry only; rate limiting is caller's responsibility via LimiterRegistry
|
||||
- [Phase 10-osint-code-hosting]: github/gist use 'kw' in:file; all other sources use bare keyword
|
||||
- [Phase 10-osint-code-hosting]: GitHubSource reuses shared sources.Client + LimiterRegistry; builds queries from providers.Registry via BuildQueries; missing token disables (not errors)
|
||||
- [Phase 10]: RegisterAll registers all ten Phase 10 sources unconditionally; missing credentials flip Enabled()==false rather than hiding sources from the CLI catalog
|
||||
|
||||
### Pending Todos
|
||||
|
||||
@@ -138,6 +140,6 @@ None yet.
|
||||
|
||||
## Session Continuity
|
||||
|
||||
Last session: 2026-04-05T22:19:41.725Z
|
||||
Stopped at: Completed 10-07-PLAN.md
|
||||
Last session: 2026-04-05T22:28:27.412Z
|
||||
Stopped at: Completed 10-09-PLAN.md
|
||||
Resume file: None
|
||||
|
||||
90
.planning/phases/10-osint-code-hosting/10-03-SUMMARY.md
Normal file
90
.planning/phases/10-osint-code-hosting/10-03-SUMMARY.md
Normal file
@@ -0,0 +1,90 @@
|
||||
---
|
||||
phase: 10-osint-code-hosting
|
||||
plan: 03
|
||||
subsystem: recon/sources
|
||||
tags: [recon, osint, gitlab, wave-2]
|
||||
requires:
|
||||
- pkg/recon/sources.Client (Plan 10-01)
|
||||
- pkg/recon/sources.BuildQueries (Plan 10-01)
|
||||
- pkg/recon.LimiterRegistry (Phase 9)
|
||||
- pkg/providers.Registry
|
||||
provides:
|
||||
- pkg/recon/sources.GitLabSource
|
||||
affects:
|
||||
- pkg/recon/sources
|
||||
tech_stack_added: []
|
||||
patterns:
|
||||
- "Per-keyword BuildQueries loop driving search API calls"
|
||||
- "PRIVATE-TOKEN header auth with shared retry-aware Client"
|
||||
- "Disabled-when-empty-token semantics (Sweep returns nil with no requests)"
|
||||
- "Bare-keyword → provider-name lookup via local keyword index"
|
||||
key_files_created:
|
||||
- pkg/recon/sources/gitlab.go
|
||||
- pkg/recon/sources/gitlab_test.go
|
||||
key_files_modified: []
|
||||
decisions:
|
||||
- "Bare keyword BuildQueries output (gitlab case in formatQuery) — reverse lookup is a direct map[string]string access"
|
||||
- "gitlabKeywordIndex helper named with gitlab prefix to avoid collision with peer github.go keywordIndex during parallel wave"
|
||||
- "Finding.Source uses constructed /projects/<id>/-/blob/<ref>/<path> URL (per plan) rather than extra /api/v4/projects/<id> lookup to keep request budget tight"
|
||||
- "Confidence=low across all recon findings; Phase 5 verify promotes to high"
|
||||
metrics:
|
||||
duration: ~8 minutes
|
||||
completed_date: 2026-04-05
|
||||
tasks_completed: 1
|
||||
tests_added: 6
|
||||
---
|
||||
|
||||
# Phase 10 Plan 03: GitLabSource Summary
|
||||
|
||||
GitLabSource is a thin recon.ReconSource that queries GitLab's `/api/v4/search?scope=blobs` endpoint with a PRIVATE-TOKEN header, iterating one search call per provider keyword from the shared BuildQueries helper and emitting a Finding per returned blob with Source pointing at a constructed `projects/<id>/-/blob/<ref>/<path>` URL.
|
||||
|
||||
## What Was Built
|
||||
|
||||
`pkg/recon/sources/gitlab.go` contains:
|
||||
|
||||
- `GitLabSource` struct exposing Token, BaseURL, Registry, Limiters (lazy Client)
|
||||
- ReconSource interface methods: `Name()="gitlab"`, `RateLimit()=rate.Every(30ms)`, `Burst()=5`, `RespectsRobots()=false`, `Enabled()` (token non-empty), `Sweep()`
|
||||
- `glBlob` response DTO matching GitLab's documented blob search schema
|
||||
- `gitlabKeywordIndex()` local helper (prefixed to avoid colliding with peer plan helpers during parallel wave execution)
|
||||
- Compile-time `var _ recon.ReconSource = (*GitLabSource)(nil)` assertion
|
||||
|
||||
`pkg/recon/sources/gitlab_test.go` covers all behaviors the plan called out:
|
||||
|
||||
| Test | Verifies |
|
||||
| --- | --- |
|
||||
| `TestGitLabSource_EnabledFalseWhenTokenEmpty` | Enabled gating + Name/RespectsRobots accessors |
|
||||
| `TestGitLabSource_EmptyToken_NoCallsNoError` | No HTTP request issued when Token=="" |
|
||||
| `TestGitLabSource_Sweep_EmitsFindings` | PRIVATE-TOKEN header, `scope=blobs`, two queries × two blobs = 4 Findings, Source URLs contain project_id/ref/path |
|
||||
| `TestGitLabSource_Unauthorized` | 401 propagates as `errors.Is(err, ErrUnauthorized)` |
|
||||
| `TestGitLabSource_CtxCancellation` | Sweep returns promptly on ctx timeout against a hanging server |
|
||||
| `TestGitLabSource_InterfaceAssertion` | Static recon.ReconSource conformance |
|
||||
|
||||
## Verification
|
||||
|
||||
```
|
||||
go build ./... # clean
|
||||
go test ./pkg/recon/sources/ -run TestGitLab -v # 6/6 PASS
|
||||
go test ./pkg/recon/sources/ # full package PASS (3.164s)
|
||||
```
|
||||
|
||||
## Deviations from Plan
|
||||
|
||||
None for must-have behavior. Two minor adjustments:
|
||||
|
||||
1. `keywordIndex` helper renamed to `gitlabKeywordIndex` because `pkg/recon/sources/github.go` (Plan 10-02, wave-2 sibling) introduces an identically-named package-level symbol. Prefixing prevents a redeclared-identifier build failure when the parallel wave merges.
|
||||
2. Provider name lookup simplified to direct `map[string]string` access on the bare keyword because `formatQuery("gitlab", k)` returns the keyword verbatim (no wrapping syntax), avoiding a second `extractKeyword`-style helper.
|
||||
|
||||
## Deferred Issues
|
||||
|
||||
None.
|
||||
|
||||
## Known Stubs
|
||||
|
||||
None.
|
||||
|
||||
## Self-Check: PASSED
|
||||
|
||||
- pkg/recon/sources/gitlab.go — FOUND
|
||||
- pkg/recon/sources/gitlab_test.go — FOUND
|
||||
- .planning/phases/10-osint-code-hosting/10-03-SUMMARY.md — FOUND
|
||||
- commit 0137dc5 — FOUND
|
||||
117
.planning/phases/10-osint-code-hosting/10-04-SUMMARY.md
Normal file
117
.planning/phases/10-osint-code-hosting/10-04-SUMMARY.md
Normal file
@@ -0,0 +1,117 @@
|
||||
---
|
||||
phase: 10-osint-code-hosting
|
||||
plan: 04
|
||||
subsystem: recon/sources
|
||||
tags: [recon, osint, bitbucket, gist, wave-2]
|
||||
requires:
|
||||
- pkg/recon/sources.Client (Plan 10-01)
|
||||
- pkg/recon/sources.BuildQueries (Plan 10-01)
|
||||
- pkg/recon.LimiterRegistry (Phase 9)
|
||||
- pkg/providers.Registry
|
||||
provides:
|
||||
- pkg/recon/sources.BitbucketSource (RECON-CODE-03)
|
||||
- pkg/recon/sources.GistSource (RECON-CODE-04)
|
||||
affects:
|
||||
- pkg/recon/sources (two new source implementations)
|
||||
tech_stack_added: []
|
||||
patterns:
|
||||
- "Token+workspace gating (Bitbucket requires both to enable)"
|
||||
- "Content-scan fallback when API has no dedicated search (Gist)"
|
||||
- "One Finding per gist (not per file) to avoid duplicate leak reports"
|
||||
- "256KB read cap on raw content fetches"
|
||||
key_files_created:
|
||||
- pkg/recon/sources/bitbucket.go
|
||||
- pkg/recon/sources/bitbucket_test.go
|
||||
- pkg/recon/sources/gist.go
|
||||
- pkg/recon/sources/gist_test.go
|
||||
key_files_modified: []
|
||||
decisions:
|
||||
- "BitbucketSource disables cleanly when either token OR workspace is empty (no error)"
|
||||
- "GistSource enumerates /gists/public first page only; broader sweeps deferred"
|
||||
- "GistSource emits one Finding per matching gist, not per file (prevents fan-out of a single leak)"
|
||||
- "providerForQuery resolves keyword→provider name for Bitbucket Findings (API doesn't echo keyword)"
|
||||
- "Bitbucket rate: rate.Every(3.6s) burst 1; Gist rate: rate.Every(2s) burst 1"
|
||||
metrics:
|
||||
duration_minutes: 6
|
||||
tasks_completed: 2
|
||||
tests_added: 9
|
||||
completed_at: "2026-04-05T22:30:00Z"
|
||||
requirements: [RECON-CODE-03, RECON-CODE-04]
|
||||
---
|
||||
|
||||
# Phase 10 Plan 04: Bitbucket + Gist Sources Summary
|
||||
|
||||
One-liner: BitbucketSource hits the Cloud 2.0 code search API with workspace+token gating, and GistSource fans out over /gists/public fetching each file's raw content to match provider keywords, emitting one Finding per matching gist.
|
||||
|
||||
## What Was Built
|
||||
|
||||
### BitbucketSource (RECON-CODE-03)
|
||||
- `pkg/recon/sources/bitbucket.go` — implements `recon.ReconSource`.
|
||||
- Endpoint: `GET {base}/2.0/workspaces/{workspace}/search/code?search_query={kw}`.
|
||||
- Auth: `Authorization: Bearer <token>`.
|
||||
- Disabled when either `Token` or `Workspace` is empty (clean no-op, no error).
|
||||
- Rate: `rate.Every(3600ms)` burst 1 (Bitbucket 1000/hr API limit).
|
||||
- Iterates `BuildQueries(registry, "bitbucket")` — one request per provider keyword.
|
||||
- Decodes `{values:[{file:{path,commit{hash}},page_url}]}` and emits one Finding per entry.
|
||||
- `SourceType = "recon:bitbucket"`, `Source = page_url` (falls back to synthetic `bitbucket:{ws}/{path}@{hash}` when page_url missing).
|
||||
|
||||
### GistSource (RECON-CODE-04)
|
||||
- `pkg/recon/sources/gist.go` — implements `recon.ReconSource`.
|
||||
- Endpoint: `GET {base}/gists/public?per_page=100`.
|
||||
- Per gist, per file: fetches `raw_url` (also with Bearer auth) and scans content against the provider keyword set (flattened `keyword → providerName` map).
|
||||
- 256KB read cap per raw file to avoid pathological payloads.
|
||||
- Emits **one Finding per matching gist** (breaks on first keyword match across that gist's files) — prevents a multi-file leak from producing N duplicate Findings.
|
||||
- `ProviderName` set from the matched keyword; `Source = gist.html_url`; `SourceType = "recon:gist"`.
|
||||
- Rate: `rate.Every(2s)` burst 1 (30 req/min). Limiter waited before **every** outbound request (list + each raw fetch) so GitHub's shared budget is respected.
|
||||
- Disabled when token is empty.
|
||||
|
||||
## How It Fits
|
||||
- Depends on Plan 10-01 foundation: `sources.Client` (retry + 401→ErrUnauthorized), `BuildQueries`, `recon.LimiterRegistry`.
|
||||
- Does **not** modify `register.go` — Plan 10-09 wires all Wave 2 sources into `RegisterAll` after every plan lands.
|
||||
- Finding shape matches `engine.Finding` so downstream dedup/verify/storage paths in Phases 9/5/4 consume them without changes.
|
||||
|
||||
## Tests
|
||||
|
||||
`go test ./pkg/recon/sources/ -run "TestBitbucket|TestGist" -v`
|
||||
|
||||
### Bitbucket (4 tests)
|
||||
- `TestBitbucket_EnabledRequiresTokenAndWorkspace` — all four gate combinations.
|
||||
- `TestBitbucket_SweepEmitsFindings` — httptest server, asserts `/2.0/workspaces/testws/search/code` path, Bearer header, non-empty `search_query`, Finding source/type.
|
||||
- `TestBitbucket_Unauthorized` — 401 → `errors.Is(err, ErrUnauthorized)`.
|
||||
- `TestBitbucket_ContextCancellation` — slow server + 50ms ctx deadline.
|
||||
|
||||
### Gist (5 tests)
|
||||
- `TestGist_EnabledRequiresToken` — empty vs set token.
|
||||
- `TestGist_SweepEmitsFindingsOnKeywordMatch` — two gists, only one raw body contains `sk-proj-`; asserts exactly 1 Finding, correct `html_url`, `ProviderName=openai`.
|
||||
- `TestGist_NoMatch_NoFinding` — gist with unrelated content produces zero Findings.
|
||||
- `TestGist_Unauthorized` — 401 → `ErrUnauthorized`.
|
||||
- `TestGist_ContextCancellation` — slow server + 50ms ctx deadline.
|
||||
|
||||
All 9 tests pass. `go build ./...` is clean.
|
||||
|
||||
## Deviations from Plan
|
||||
|
||||
None — plan executed exactly as written. No Rule 1/2/3 auto-fixes were required; all tests passed on first full run after writing implementations.
|
||||
|
||||
## Decisions Made
|
||||
|
||||
1. **Keyword→provider mapping on the Bitbucket side lives in `providerForQuery`** — Bitbucket's API doesn't echo the keyword in the response, so we parse the query back to a provider name. Simple substring match over registry keywords is sufficient at current scale.
|
||||
2. **GistSource emits one Finding per gist, not per file.** A single secret often lands in a `config.env` with supporting `README.md` and `docker-compose.yml` — treating the gist as the leak unit keeps noise down and matches how human reviewers triage.
|
||||
3. **Limiter waited before every raw fetch, not just the list call.** GitHub's 30/min budget is shared across API endpoints, so each raw content fetch consumes a token.
|
||||
4. **256KB cap on raw content reads.** Pathological gists (multi-MB logs, minified bundles) would otherwise block the sweep; 256KB is enough to surface a key that's typically near the top of a config file.
|
||||
|
||||
## Commits
|
||||
|
||||
- `d279abf` — feat(10-04): add BitbucketSource for code search recon
|
||||
- `0e16e8e` — feat(10-04): add GistSource for public gist keyword recon
|
||||
|
||||
## Self-Check: PASSED
|
||||
|
||||
- FOUND: pkg/recon/sources/bitbucket.go
|
||||
- FOUND: pkg/recon/sources/bitbucket_test.go
|
||||
- FOUND: pkg/recon/sources/gist.go
|
||||
- FOUND: pkg/recon/sources/gist_test.go
|
||||
- FOUND: commit d279abf
|
||||
- FOUND: commit 0e16e8e
|
||||
- Tests: 9/9 passing (`go test ./pkg/recon/sources/ -run "TestBitbucket|TestGist"`)
|
||||
- Build: `go build ./...` clean
|
||||
99
.planning/phases/10-osint-code-hosting/10-05-SUMMARY.md
Normal file
99
.planning/phases/10-osint-code-hosting/10-05-SUMMARY.md
Normal file
@@ -0,0 +1,99 @@
|
||||
---
|
||||
phase: 10-osint-code-hosting
|
||||
plan: 05
|
||||
subsystem: recon
|
||||
tags: [codeberg, gitea, osint, rest-api, httptest]
|
||||
|
||||
requires:
|
||||
- phase: 09-osint-infrastructure
|
||||
provides: ReconSource interface, LimiterRegistry, Engine
|
||||
- phase: 10-osint-code-hosting/01
|
||||
provides: shared sources.Client (retry/backoff), BuildQueries helper
|
||||
provides:
|
||||
- CodebergSource implementing recon.ReconSource against Gitea REST API
|
||||
- Reusable pattern for any Gitea-compatible instance via BaseURL override
|
||||
- Dual-mode rate limiting (unauth 60/hr, auth ~1000/hr)
|
||||
affects: [10-09 register-all, future Gitea-compatible sources, verification pipeline]
|
||||
|
||||
tech-stack:
|
||||
added: []
|
||||
patterns:
|
||||
- "Keyword → ProviderName index built at Sweep() entry to re-attribute BuildQueries output"
|
||||
- "BaseURL override enables generic Gitea targeting"
|
||||
- "httptest.Server with request-capturing handlers for header presence/absence assertions"
|
||||
|
||||
key-files:
|
||||
created:
|
||||
- pkg/recon/sources/codeberg.go
|
||||
- pkg/recon/sources/codeberg_test.go
|
||||
modified: []
|
||||
|
||||
key-decisions:
|
||||
- "Sweep ignores its query argument and iterates provider keywords, matching sibling code-hosting sources"
|
||||
- "Findings use Confidence=low since /repos/search matches repo metadata, not file contents — verification downstream separates real hits"
|
||||
- "Token is optional; Enabled() always returns true because public API works anonymously"
|
||||
- "DefaultCodebergBaseURL constant exported so Plan 10-09 can point at alternate Gitea hosts"
|
||||
|
||||
patterns-established:
|
||||
- "Dual-mode rate limiting: if Token == \"\" return unauth rate else auth rate"
|
||||
- "Per-source httptest suite covers: interface assertion, rate limits, decoding, header auth presence, header auth absence, ctx cancellation"
|
||||
|
||||
requirements-completed: [RECON-CODE-05]
|
||||
|
||||
duration: ~10min
|
||||
completed: 2026-04-05
|
||||
---
|
||||
|
||||
# Phase 10 Plan 05: CodebergSource Summary
|
||||
|
||||
**Gitea REST API source targeting Codeberg.org via /api/v1/repos/search with optional token auth and dual-mode rate limiting.**
|
||||
|
||||
## Performance
|
||||
|
||||
- **Duration:** ~10 min
|
||||
- **Started:** 2026-04-05T22:07:00Z
|
||||
- **Completed:** 2026-04-05T22:17:31Z
|
||||
- **Tasks:** 1 (TDD)
|
||||
- **Files modified:** 2 created
|
||||
|
||||
## Accomplishments
|
||||
- CodebergSource implements recon.ReconSource with compile-time assertion
|
||||
- Unauthenticated operation against /api/v1/repos/search (60/hour rate limit)
|
||||
- Optional token mode sends `Authorization: token <t>` and raises limit to ~1000/hour
|
||||
- Findings keyed to repo html_url with SourceType="recon:codeberg" and ProviderName resolved via keyword→provider index
|
||||
- Shared sources.Client handles retries/429s; no bespoke HTTP logic in the source
|
||||
- Six httptest-backed tests covering interface, both rate modes, sweep decoding, auth header presence/absence, and context cancellation
|
||||
|
||||
## Task Commits
|
||||
|
||||
1. **Task 1: CodebergSource + tests (TDD combined)** — `4fafc01` (feat)
|
||||
|
||||
## Files Created/Modified
|
||||
- `pkg/recon/sources/codeberg.go` — CodebergSource struct, rate mode selection, Sweep over /api/v1/repos/search
|
||||
- `pkg/recon/sources/codeberg_test.go` — httptest fixtures for all six behaviors
|
||||
|
||||
## Decisions Made
|
||||
- TDD RED+GREEN collapsed into a single commit because the file pair is small and was verified end-to-end in one iteration (all six tests pass on first green build).
|
||||
- `Confidence="low"` on emitted Findings: repo-metadata match is a weak signal until content verification runs.
|
||||
- `Sweep` ignores the `query` parameter; the plan specifies driving queries from the provider registry via `BuildQueries`, consistent with sibling code-hosting sources.
|
||||
|
||||
## Deviations from Plan
|
||||
None — plan executed exactly as written.
|
||||
|
||||
## Issues Encountered
|
||||
- **Worktree path confusion (environmental, not code):** Initial Write tool calls targeted the main repo path instead of the active worktree. Files silently failed to persist and `go test` surfaced unrelated pre-existing `github_test.go` references in the main repo. Recovered by writing into the worktree path `/home/salva/Documents/apikey/.claude/worktrees/agent-a2637f83/`. No code changes resulted from this; purely a path fix.
|
||||
|
||||
## Next Phase Readiness
|
||||
- Ready for Plan 10-09 (RegisterAll) to wire CodebergSource into `RegisterAll` with `cfg.CodebergToken` (field to be added when 10-09 finalizes SourcesConfig).
|
||||
- No blockers.
|
||||
|
||||
## Self-Check: PASSED
|
||||
- FOUND: pkg/recon/sources/codeberg.go
|
||||
- FOUND: pkg/recon/sources/codeberg_test.go
|
||||
- FOUND: commit 4fafc01
|
||||
- Tests: 6/6 passing (`go test ./pkg/recon/sources/ -run TestCodeberg -v`)
|
||||
- Package: `go vet` clean, full package tests green
|
||||
|
||||
---
|
||||
*Phase: 10-osint-code-hosting*
|
||||
*Completed: 2026-04-05*
|
||||
79
.planning/phases/10-osint-code-hosting/10-06-SUMMARY.md
Normal file
79
.planning/phases/10-osint-code-hosting/10-06-SUMMARY.md
Normal file
@@ -0,0 +1,79 @@
|
||||
---
|
||||
phase: 10-osint-code-hosting
|
||||
plan: 06
|
||||
subsystem: recon/sources
|
||||
tags: [recon, osint, huggingface, wave-2]
|
||||
requires:
|
||||
- pkg/recon/sources.Client (Plan 10-01)
|
||||
- pkg/recon/sources.BuildQueries (Plan 10-01)
|
||||
- pkg/recon.LimiterRegistry
|
||||
- pkg/providers.Registry
|
||||
provides:
|
||||
- pkg/recon/sources.HuggingFaceSource
|
||||
- pkg/recon/sources.HuggingFaceConfig
|
||||
- pkg/recon/sources.NewHuggingFaceSource
|
||||
affects:
|
||||
- pkg/recon/sources
|
||||
tech_stack_added: []
|
||||
patterns:
|
||||
- "Optional-token sources return Enabled=true and degrade RateLimit when credentials absent"
|
||||
- "Multi-endpoint sweep: iterate queries × endpoints, mapping each to a URL-prefix"
|
||||
- "Context cancellation checked between endpoint calls and when sending to out channel"
|
||||
key_files_created:
|
||||
- pkg/recon/sources/huggingface.go
|
||||
- pkg/recon/sources/huggingface_test.go
|
||||
key_files_modified: []
|
||||
decisions:
|
||||
- "Unauthenticated rate of rate.Every(10s) chosen conservatively vs the ~300/hour anonymous quota to avoid 429s"
|
||||
- "Tests pass Limiters=nil to keep wall-clock fast; rate-limit behavior covered separately by TestHuggingFaceRateLimitTokenMode"
|
||||
- "Finding.Source uses the canonical public URL (not the API URL) so downstream deduplication matches human-visible links"
|
||||
metrics:
|
||||
duration: "~8 minutes"
|
||||
completed: "2026-04-05"
|
||||
tasks: 1
|
||||
files: 2
|
||||
---
|
||||
|
||||
# Phase 10 Plan 06: HuggingFaceSource Summary
|
||||
|
||||
Implements `HuggingFaceSource` against the Hugging Face Hub API, sweeping both `/api/spaces` and `/api/models` for every provider keyword and emitting recon Findings with canonical huggingface.co URLs.
|
||||
|
||||
## What Changed
|
||||
|
||||
- New `HuggingFaceSource` implementing `recon.ReconSource` with optional `Token`.
|
||||
- Per-endpoint sweep loop: for each keyword from `BuildQueries(registry, "huggingface")`, hit `/api/spaces?search=...&limit=50` then `/api/models?search=...&limit=50`.
|
||||
- URL normalization: space results mapped to `https://huggingface.co/spaces/{id}`, model results to `https://huggingface.co/{id}`.
|
||||
- Rate limit is token-aware: `rate.Every(3600ms)` when authenticated (matches 1000/hour), `rate.Every(10s)` otherwise.
|
||||
- Authorization header only set when `Token != ""`.
|
||||
- Compile-time assertion `var _ recon.ReconSource = (*HuggingFaceSource)(nil)`.
|
||||
|
||||
## Test Coverage
|
||||
|
||||
All five TDD assertions in `huggingface_test.go` pass:
|
||||
|
||||
1. `TestHuggingFaceEnabledAlwaysTrue` — enabled with and without token.
|
||||
2. `TestHuggingFaceSweepHitsBothEndpoints` — exact Finding count (2 keywords × 2 endpoints = 4), both URL prefixes observed, `SourceType="recon:huggingface"`.
|
||||
3. `TestHuggingFaceAuthorizationHeader` — `Bearer hf_secret` sent when token set, header absent when empty.
|
||||
4. `TestHuggingFaceContextCancellation` — slow server + 100ms context returns error promptly.
|
||||
5. `TestHuggingFaceRateLimitTokenMode` — authenticated rate is strictly faster than unauthenticated rate.
|
||||
|
||||
Plus httptest server shared by auth + endpoint tests (`hfTestServer`).
|
||||
|
||||
## Deviations from Plan
|
||||
|
||||
None for must-have behavior — the plan executed as written, with one minor test refinement: tests pass `Limiters: nil` instead of constructing a real `LimiterRegistry`, because the production RateLimit of `rate.Every(3600ms)` with burst 1 would make four serialized waits exceed a reasonable test budget. The limiter code path is still exercised in production and the rate-mode contract is covered by `TestHuggingFaceRateLimitTokenMode`.
|
||||
|
||||
## Commits
|
||||
|
||||
- `45f8782` test(10-06): add failing tests for HuggingFaceSource
|
||||
- `39001f2` feat(10-06): implement HuggingFaceSource scanning Spaces and Models
|
||||
|
||||
## Self-Check: PASSED
|
||||
|
||||
- FOUND: pkg/recon/sources/huggingface.go
|
||||
- FOUND: pkg/recon/sources/huggingface_test.go
|
||||
- FOUND: commit 45f8782
|
||||
- FOUND: commit 39001f2
|
||||
- `go test ./pkg/recon/sources/ -run TestHuggingFace -v` — PASS (5/5)
|
||||
- `go build ./...` — PASS
|
||||
- `go test ./pkg/recon/...` — PASS
|
||||
117
.planning/phases/10-osint-code-hosting/10-08-SUMMARY.md
Normal file
117
.planning/phases/10-osint-code-hosting/10-08-SUMMARY.md
Normal file
@@ -0,0 +1,117 @@
|
||||
---
|
||||
phase: 10-osint-code-hosting
|
||||
plan: 08
|
||||
subsystem: recon
|
||||
tags: [kaggle, osint, http-basic-auth, httptest]
|
||||
|
||||
requires:
|
||||
- phase: 10-osint-code-hosting
|
||||
provides: "recon.ReconSource interface, sources.Client, BuildQueries, LimiterRegistry (Plan 10-01)"
|
||||
provides:
|
||||
- "KaggleSource implementing recon.ReconSource against Kaggle /api/v1/kernels/list"
|
||||
- "HTTP Basic auth wiring via req.SetBasicAuth(user, key)"
|
||||
- "Finding normalization to Source=<web>/code/<ref>, SourceType=recon:kaggle"
|
||||
affects: [10-09-register, 10-full-integration]
|
||||
|
||||
tech-stack:
|
||||
added: []
|
||||
patterns:
|
||||
- "Basic-auth recon source pattern (user + key) as counterpart to bearer-token sources"
|
||||
- "Credential-gated Sweep: return nil without HTTP when either credential missing"
|
||||
|
||||
key-files:
|
||||
created:
|
||||
- pkg/recon/sources/kaggle.go
|
||||
- pkg/recon/sources/kaggle_test.go
|
||||
modified: []
|
||||
|
||||
key-decisions:
|
||||
- "Short-circuit Sweep with nil error when User or Key is empty — no HTTP, no log spam"
|
||||
- "kaggleKernel decoder ignores non-ref fields so API additions don't break decode"
|
||||
- "Ignore decode errors and continue to next query (downgrade, not abort) — matches GitHubSource pattern"
|
||||
|
||||
patterns-established:
|
||||
- "Basic auth: req.SetBasicAuth(s.User, s.Key) after NewRequestWithContext"
|
||||
- "Web URL derivation from API ref: web + /code/ + ref"
|
||||
|
||||
requirements-completed: [RECON-CODE-09]
|
||||
|
||||
duration: 8min
|
||||
completed: 2026-04-05
|
||||
---
|
||||
|
||||
# Phase 10 Plan 08: KaggleSource Summary
|
||||
|
||||
**KaggleSource emits Findings from Kaggle public notebook search via HTTP Basic auth against /api/v1/kernels/list**
|
||||
|
||||
## Performance
|
||||
|
||||
- **Duration:** ~8 min
|
||||
- **Tasks:** 1 (TDD)
|
||||
- **Files created:** 2
|
||||
|
||||
## Accomplishments
|
||||
|
||||
- KaggleSource type implementing recon.ReconSource (Name, RateLimit, Burst, RespectsRobots, Enabled, Sweep)
|
||||
- Credentials-gated: both User AND Key required; missing either returns nil with zero HTTP calls
|
||||
- HTTP Basic auth wired via req.SetBasicAuth to Kaggle's /api/v1/kernels/list endpoint
|
||||
- Findings normalized with SourceType "recon:kaggle" and Source = WebBaseURL + "/code/" + ref
|
||||
- 60 req/min rate limit via rate.Every(1*time.Second), burst 1, honoring per-source LimiterRegistry
|
||||
- Compile-time interface assertion: `var _ recon.ReconSource = (*KaggleSource)(nil)`
|
||||
|
||||
## Task Commits
|
||||
|
||||
1. **Task 1: KaggleSource + tests (TDD)** — `243b740` (feat)
|
||||
|
||||
## Files Created
|
||||
|
||||
- `pkg/recon/sources/kaggle.go` — KaggleSource implementation, kaggleKernel decoder, interface assertion
|
||||
- `pkg/recon/sources/kaggle_test.go` — 6 httptest-driven tests
|
||||
|
||||
## Test Coverage
|
||||
|
||||
| Test | Covers |
|
||||
|------|--------|
|
||||
| TestKaggle_Enabled | All 4 credential combinations (empty/empty, user-only, key-only, both) |
|
||||
| TestKaggle_Sweep_BasicAuthAndFindings | Authorization header decoded as testuser:testkey, 2 refs → 2 Findings with correct Source URLs and recon:kaggle SourceType |
|
||||
| TestKaggle_Sweep_MissingCredentials_NoHTTP | Atomic counter verifies zero HTTP calls when either User or Key empty |
|
||||
| TestKaggle_Sweep_Unauthorized | 401 response wrapped as ErrUnauthorized |
|
||||
| TestKaggle_Sweep_CtxCancellation | Pre-cancelled ctx returns context.Canceled promptly |
|
||||
| TestKaggle_ReconSourceInterface | Compile + runtime assertions on Name, Burst, RespectsRobots, RateLimit |
|
||||
|
||||
All 6 tests pass in isolation: `go test ./pkg/recon/sources/ -run TestKaggle -v`
|
||||
|
||||
## Decisions Made
|
||||
|
||||
- **Missing-cred behavior:** Sweep returns nil (no error) when either credential absent. Matches GitHubSource pattern — disabled sources log-and-skip at the Engine level, not error out.
|
||||
- **Decode tolerance:** kaggleKernel struct only declares `Ref string`. Other fields (title, author, language) are silently discarded so upstream API changes don't break the source.
|
||||
- **Error downgrade:** Non-401 HTTP errors skip to next query rather than aborting the whole sweep. 401 is the only hard-fail case because it means credentials are actually invalid, not transient.
|
||||
- **Dual BaseURL fields:** BaseURL (API) and WebBaseURL (Finding URL stem) are separate struct fields so tests can point BaseURL at httptest.NewServer while WebBaseURL stays at the production kaggle.com domain for assertion stability.
|
||||
|
||||
## Deviations from Plan
|
||||
|
||||
None — plan executed exactly as written. All truths from frontmatter (`must_haves`) satisfied:
|
||||
- KaggleSource queries `/api/v1/kernels/list` with Basic auth → TestKaggle_Sweep_BasicAuthAndFindings
|
||||
- Disabled when either credential empty → TestKaggle_Enabled + TestKaggle_Sweep_MissingCredentials_NoHTTP
|
||||
- Findings tagged recon:kaggle with Source = web + /code/ + ref → TestKaggle_Sweep_BasicAuthAndFindings
|
||||
|
||||
## Issues Encountered
|
||||
|
||||
- **Sibling-wave file churn:** During testing, sibling Wave 2 plans (10-02 GitHub, 10-05 Replit, 10-07 CodeSandbox, 10-03 GitLab) had already dropped partial files into `pkg/recon/sources/` in the main repo. A stray `github_test.go` with no `github.go` broke package compilation. Resolved by running tests in this plan's git worktree where only kaggle.go and kaggle_test.go are present alongside the Plan 10-01 scaffolding. No cross-plan changes made — scope boundary respected. Final wave merge will resolve all sibling files together.
|
||||
|
||||
## Next Phase Readiness
|
||||
|
||||
- KaggleSource is ready for registration in Plan 10-09 (`RegisterAll` wiring).
|
||||
- No blockers for downstream plans. RECON-CODE-09 satisfied.
|
||||
|
||||
## Self-Check: PASSED
|
||||
|
||||
- File exists: `pkg/recon/sources/kaggle.go` — FOUND
|
||||
- File exists: `pkg/recon/sources/kaggle_test.go` — FOUND
|
||||
- Commit exists: `243b740` — FOUND (feat(10-08): add KaggleSource with HTTP Basic auth)
|
||||
- Tests pass: 6/6 TestKaggle_* (verified with sibling files stashed to isolate package build)
|
||||
|
||||
---
|
||||
*Phase: 10-osint-code-hosting*
|
||||
*Plan: 08*
|
||||
*Completed: 2026-04-05*
|
||||
100
.planning/phases/10-osint-code-hosting/10-09-SUMMARY.md
Normal file
100
.planning/phases/10-osint-code-hosting/10-09-SUMMARY.md
Normal file
@@ -0,0 +1,100 @@
|
||||
---
|
||||
phase: 10-osint-code-hosting
|
||||
plan: 09
|
||||
subsystem: recon
|
||||
tags: [register, integration, cmd, viper, httptest]
|
||||
|
||||
requires:
|
||||
- phase: 10-osint-code-hosting
|
||||
provides: "Ten code-hosting ReconSource implementations (Plans 10-01..10-08)"
|
||||
provides:
|
||||
- "sources.RegisterAll wires all ten Phase 10 sources onto a recon.Engine"
|
||||
- "cmd/recon.go constructs real SourcesConfig from env + viper and invokes RegisterAll"
|
||||
- "End-to-end SweepAll integration test exercising every source against one multiplexed httptest server"
|
||||
affects: [11-osint-pastebins, 12-osint-search-engines, cli-recon]
|
||||
|
||||
tech-stack:
|
||||
added: []
|
||||
patterns:
|
||||
- "Env-var → viper fallback (firstNonEmpty) for recon credential lookup"
|
||||
- "Unconditional source registration: credless sources register but Enabled()==false, uniform CLI surface"
|
||||
- "Single httptest.ServeMux routing per-path fixtures for multi-source integration tests"
|
||||
|
||||
key-files:
|
||||
created:
|
||||
- pkg/recon/sources/register_test.go
|
||||
- pkg/recon/sources/integration_test.go
|
||||
- .planning/phases/10-osint-code-hosting/deferred-items.md
|
||||
modified:
|
||||
- pkg/recon/sources/register.go
|
||||
- cmd/recon.go
|
||||
|
||||
key-decisions:
|
||||
- "Register all ten sources unconditionally so `keyhunter recon list` shows the full catalog regardless of configured credentials; missing creds just flip Enabled()==false"
|
||||
- "Integration test constructs sources directly with BaseURL overrides (not via RegisterAll) because RegisterAll wires production URLs"
|
||||
- "Credential precedence: env var → viper config key → empty (source disabled)"
|
||||
- "Single multiplexed httptest server used instead of ten separate servers — simpler and matches how recon.Engine fans out in parallel"
|
||||
- "firstNonEmpty helper kept local to cmd/recon.go rather than pkg-level to avoid exporting a trivial utility"
|
||||
|
||||
patterns-established:
|
||||
- "sources.RegisterAll(engine, cfg) is the single call cmd-layer code must make to wire Phase 10"
|
||||
- "Integration tests that need to drive many sources from one server encode the sub-source into the URL path (/search/code, /api/v4/search, etc.)"
|
||||
- "Struct literals for sources that lazy-init `client` in Sweep; NewXxxSource constructor for sources that don't (GitHubSource, KaggleSource, HuggingFaceSource)"
|
||||
|
||||
requirements-completed: [RECON-CODE-10]
|
||||
|
||||
duration: 12min
|
||||
completed: 2026-04-05
|
||||
---
|
||||
|
||||
# Phase 10 Plan 09: RegisterAll + cmd/recon + Integration Test Summary
|
||||
|
||||
**Ten Phase 10 code-hosting sources now wire onto recon.Engine via sources.RegisterAll, the CLI reads credentials from env+viper, and an end-to-end integration test drives every source through SweepAll against one multiplexed httptest server.**
|
||||
|
||||
## Performance
|
||||
|
||||
- **Duration:** ~12 min
|
||||
- **Tasks:** 2 (both TDD)
|
||||
- **Files created:** 3
|
||||
- **Files modified:** 2
|
||||
|
||||
## Accomplishments
|
||||
|
||||
- `sources.RegisterAll` wires all ten sources (github, gitlab, bitbucket, gist, codeberg, huggingface, replit, codesandbox, sandboxes, kaggle) onto a `*recon.Engine` in one call
|
||||
- Extended `SourcesConfig` with `BitbucketWorkspace` and `CodebergToken` fields to match Wave 2 constructor signatures
|
||||
- `cmd/recon.go` now loads providers.Registry, constructs a full `SourcesConfig` from env vars (`GITHUB_TOKEN`, `GITLAB_TOKEN`, `BITBUCKET_TOKEN`, `BITBUCKET_WORKSPACE`, `CODEBERG_TOKEN`, `HUGGINGFACE_TOKEN`, `KAGGLE_USERNAME`, `KAGGLE_KEY`) with viper fallback keys under `recon.<source>.*`, and calls `sources.RegisterAll`
|
||||
- `keyhunter recon list` now prints all eleven source names (`example` + ten Phase 10 sources)
|
||||
- Integration test (`integration_test.go::TestIntegration_AllSources_SweepAll`) spins up a single `httptest` server with per-path handlers for every source's API/HTML fixture, registers all ten sources (with BaseURL overrides) on a fresh `recon.Engine`, runs `SweepAll`, and asserts at least one `Finding` was emitted for each of the ten `recon:*` `SourceType` values
|
||||
- `register_test.go` covers RegisterAll contracts: exactly ten sources registered in deterministic sorted order, nil engine is a no-op, and empty credentials still produce a full registration list
|
||||
|
||||
## Verification
|
||||
|
||||
- `go test ./pkg/recon/sources/ -run TestRegisterAll -v` → 4 passing (nil, empty cfg, all-ten, missing-creds)
|
||||
- `go test ./pkg/recon/sources/ -run TestIntegration_AllSources_SweepAll -v` → passing; asserts 10/10 SourceType buckets populated
|
||||
- `go test ./pkg/recon/...` → all green (35s, includes pre-existing per-source suites)
|
||||
- `go vet ./...` → clean
|
||||
- `go build ./...` → clean
|
||||
- `go run . recon list` → prints `bitbucket codeberg codesandbox example gist github gitlab huggingface kaggle replit sandboxes`
|
||||
|
||||
## Deviations from Plan
|
||||
|
||||
None — plan executed as written. One out-of-scope finding was identified and logged to `deferred-items.md` (GitHubSource.Sweep dereferences `s.client` without a nil check; safe in current code paths because `RegisterAll` uses `NewGitHubSource` which initializes it, but a latent footgun for future struct-literal callers).
|
||||
|
||||
## Known Stubs
|
||||
|
||||
None. All ten sources are production-wired through `RegisterAll` and exercised by the integration test against realistic fixtures.
|
||||
|
||||
## Commits
|
||||
|
||||
- `4628ccf` test(10-09): add failing RegisterAll wiring tests
|
||||
- `fb3e573` feat(10-09): wire all ten Phase 10 sources in RegisterAll
|
||||
- `8528108` test(10-09): add end-to-end SweepAll integration test across all ten sources
|
||||
- `e00fb17` feat(10-09): wire sources.RegisterAll into cmd/recon with viper+env credential lookup
|
||||
|
||||
## Self-Check: PASSED
|
||||
|
||||
- pkg/recon/sources/register.go — FOUND
|
||||
- pkg/recon/sources/register_test.go — FOUND
|
||||
- pkg/recon/sources/integration_test.go — FOUND
|
||||
- cmd/recon.go — FOUND
|
||||
- commits 4628ccf, fb3e573, 8528108, e00fb17 — FOUND
|
||||
13
.planning/phases/10-osint-code-hosting/deferred-items.md
Normal file
13
.planning/phases/10-osint-code-hosting/deferred-items.md
Normal file
@@ -0,0 +1,13 @@
|
||||
# Phase 10 — Deferred Items
|
||||
|
||||
Out-of-scope findings discovered during plan execution. These are NOT fixed in
|
||||
the current plan but are tracked here for future work.
|
||||
|
||||
## 10-09
|
||||
|
||||
- **GitHubSource struct-literal panic risk.** `GitHubSource.Sweep` dereferences
|
||||
`s.client` without a nil check (pkg/recon/sources/github.go:106). `NewGitHubSource`
|
||||
initializes `client`, so `RegisterAll` is safe, but any future caller using a
|
||||
struct literal (as sibling sources do) will panic. Fix: add
|
||||
`if s.client == nil { s.client = NewClient() }` at the top of Sweep. Siblings
|
||||
(GitLab, Bitbucket, Gist, Codeberg, HuggingFace, Kaggle) already lazy-init.
|
||||
44
cmd/recon.go
44
cmd/recon.go
@@ -3,9 +3,13 @@ package cmd
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon/sources"
|
||||
"github.com/spf13/cobra"
|
||||
"github.com/spf13/viper"
|
||||
)
|
||||
|
||||
var (
|
||||
@@ -17,7 +21,7 @@ var (
|
||||
var reconCmd = &cobra.Command{
|
||||
Use: "recon",
|
||||
Short: "Run OSINT recon across internet sources",
|
||||
Long: "Run OSINT recon sweeps across registered sources. Phase 9 ships with an ExampleSource stub; real sources land in Phases 10-16.",
|
||||
Long: "Run OSINT recon sweeps across registered sources. Phase 10 adds ten code-hosting sources (GitHub/GitLab/Bitbucket/Gist/Codeberg/HuggingFace/Replit/CodeSandbox/Sandboxes/Kaggle). Further phases add pastebins, search engines, etc.",
|
||||
}
|
||||
|
||||
var reconFullCmd = &cobra.Command{
|
||||
@@ -56,13 +60,45 @@ var reconListCmd = &cobra.Command{
|
||||
},
|
||||
}
|
||||
|
||||
// buildReconEngine constructs the recon Engine with all sources registered.
|
||||
// Phase 9 ships ExampleSource only; Phases 10-16 will add real sources here
|
||||
// (or via a registration side-effect in their packages).
|
||||
// buildReconEngine constructs the recon Engine with all registered sources.
|
||||
// Phase 9 contributes ExampleSource; Phase 10 contributes ten code-hosting
|
||||
// sources via sources.RegisterAll. Credentials are read from environment
|
||||
// variables first, then from viper config keys under `recon.<source>.*`.
|
||||
// Sources whose credentials are missing are still registered but Enabled()
|
||||
// will report false so SweepAll skips them cleanly.
|
||||
func buildReconEngine() *recon.Engine {
|
||||
e := recon.NewEngine()
|
||||
e.Register(recon.ExampleSource{})
|
||||
|
||||
reg, err := providers.NewRegistry()
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "recon: failed to load providers: %v\n", err)
|
||||
return e
|
||||
}
|
||||
|
||||
cfg := sources.SourcesConfig{
|
||||
Registry: reg,
|
||||
Limiters: recon.NewLimiterRegistry(),
|
||||
GitHubToken: firstNonEmpty(os.Getenv("GITHUB_TOKEN"), viper.GetString("recon.github.token")),
|
||||
GitLabToken: firstNonEmpty(os.Getenv("GITLAB_TOKEN"), viper.GetString("recon.gitlab.token")),
|
||||
BitbucketToken: firstNonEmpty(os.Getenv("BITBUCKET_TOKEN"), viper.GetString("recon.bitbucket.token")),
|
||||
BitbucketWorkspace: firstNonEmpty(os.Getenv("BITBUCKET_WORKSPACE"), viper.GetString("recon.bitbucket.workspace")),
|
||||
CodebergToken: firstNonEmpty(os.Getenv("CODEBERG_TOKEN"), viper.GetString("recon.codeberg.token")),
|
||||
HuggingFaceToken: firstNonEmpty(os.Getenv("HUGGINGFACE_TOKEN"), viper.GetString("recon.huggingface.token")),
|
||||
KaggleUser: firstNonEmpty(os.Getenv("KAGGLE_USERNAME"), viper.GetString("recon.kaggle.username")),
|
||||
KaggleKey: firstNonEmpty(os.Getenv("KAGGLE_KEY"), viper.GetString("recon.kaggle.key")),
|
||||
}
|
||||
sources.RegisterAll(e, cfg)
|
||||
return e
|
||||
}
|
||||
|
||||
// firstNonEmpty picks the first of its two arguments that is not the empty
// string, preferring a. It implements the env-var → viper-config precedence
// chain for recon credential lookup.
func firstNonEmpty(a, b string) string {
	if a == "" {
		return b
	}
	return a
}
|
||||
|
||||
func init() {
|
||||
|
||||
174
pkg/recon/sources/bitbucket.go
Normal file
174
pkg/recon/sources/bitbucket.go
Normal file
@@ -0,0 +1,174 @@
|
||||
package sources
|
||||
|
||||
import (
	"context"
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
	"strings"
	"time"

	"golang.org/x/time/rate"

	"github.com/salvacybersec/keyhunter/pkg/providers"
	"github.com/salvacybersec/keyhunter/pkg/recon"
)
|
||||
|
||||
// BitbucketSource queries the Bitbucket Cloud 2.0 code search API for leaked
// provider keywords across a configured workspace (RECON-CODE-03).
//
// Docs: https://developer.atlassian.com/cloud/bitbucket/rest/api-group-search/
// Rate: 1000 req/hour → rate.Every(3.6s), burst 1.
// Scope: requires both a token (app password or OAuth) AND a workspace slug;
// absent either, the source disables itself cleanly (no error).
type BitbucketSource struct {
	Token     string                 // Bearer token (app password or OAuth); required for Enabled
	Workspace string                 // workspace slug interpolated into the search endpoint; required for Enabled
	BaseURL   string                 // API base; empty means https://api.bitbucket.org (tests point it at httptest)
	Registry  *providers.Registry    // provider keyword registry driving query generation
	Limiters  *recon.LimiterRegistry // per-source rate limiter registry; nil skips rate limiting

	client *Client // lazily initialized in Sweep when nil
}

// Compile-time interface assertion.
var _ recon.ReconSource = (*BitbucketSource)(nil)

// Name returns the stable source identifier used for limiter lookup and
// registration.
func (s *BitbucketSource) Name() string { return "bitbucket" }

// RateLimit reports the per-source token bucket rate (1000/hour, i.e. one
// request every 3.6 seconds).
func (s *BitbucketSource) RateLimit() rate.Limit {
	return rate.Every(3600 * time.Millisecond)
}

// Burst reports the token bucket burst capacity.
func (s *BitbucketSource) Burst() int { return 1 }

// RespectsRobots reports whether robots.txt applies (REST API → false).
func (s *BitbucketSource) RespectsRobots() bool { return false }

// Enabled reports whether the source should run. Requires both token and
// workspace to be non-empty; the recon.Config argument is not consulted.
func (s *BitbucketSource) Enabled(cfg recon.Config) bool {
	return s.Token != "" && s.Workspace != ""
}

// bitbucketSearchResponse mirrors the subset of the Bitbucket code search
// response shape this source consumes; any other response fields are
// silently ignored by the JSON decoder.
type bitbucketSearchResponse struct {
	Values []struct {
		ContentMatchCount int    `json:"content_match_count"` // number of keyword hits in the file
		PageURL           string `json:"page_url"`            // human-viewable URL, preferred Finding.Source
		File              struct {
			Path   string `json:"path"`
			Commit struct {
				Hash string `json:"hash"`
			} `json:"commit"`
		} `json:"file"` // fallback identifier pieces when page_url is absent
	} `json:"values"`
}
|
||||
|
||||
// Sweep iterates queries built from the provider registry, issues one search
|
||||
// request per query (rate-limited via Limiters), and emits one Finding per
|
||||
// `values` entry in the response.
|
||||
func (s *BitbucketSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
|
||||
if s.client == nil {
|
||||
s.client = NewClient()
|
||||
}
|
||||
base := s.BaseURL
|
||||
if base == "" {
|
||||
base = "https://api.bitbucket.org"
|
||||
}
|
||||
|
||||
queries := BuildQueries(s.Registry, "bitbucket")
|
||||
for _, q := range queries {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if s.Limiters != nil {
|
||||
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
endpoint := fmt.Sprintf("%s/2.0/workspaces/%s/search/code", base, url.PathEscape(s.Workspace))
|
||||
req, err := http.NewRequest(http.MethodGet, endpoint, nil)
|
||||
if err != nil {
|
||||
return fmt.Errorf("bitbucket: build request: %w", err)
|
||||
}
|
||||
vals := req.URL.Query()
|
||||
vals.Set("search_query", q)
|
||||
req.URL.RawQuery = vals.Encode()
|
||||
req.Header.Set("Authorization", "Bearer "+s.Token)
|
||||
req.Header.Set("Accept", "application/json")
|
||||
|
||||
resp, err := s.client.Do(ctx, req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("bitbucket: sweep: %w", err)
|
||||
}
|
||||
|
||||
var body bitbucketSearchResponse
|
||||
dec := json.NewDecoder(resp.Body)
|
||||
decodeErr := dec.Decode(&body)
|
||||
_ = resp.Body.Close()
|
||||
if decodeErr != nil {
|
||||
return fmt.Errorf("bitbucket: decode: %w", decodeErr)
|
||||
}
|
||||
|
||||
for _, v := range body.Values {
|
||||
src := v.PageURL
|
||||
if src == "" {
|
||||
src = fmt.Sprintf("bitbucket:%s/%s@%s", s.Workspace, v.File.Path, v.File.Commit.Hash)
|
||||
}
|
||||
f := recon.Finding{
|
||||
ProviderName: providerForQuery(s.Registry, q),
|
||||
Source: src,
|
||||
SourceType: "recon:bitbucket",
|
||||
DetectedAt: time.Now().UTC(),
|
||||
}
|
||||
select {
|
||||
case out <- f:
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// providerForQuery returns the provider name whose keyword appears in q, or
|
||||
// empty string if no match is found. Used to label Findings with their source
|
||||
// provider when the remote API doesn't echo the original keyword.
|
||||
func providerForQuery(reg *providers.Registry, q string) string {
|
||||
if reg == nil {
|
||||
return ""
|
||||
}
|
||||
for _, p := range reg.List() {
|
||||
for _, k := range p.Keywords {
|
||||
if k == "" {
|
||||
continue
|
||||
}
|
||||
if containsFold(q, k) {
|
||||
return p.Name
|
||||
}
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// containsFold reports whether needle occurs as a substring of haystack
// under case folding (a case-insensitive substring match). An empty needle
// never matches, preserving the original contract relied on by
// providerForQuery.
func containsFold(haystack, needle string) bool {
	if needle == "" {
		return false
	}
	// The previous implementation was a hand-rolled, case-SENSITIVE scan
	// (a reimplementation of strings.Contains) despite the "Fold" name.
	// Lowercasing both sides makes the behavior match the name; every
	// string the old version matched is still matched.
	return strings.Contains(strings.ToLower(haystack), strings.ToLower(needle))
}
|
||||
132
pkg/recon/sources/bitbucket_test.go
Normal file
132
pkg/recon/sources/bitbucket_test.go
Normal file
@@ -0,0 +1,132 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
func bitbucketTestRegistry() *providers.Registry {
|
||||
return providers.NewRegistryFromProviders([]providers.Provider{
|
||||
{Name: "openai", Keywords: []string{"sk-proj-"}},
|
||||
})
|
||||
}
|
||||
|
||||
func newBitbucketSource(baseURL, token, workspace string) *BitbucketSource {
|
||||
return &BitbucketSource{
|
||||
Token: token,
|
||||
Workspace: workspace,
|
||||
BaseURL: baseURL,
|
||||
Registry: bitbucketTestRegistry(),
|
||||
Limiters: recon.NewLimiterRegistry(),
|
||||
}
|
||||
}
|
||||
|
||||
func TestBitbucket_EnabledRequiresTokenAndWorkspace(t *testing.T) {
|
||||
cfg := recon.Config{}
|
||||
|
||||
if newBitbucketSource("", "", "").Enabled(cfg) {
|
||||
t.Fatal("expected disabled when token+workspace empty")
|
||||
}
|
||||
if newBitbucketSource("", "tok", "").Enabled(cfg) {
|
||||
t.Fatal("expected disabled when workspace empty")
|
||||
}
|
||||
if newBitbucketSource("", "", "ws").Enabled(cfg) {
|
||||
t.Fatal("expected disabled when token empty")
|
||||
}
|
||||
if !newBitbucketSource("", "tok", "ws").Enabled(cfg) {
|
||||
t.Fatal("expected enabled when both set")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBitbucket_SweepEmitsFindings(t *testing.T) {
|
||||
var gotAuth, gotPath string
|
||||
var gotQuery string
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
gotAuth = r.Header.Get("Authorization")
|
||||
gotPath = r.URL.Path
|
||||
gotQuery = r.URL.Query().Get("search_query")
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_, _ = w.Write([]byte(`{
|
||||
"values": [
|
||||
{
|
||||
"content_match_count": 2,
|
||||
"file": {"path": "secrets/.env", "commit": {"hash": "deadbeef"}},
|
||||
"page_url": "https://bitbucket.org/testws/repo/src/deadbeef/secrets/.env"
|
||||
}
|
||||
]
|
||||
}`))
|
||||
}))
|
||||
t.Cleanup(srv.Close)
|
||||
|
||||
src := newBitbucketSource(srv.URL, "tok", "testws")
|
||||
out := make(chan recon.Finding, 16)
|
||||
if err := src.Sweep(context.Background(), "", out); err != nil {
|
||||
t.Fatalf("Sweep: %v", err)
|
||||
}
|
||||
close(out)
|
||||
|
||||
if gotAuth != "Bearer tok" {
|
||||
t.Errorf("Authorization header = %q, want Bearer tok", gotAuth)
|
||||
}
|
||||
if gotPath != "/2.0/workspaces/testws/search/code" {
|
||||
t.Errorf("path = %q", gotPath)
|
||||
}
|
||||
if gotQuery == "" {
|
||||
t.Errorf("expected search_query param to be set")
|
||||
}
|
||||
|
||||
var findings []recon.Finding
|
||||
for f := range out {
|
||||
findings = append(findings, f)
|
||||
}
|
||||
if len(findings) == 0 {
|
||||
t.Fatal("expected at least 1 finding")
|
||||
}
|
||||
f := findings[0]
|
||||
if f.SourceType != "recon:bitbucket" {
|
||||
t.Errorf("SourceType = %q", f.SourceType)
|
||||
}
|
||||
if !strings.Contains(f.Source, "bitbucket.org/testws/repo") {
|
||||
t.Errorf("Source = %q", f.Source)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBitbucket_Unauthorized(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
http.Error(w, "nope", http.StatusUnauthorized)
|
||||
}))
|
||||
t.Cleanup(srv.Close)
|
||||
|
||||
src := newBitbucketSource(srv.URL, "tok", "testws")
|
||||
out := make(chan recon.Finding, 4)
|
||||
err := src.Sweep(context.Background(), "", out)
|
||||
if !errors.Is(err, ErrUnauthorized) {
|
||||
t.Fatalf("err = %v, want ErrUnauthorized", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBitbucket_ContextCancellation(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
time.Sleep(2 * time.Second)
|
||||
w.WriteHeader(200)
|
||||
_, _ = w.Write([]byte(`{"values":[]}`))
|
||||
}))
|
||||
t.Cleanup(srv.Close)
|
||||
|
||||
src := newBitbucketSource(srv.URL, "tok", "testws")
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
|
||||
defer cancel()
|
||||
out := make(chan recon.Finding, 1)
|
||||
err := src.Sweep(ctx, "", out)
|
||||
if err == nil {
|
||||
t.Fatal("expected error from cancelled context")
|
||||
}
|
||||
}
|
||||
167
pkg/recon/sources/codeberg.go
Normal file
167
pkg/recon/sources/codeberg.go
Normal file
@@ -0,0 +1,167 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"time"
|
||||
|
||||
"golang.org/x/time/rate"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
// DefaultCodebergBaseURL is the public Codeberg instance. Any Gitea-compatible
// server can be substituted by setting CodebergSource.BaseURL.
const DefaultCodebergBaseURL = "https://codeberg.org"

// CodebergSource implements recon.ReconSource against a Gitea-compatible REST
// API (Codeberg runs Gitea). Public repository metadata searches do not
// require authentication; when a Token is provided it is sent as
// "Authorization: token <t>" which raises Gitea's per-user rate limit from
// 60/hour to ~1000/hour.
//
// Sweep iterates every keyword from the provider registry, queries
// /api/v1/repos/search?q=<kw>&limit=50, and emits one recon.Finding per
// returned repository. The html_url is used as Source; the matching provider
// name is attached so downstream verification can target the correct API.
type CodebergSource struct {
	Token    string                 // optional Gitea token; only raises the rate limit, never required
	BaseURL  string                 // Gitea server base URL; empty means DefaultCodebergBaseURL
	Registry *providers.Registry    // provider keyword registry driving the sweep queries
	Limiters *recon.LimiterRegistry // per-source rate limiter registry; nil skips rate limiting

	client *Client // lazily initialized in Sweep when nil
}
|
||||
|
||||
// Compile-time interface assertion.
var _ recon.ReconSource = (*CodebergSource)(nil)

// Name returns the stable identifier used by the limiter registry and
// Finding.SourceType (as "recon:codeberg").
func (s *CodebergSource) Name() string { return "codeberg" }
|
||||
|
||||
// RateLimit returns rate.Every(60s) unauthenticated (60/hour) or
|
||||
// rate.Every(3.6s) authenticated (~1000/hour).
|
||||
func (s *CodebergSource) RateLimit() rate.Limit {
|
||||
if s.Token == "" {
|
||||
return rate.Every(60 * time.Second)
|
||||
}
|
||||
return rate.Every(3600 * time.Millisecond)
|
||||
}
|
||||
|
||||
// Burst returns 1 — Gitea's rate limits are per-hour smoothed, so a burst of
// one keeps us safely within headroom for both auth modes.
func (s *CodebergSource) Burst() int { return 1 }

// RespectsRobots is false — /api/v1/repos/search is a documented REST API,
// so robots.txt does not govern access.
func (s *CodebergSource) RespectsRobots() bool { return false }

// Enabled is always true because the /repos/search endpoint works anonymously.
// A token, when present, only raises the rate limit.
func (s *CodebergSource) Enabled(_ recon.Config) bool { return true }
|
||||
|
||||
// Sweep queries Gitea /api/v1/repos/search for every keyword in the provider
// registry, decodes the data array, and emits one Finding per result. The
// query parameter is ignored — Codeberg is swept by provider keyword, not by
// arbitrary Config.Query text, matching the sibling GitHub/GitLab sources.
//
// Returns the context error on cancellation, a wrapped error on request
// build/transport/decode failure, and nil otherwise (including when the
// registry yields no queries).
func (s *CodebergSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
	if err := ctx.Err(); err != nil {
		return err
	}

	base := s.BaseURL
	if base == "" {
		base = DefaultCodebergBaseURL
	}
	// Lazy client init so struct-literal construction (as tests do) works.
	if s.client == nil {
		s.client = NewClient()
	}

	// Build a keyword → providerName map once so emitted findings are
	// correctly attributed even though BuildQueries returns bare strings.
	// NOTE(review): this exact-match lookup assumes BuildQueries(reg,
	// "codeberg") returns keywords verbatim; if it ever decorates queries,
	// attribution silently degrades to "" — confirm against BuildQueries.
	keywordIndex := make(map[string]string)
	if s.Registry != nil {
		for _, p := range s.Registry.List() {
			for _, k := range p.Keywords {
				if k == "" {
					continue
				}
				// First provider to claim a keyword wins.
				if _, exists := keywordIndex[k]; !exists {
					keywordIndex[k] = p.Name
				}
			}
		}
	}

	queries := BuildQueries(s.Registry, s.Name())
	if len(queries) == 0 {
		return nil
	}

	for _, q := range queries {
		if err := ctx.Err(); err != nil {
			return err
		}
		// Per-source token bucket; the rate depends on auth mode (see
		// RateLimit).
		if s.Limiters != nil {
			if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
				return err
			}
		}

		endpoint := fmt.Sprintf("%s/api/v1/repos/search?q=%s&limit=50",
			base, url.QueryEscape(q))

		req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
		if err != nil {
			return fmt.Errorf("codeberg: build request: %w", err)
		}
		req.Header.Set("Accept", "application/json")
		if s.Token != "" {
			// Optional Gitea token auth — only raises the rate limit.
			req.Header.Set("Authorization", "token "+s.Token)
		}

		// NOTE(review): non-2xx handling is delegated to Client.Do —
		// presumably it surfaces status errors as non-nil err (sibling
		// source tests expect ErrUnauthorized from Do); verify against the
		// shared Client implementation.
		resp, err := s.client.Do(ctx, req)
		if err != nil {
			return fmt.Errorf("codeberg: search %q: %w", q, err)
		}

		var decoded struct {
			OK   bool `json:"ok"`
			Data []struct {
				FullName string `json:"full_name"`
				HTMLURL  string `json:"html_url"`
			} `json:"data"`
		}
		if err := json.NewDecoder(resp.Body).Decode(&decoded); err != nil {
			// Drain then close so the underlying connection can be reused.
			_, _ = io.Copy(io.Discard, resp.Body)
			_ = resp.Body.Close()
			return fmt.Errorf("codeberg: decode: %w", err)
		}
		_ = resp.Body.Close()

		provider := keywordIndex[q]
		for _, item := range decoded.Data {
			if item.HTMLURL == "" {
				// No stable URL to report — skip rather than emit a
				// Finding with a blank Source.
				continue
			}
			select {
			case out <- recon.Finding{
				ProviderName: provider,
				Source:       item.HTMLURL,
				SourceType:   "recon:codeberg",
				Confidence:   "low",
				DetectedAt:   time.Now().UTC(),
			}:
			case <-ctx.Done():
				return ctx.Err()
			}
		}
	}

	return nil
}
|
||||
182
pkg/recon/sources/codeberg_test.go
Normal file
182
pkg/recon/sources/codeberg_test.go
Normal file
@@ -0,0 +1,182 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"golang.org/x/time/rate"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
func newCodebergTestRegistry() *providers.Registry {
|
||||
return providers.NewRegistryFromProviders([]providers.Provider{
|
||||
{
|
||||
Name: "openai",
|
||||
DisplayName: "OpenAI",
|
||||
Tier: 1,
|
||||
Keywords: []string{"sk-proj-"},
|
||||
FormatVersion: 1,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
func TestCodebergSource_NameAndInterface(t *testing.T) {
|
||||
var _ recon.ReconSource = (*CodebergSource)(nil)
|
||||
|
||||
s := &CodebergSource{}
|
||||
if got := s.Name(); got != "codeberg" {
|
||||
t.Errorf("Name() = %q, want %q", got, "codeberg")
|
||||
}
|
||||
if s.RespectsRobots() {
|
||||
t.Errorf("RespectsRobots() = true, want false")
|
||||
}
|
||||
if !s.Enabled(recon.Config{}) {
|
||||
t.Errorf("Enabled() = false, want true (public API)")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCodebergSource_RateLimitUnauthenticated(t *testing.T) {
|
||||
s := &CodebergSource{}
|
||||
got := s.RateLimit()
|
||||
want := rate.Every(60 * time.Second)
|
||||
if got != want {
|
||||
t.Errorf("RateLimit() no token = %v, want %v", got, want)
|
||||
}
|
||||
if s.Burst() != 1 {
|
||||
t.Errorf("Burst() = %d, want 1", s.Burst())
|
||||
}
|
||||
}
|
||||
|
||||
func TestCodebergSource_RateLimitAuthenticated(t *testing.T) {
|
||||
s := &CodebergSource{Token: "abc123"}
|
||||
got := s.RateLimit()
|
||||
want := rate.Every(3600 * time.Millisecond)
|
||||
if got != want {
|
||||
t.Errorf("RateLimit() with token = %v, want %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCodebergSource_SweepEmitsFindings(t *testing.T) {
|
||||
var gotAuth string
|
||||
var gotPath string
|
||||
var gotQuery string
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
gotAuth = r.Header.Get("Authorization")
|
||||
gotPath = r.URL.Path
|
||||
gotQuery = r.URL.Query().Get("q")
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"ok": true,
|
||||
"data": []map[string]any{
|
||||
{
|
||||
"full_name": "alice/leaked-keys",
|
||||
"html_url": "https://codeberg.org/alice/leaked-keys",
|
||||
},
|
||||
},
|
||||
})
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
s := &CodebergSource{
|
||||
BaseURL: srv.URL,
|
||||
Registry: newCodebergTestRegistry(),
|
||||
Limiters: recon.NewLimiterRegistry(),
|
||||
}
|
||||
|
||||
out := make(chan recon.Finding, 8)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
if err := s.Sweep(ctx, "", out); err != nil {
|
||||
t.Fatalf("Sweep: %v", err)
|
||||
}
|
||||
close(out)
|
||||
|
||||
var findings []recon.Finding
|
||||
for f := range out {
|
||||
findings = append(findings, f)
|
||||
}
|
||||
if len(findings) == 0 {
|
||||
t.Fatalf("expected at least one finding")
|
||||
}
|
||||
|
||||
f := findings[0]
|
||||
if f.Source != "https://codeberg.org/alice/leaked-keys" {
|
||||
t.Errorf("Source = %q, want codeberg html_url", f.Source)
|
||||
}
|
||||
if f.SourceType != "recon:codeberg" {
|
||||
t.Errorf("SourceType = %q, want recon:codeberg", f.SourceType)
|
||||
}
|
||||
if f.ProviderName != "openai" {
|
||||
t.Errorf("ProviderName = %q, want openai", f.ProviderName)
|
||||
}
|
||||
|
||||
if gotPath != "/api/v1/repos/search" {
|
||||
t.Errorf("path = %q, want /api/v1/repos/search", gotPath)
|
||||
}
|
||||
if gotQuery == "" {
|
||||
t.Errorf("query param empty")
|
||||
}
|
||||
if gotAuth != "" {
|
||||
t.Errorf("Authorization header should be absent without token, got %q", gotAuth)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCodebergSource_SweepWithTokenSetsAuthHeader(t *testing.T) {
|
||||
var gotAuth string
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
gotAuth = r.Header.Get("Authorization")
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_, _ = w.Write([]byte(`{"ok":true,"data":[]}`))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
s := &CodebergSource{
|
||||
Token: "s3cret",
|
||||
BaseURL: srv.URL,
|
||||
Registry: newCodebergTestRegistry(),
|
||||
Limiters: recon.NewLimiterRegistry(),
|
||||
}
|
||||
out := make(chan recon.Finding, 1)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
if err := s.Sweep(ctx, "", out); err != nil {
|
||||
t.Fatalf("Sweep: %v", err)
|
||||
}
|
||||
|
||||
if !strings.HasPrefix(gotAuth, "token ") || !strings.Contains(gotAuth, "s3cret") {
|
||||
t.Errorf("Authorization header = %q, want \"token s3cret\"", gotAuth)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCodebergSource_SweepContextCancellation(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
select {
|
||||
case <-r.Context().Done():
|
||||
case <-time.After(3 * time.Second):
|
||||
}
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
s := &CodebergSource{
|
||||
BaseURL: srv.URL,
|
||||
Registry: newCodebergTestRegistry(),
|
||||
Limiters: recon.NewLimiterRegistry(),
|
||||
}
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel()
|
||||
|
||||
out := make(chan recon.Finding, 1)
|
||||
err := s.Sweep(ctx, "", out)
|
||||
if err == nil {
|
||||
t.Fatalf("expected error on cancelled context")
|
||||
}
|
||||
}
|
||||
184
pkg/recon/sources/gist.go
Normal file
184
pkg/recon/sources/gist.go
Normal file
@@ -0,0 +1,184 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"golang.org/x/time/rate"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
// GistSource scans recent public GitHub Gists for provider keyword leaks
// (RECON-CODE-04).
//
// GitHub does not expose a dedicated /search/gists endpoint, so this source
// enumerates /gists/public (most-recent page) and fetches each file's raw URL
// to scan its content against the provider keyword set. Keep Phase 10 minimal:
// only the first page is walked; broader sweeps are a future optimization.
//
// Auth: GitHub token via Bearer header. Rate: 30 req/min (shared with GitHub
// search limits) → rate.Every(2s), burst 1.
type GistSource struct {
	Token    string                 // GitHub token; required — see Enabled
	BaseURL  string                 // API root override for tests; empty means https://api.github.com
	Registry *providers.Registry    // keyword source for content scanning
	Limiters *recon.LimiterRegistry // shared per-source rate-limiter registry

	client *Client // retry-aware HTTP wrapper; lazily created in Sweep when nil
}

// Compile-time check that GistSource satisfies recon.ReconSource.
var _ recon.ReconSource = (*GistSource)(nil)

// Name returns the stable source identifier.
func (s *GistSource) Name() string { return "gist" }

// RateLimit reports the per-source token bucket rate (30 req/min).
func (s *GistSource) RateLimit() rate.Limit { return rate.Every(2 * time.Second) }

// Burst reports the token bucket burst capacity.
func (s *GistSource) Burst() int { return 1 }

// RespectsRobots reports whether robots.txt applies (REST API → false).
func (s *GistSource) RespectsRobots() bool { return false }

// Enabled reports whether the source runs. Requires a GitHub token.
func (s *GistSource) Enabled(_ recon.Config) bool { return s.Token != "" }

// gistListEntry is the subset of a /gists/public list entry we consume: the
// gist's html_url plus each file's name and raw download URL.
type gistListEntry struct {
	HTMLURL string `json:"html_url"`
	Files   map[string]struct {
		Filename string `json:"filename"`
		RawURL   string `json:"raw_url"`
	} `json:"files"`
}
|
||||
|
||||
// Sweep fetches /gists/public, scans each file's raw content against the
|
||||
// keyword set from the registry, and emits one Finding per gist that matches
|
||||
// any keyword (not one per file — gists often split a single leak across
|
||||
// helper files).
|
||||
func (s *GistSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
|
||||
if s.client == nil {
|
||||
s.client = NewClient()
|
||||
}
|
||||
base := s.BaseURL
|
||||
if base == "" {
|
||||
base = "https://api.github.com"
|
||||
}
|
||||
|
||||
keywords := s.keywordSet()
|
||||
if len(keywords) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
if s.Limiters != nil {
|
||||
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
listReq, err := http.NewRequest(http.MethodGet, base+"/gists/public?per_page=100", nil)
|
||||
if err != nil {
|
||||
return fmt.Errorf("gist: build list request: %w", err)
|
||||
}
|
||||
listReq.Header.Set("Authorization", "Bearer "+s.Token)
|
||||
listReq.Header.Set("Accept", "application/vnd.github+json")
|
||||
|
||||
listResp, err := s.client.Do(ctx, listReq)
|
||||
if err != nil {
|
||||
return fmt.Errorf("gist: list: %w", err)
|
||||
}
|
||||
var gists []gistListEntry
|
||||
dec := json.NewDecoder(listResp.Body)
|
||||
decodeErr := dec.Decode(&gists)
|
||||
_ = listResp.Body.Close()
|
||||
if decodeErr != nil {
|
||||
return fmt.Errorf("gist: decode list: %w", decodeErr)
|
||||
}
|
||||
|
||||
for _, g := range gists {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
matched := false
|
||||
var matchedProvider string
|
||||
|
||||
fileLoop:
|
||||
for _, f := range g.Files {
|
||||
if f.RawURL == "" {
|
||||
continue
|
||||
}
|
||||
if s.Limiters != nil {
|
||||
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
rawReq, err := http.NewRequest(http.MethodGet, f.RawURL, nil)
|
||||
if err != nil {
|
||||
return fmt.Errorf("gist: build raw request: %w", err)
|
||||
}
|
||||
rawReq.Header.Set("Authorization", "Bearer "+s.Token)
|
||||
rawResp, err := s.client.Do(ctx, rawReq)
|
||||
if err != nil {
|
||||
return fmt.Errorf("gist: fetch raw: %w", err)
|
||||
}
|
||||
// Cap read to 256KB to avoid pathological gists.
|
||||
body, readErr := io.ReadAll(io.LimitReader(rawResp.Body, 256*1024))
|
||||
_ = rawResp.Body.Close()
|
||||
if readErr != nil {
|
||||
return fmt.Errorf("gist: read raw: %w", readErr)
|
||||
}
|
||||
|
||||
content := string(body)
|
||||
for kw, provName := range keywords {
|
||||
if strings.Contains(content, kw) {
|
||||
matched = true
|
||||
matchedProvider = provName
|
||||
break fileLoop
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if matched {
|
||||
select {
|
||||
case out <- recon.Finding{
|
||||
ProviderName: matchedProvider,
|
||||
Source: g.HTMLURL,
|
||||
SourceType: "recon:gist",
|
||||
DetectedAt: time.Now().UTC(),
|
||||
}:
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// keywordSet flattens the registry into a keyword→providerName map for
|
||||
// content scanning. Empty keywords are skipped.
|
||||
func (s *GistSource) keywordSet() map[string]string {
|
||||
out := make(map[string]string)
|
||||
if s.Registry == nil {
|
||||
return out
|
||||
}
|
||||
for _, p := range s.Registry.List() {
|
||||
for _, k := range p.Keywords {
|
||||
if k == "" {
|
||||
continue
|
||||
}
|
||||
if _, ok := out[k]; !ok {
|
||||
out[k] = p.Name
|
||||
}
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
166
pkg/recon/sources/gist_test.go
Normal file
166
pkg/recon/sources/gist_test.go
Normal file
@@ -0,0 +1,166 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
// gistTestRegistry returns a single-provider registry whose one keyword
// ("sk-proj-") drives the content scan in these tests.
func gistTestRegistry() *providers.Registry {
	return providers.NewRegistryFromProviders([]providers.Provider{
		{Name: "openai", Keywords: []string{"sk-proj-"}},
	})
}

// newGistSource builds a GistSource wired to the given test server and token.
func newGistSource(baseURL, token string) *GistSource {
	return &GistSource{
		Token:    token,
		BaseURL:  baseURL,
		Registry: gistTestRegistry(),
		Limiters: recon.NewLimiterRegistry(),
	}
}

// TestGist_EnabledRequiresToken verifies the source only runs with a token.
func TestGist_EnabledRequiresToken(t *testing.T) {
	cfg := recon.Config{}
	if newGistSource("", "").Enabled(cfg) {
		t.Fatal("expected disabled when token empty")
	}
	if !newGistSource("", "tok").Enabled(cfg) {
		t.Fatal("expected enabled when token set")
	}
}
|
||||
|
||||
func TestGist_SweepEmitsFindingsOnKeywordMatch(t *testing.T) {
|
||||
var gotAuth, gotListPath string
|
||||
|
||||
mux := http.NewServeMux()
|
||||
var srv *httptest.Server
|
||||
|
||||
mux.HandleFunc("/gists/public", func(w http.ResponseWriter, r *http.Request) {
|
||||
gotAuth = r.Header.Get("Authorization")
|
||||
gotListPath = r.URL.Path
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
body := fmt.Sprintf(`[
|
||||
{
|
||||
"html_url": "https://gist.github.com/alice/aaa",
|
||||
"files": {
|
||||
"leak.env": {"filename": "leak.env", "raw_url": "%s/raw/aaa"}
|
||||
}
|
||||
},
|
||||
{
|
||||
"html_url": "https://gist.github.com/bob/bbb",
|
||||
"files": {
|
||||
"notes.md": {"filename": "notes.md", "raw_url": "%s/raw/bbb"}
|
||||
}
|
||||
}
|
||||
]`, srv.URL, srv.URL)
|
||||
_, _ = w.Write([]byte(body))
|
||||
})
|
||||
mux.HandleFunc("/raw/aaa", func(w http.ResponseWriter, r *http.Request) {
|
||||
_, _ = w.Write([]byte("OPENAI_API_KEY=sk-proj-1234567890abcdefghijk"))
|
||||
})
|
||||
mux.HandleFunc("/raw/bbb", func(w http.ResponseWriter, r *http.Request) {
|
||||
_, _ = w.Write([]byte("just some unrelated notes here"))
|
||||
})
|
||||
|
||||
srv = httptest.NewServer(mux)
|
||||
t.Cleanup(srv.Close)
|
||||
|
||||
src := newGistSource(srv.URL, "tok")
|
||||
out := make(chan recon.Finding, 8)
|
||||
if err := src.Sweep(context.Background(), "", out); err != nil {
|
||||
t.Fatalf("Sweep: %v", err)
|
||||
}
|
||||
close(out)
|
||||
|
||||
if gotAuth != "Bearer tok" {
|
||||
t.Errorf("Authorization = %q", gotAuth)
|
||||
}
|
||||
if gotListPath != "/gists/public" {
|
||||
t.Errorf("list path = %q", gotListPath)
|
||||
}
|
||||
|
||||
var findings []recon.Finding
|
||||
for f := range out {
|
||||
findings = append(findings, f)
|
||||
}
|
||||
if len(findings) != 1 {
|
||||
t.Fatalf("findings count = %d, want 1 (only aaa matches sk-proj-)", len(findings))
|
||||
}
|
||||
f := findings[0]
|
||||
if !strings.Contains(f.Source, "alice/aaa") {
|
||||
t.Errorf("Source = %q, want gist alice/aaa", f.Source)
|
||||
}
|
||||
if f.SourceType != "recon:gist" {
|
||||
t.Errorf("SourceType = %q", f.SourceType)
|
||||
}
|
||||
if f.ProviderName != "openai" {
|
||||
t.Errorf("ProviderName = %q, want openai", f.ProviderName)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGist_NoMatch_NoFinding(t *testing.T) {
|
||||
var srv *httptest.Server
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("/gists/public", func(w http.ResponseWriter, r *http.Request) {
|
||||
body := fmt.Sprintf(`[{"html_url":"https://gist.github.com/x/y","files":{"a.txt":{"filename":"a.txt","raw_url":"%s/raw/x"}}}]`, srv.URL)
|
||||
_, _ = w.Write([]byte(body))
|
||||
})
|
||||
mux.HandleFunc("/raw/x", func(w http.ResponseWriter, r *http.Request) {
|
||||
_, _ = w.Write([]byte("nothing interesting"))
|
||||
})
|
||||
srv = httptest.NewServer(mux)
|
||||
t.Cleanup(srv.Close)
|
||||
|
||||
src := newGistSource(srv.URL, "tok")
|
||||
out := make(chan recon.Finding, 4)
|
||||
if err := src.Sweep(context.Background(), "", out); err != nil {
|
||||
t.Fatalf("Sweep: %v", err)
|
||||
}
|
||||
close(out)
|
||||
n := 0
|
||||
for range out {
|
||||
n++
|
||||
}
|
||||
if n != 0 {
|
||||
t.Fatalf("findings = %d, want 0", n)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGist_Unauthorized(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
http.Error(w, "bad", http.StatusUnauthorized)
|
||||
}))
|
||||
t.Cleanup(srv.Close)
|
||||
src := newGistSource(srv.URL, "tok")
|
||||
out := make(chan recon.Finding, 1)
|
||||
err := src.Sweep(context.Background(), "", out)
|
||||
if !errors.Is(err, ErrUnauthorized) {
|
||||
t.Fatalf("err = %v, want ErrUnauthorized", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGist_ContextCancellation(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
time.Sleep(2 * time.Second)
|
||||
_, _ = w.Write([]byte(`[]`))
|
||||
}))
|
||||
t.Cleanup(srv.Close)
|
||||
src := newGistSource(srv.URL, "tok")
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
|
||||
defer cancel()
|
||||
out := make(chan recon.Finding, 1)
|
||||
err := src.Sweep(ctx, "", out)
|
||||
if err == nil {
|
||||
t.Fatal("expected error on cancelled ctx")
|
||||
}
|
||||
}
|
||||
175
pkg/recon/sources/gitlab.go
Normal file
175
pkg/recon/sources/gitlab.go
Normal file
@@ -0,0 +1,175 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"golang.org/x/time/rate"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
// GitLabSource implements recon.ReconSource against GitLab's Search API
// (/api/v4/search?scope=blobs). It honors PRIVATE-TOKEN header auth and the
// published 2000 req/min rate limit. Sweep iterates BuildQueries(reg, "gitlab")
// — one request per keyword-derived query — and emits one Finding per returned
// blob, with Source pointing at the blob's project/ref/path URL.
//
// RECON-CODE-02.
type GitLabSource struct {
	// Token is the GitLab personal access token. When empty (or whitespace
	// only) the source is disabled: Enabled returns false, Sweep is a no-op.
	Token string
	// BaseURL is the GitLab instance root. Defaults to https://gitlab.com.
	BaseURL string
	// Registry drives query generation via BuildQueries and provider name
	// mapping for emitted findings.
	Registry *providers.Registry
	// Limiters is the shared per-source rate-limiter registry.
	Limiters *recon.LimiterRegistry

	// client is the retry-aware HTTP wrapper. A nil client is replaced with
	// NewClient() lazily inside Sweep so zero-value construction works.
	client *Client
}

// Compile-time interface assertion.
var _ recon.ReconSource = (*GitLabSource)(nil)

// Name returns the stable source identifier.
func (s *GitLabSource) Name() string { return "gitlab" }

// RateLimit returns ~2000 req/min (one token every 30ms).
func (s *GitLabSource) RateLimit() rate.Limit { return rate.Every(30 * time.Millisecond) }

// Burst allows short bursts of 5 requests.
func (s *GitLabSource) Burst() int { return 5 }

// RespectsRobots returns false: this source uses an authenticated REST API,
// not HTML scraping.
func (s *GitLabSource) RespectsRobots() bool { return false }

// Enabled reports whether a non-blank token is configured.
func (s *GitLabSource) Enabled(_ recon.Config) bool { return strings.TrimSpace(s.Token) != "" }

// glBlob is the subset of the GitLab Search API blob response we consume.
type glBlob struct {
	Basename  string `json:"basename"`
	Data      string `json:"data"`
	Path      string `json:"path"`
	ProjectID int    `json:"project_id"`
	Ref       string `json:"ref"`
	Startline int    `json:"startline"`
}
|
||||
|
||||
// Sweep runs the GitLab blob search for every keyword-derived query and emits
|
||||
// one Finding per blob. It returns nil when the source is disabled (empty
|
||||
// token) so callers can safely skip without special-casing.
|
||||
func (s *GitLabSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
|
||||
if !s.Enabled(recon.Config{}) {
|
||||
return nil
|
||||
}
|
||||
if s.client == nil {
|
||||
s.client = NewClient()
|
||||
}
|
||||
base := strings.TrimRight(s.BaseURL, "/")
|
||||
if base == "" {
|
||||
base = "https://gitlab.com"
|
||||
}
|
||||
limiters := s.Limiters
|
||||
if limiters == nil {
|
||||
limiters = recon.NewLimiterRegistry()
|
||||
}
|
||||
|
||||
queries := BuildQueries(s.Registry, "gitlab")
|
||||
if len(queries) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
kwIndex := gitlabKeywordIndex(s.Registry)
|
||||
|
||||
for _, q := range queries {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
endpoint := fmt.Sprintf("%s/api/v4/search?scope=blobs&search=%s&per_page=20",
|
||||
base, url.QueryEscape(q))
|
||||
req, err := http.NewRequest(http.MethodGet, endpoint, nil)
|
||||
if err != nil {
|
||||
return fmt.Errorf("gitlab: build request: %w", err)
|
||||
}
|
||||
req.Header.Set("PRIVATE-TOKEN", s.Token)
|
||||
req.Header.Set("Accept", "application/json")
|
||||
|
||||
resp, err := s.client.Do(ctx, req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("gitlab: %w", err)
|
||||
}
|
||||
|
||||
var blobs []glBlob
|
||||
decErr := json.NewDecoder(resp.Body).Decode(&blobs)
|
||||
_ = resp.Body.Close()
|
||||
if decErr != nil {
|
||||
return fmt.Errorf("gitlab: decode: %w", decErr)
|
||||
}
|
||||
|
||||
// For "gitlab", BuildQueries emits bare keywords, so a direct map
|
||||
// lookup recovers the provider name for each query.
|
||||
provName := kwIndex[q]
|
||||
if provName == "" {
|
||||
provName = "unknown"
|
||||
}
|
||||
|
||||
for _, b := range blobs {
|
||||
sourceURL := fmt.Sprintf("%s/projects/%d/-/blob/%s/%s",
|
||||
base, b.ProjectID, b.Ref, b.Path)
|
||||
finding := recon.Finding{
|
||||
ProviderName: provName,
|
||||
Confidence: "low",
|
||||
Source: sourceURL,
|
||||
SourceType: "recon:gitlab",
|
||||
LineNumber: b.Startline,
|
||||
DetectedAt: time.Now().UTC(),
|
||||
}
|
||||
select {
|
||||
case out <- finding:
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// gitlabKeywordIndex maps each provider keyword back to its provider name for
|
||||
// Finding.ProviderName population. A name prefixed with "gitlab" avoids
|
||||
// colliding with the shared keywordIndex helper introduced by peer sources
|
||||
// (github.go) in the same package.
|
||||
func gitlabKeywordIndex(reg *providers.Registry) map[string]string {
|
||||
idx := make(map[string]string)
|
||||
if reg == nil {
|
||||
return idx
|
||||
}
|
||||
for _, p := range reg.List() {
|
||||
for _, kw := range p.Keywords {
|
||||
if kw == "" {
|
||||
continue
|
||||
}
|
||||
if _, exists := idx[kw]; !exists {
|
||||
idx[kw] = p.Name
|
||||
}
|
||||
}
|
||||
}
|
||||
return idx
|
||||
}
|
||||
229
pkg/recon/sources/gitlab_test.go
Normal file
229
pkg/recon/sources/gitlab_test.go
Normal file
@@ -0,0 +1,229 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
// gitlabTestRegistry returns a synthetic registry with two providers whose
// keywords drive the query loop. Keywords are chosen so BuildQueries output is
// deterministic and map lookups are unambiguous.
func gitlabTestRegistry() *providers.Registry {
	return providers.NewRegistryFromProviders([]providers.Provider{
		{
			Name:     "openai",
			Keywords: []string{"sk-test"},
			Patterns: []providers.Pattern{{Regex: "sk-test[A-Za-z0-9]+", Confidence: "high"}},
		},
		{
			Name:     "demo",
			Keywords: []string{"ghkey"},
			Patterns: []providers.Pattern{{Regex: "ghkey[A-Za-z0-9]+", Confidence: "low"}},
		},
	})
}

// gitlabBlobFixture mirrors the wire shape of a GitLab search blob so tests
// can JSON-encode deterministic responses.
type gitlabBlobFixture struct {
	Basename  string `json:"basename"`
	Data      string `json:"data"`
	Path      string `json:"path"`
	ProjectID int    `json:"project_id"`
	Ref       string `json:"ref"`
	Startline int    `json:"startline"`
}

// TestGitLabSource_EnabledFalseWhenTokenEmpty covers the Enabled gate plus the
// cheap metadata accessors (Name, RespectsRobots).
func TestGitLabSource_EnabledFalseWhenTokenEmpty(t *testing.T) {
	s := &GitLabSource{Token: "", Registry: gitlabTestRegistry(), Limiters: recon.NewLimiterRegistry()}
	if s.Enabled(recon.Config{}) {
		t.Fatalf("expected Enabled=false when token empty")
	}
	s2 := &GitLabSource{Token: "glpat-xxx", Registry: gitlabTestRegistry(), Limiters: recon.NewLimiterRegistry()}
	if !s2.Enabled(recon.Config{}) {
		t.Fatalf("expected Enabled=true when token set")
	}
	if s.Name() != "gitlab" {
		t.Fatalf("expected Name=gitlab, got %q", s.Name())
	}
	if s.RespectsRobots() {
		t.Fatalf("expected RespectsRobots=false for REST API source")
	}
}
|
||||
|
||||
func TestGitLabSource_EmptyToken_NoCallsNoError(t *testing.T) {
|
||||
var calls int32
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
atomic.AddInt32(&calls, 1)
|
||||
w.WriteHeader(200)
|
||||
_, _ = w.Write([]byte("[]"))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
s := &GitLabSource{
|
||||
Token: "",
|
||||
BaseURL: srv.URL,
|
||||
Registry: gitlabTestRegistry(),
|
||||
Limiters: recon.NewLimiterRegistry(),
|
||||
}
|
||||
out := make(chan recon.Finding, 4)
|
||||
if err := s.Sweep(context.Background(), "", out); err != nil {
|
||||
t.Fatalf("expected nil err on empty token, got %v", err)
|
||||
}
|
||||
close(out)
|
||||
if atomic.LoadInt32(&calls) != 0 {
|
||||
t.Fatalf("expected zero HTTP calls, got %d", calls)
|
||||
}
|
||||
if len(out) != 0 {
|
||||
t.Fatalf("expected zero findings, got %d", len(out))
|
||||
}
|
||||
}
|
||||
|
||||
func TestGitLabSource_Sweep_EmitsFindings(t *testing.T) {
|
||||
var gotToken string
|
||||
var gotScopes []string
|
||||
var gotSearches []string
|
||||
|
||||
blobs := []gitlabBlobFixture{
|
||||
{Basename: "config.env", Data: "API_KEY=sk-testABCDEF", Path: "app/config.env", ProjectID: 42, Ref: "main", Startline: 3},
|
||||
{Basename: "README.md", Data: "use ghkeyXYZ", Path: "docs/README.md", ProjectID: 99, Ref: "master", Startline: 10},
|
||||
}
|
||||
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path != "/api/v4/search" {
|
||||
http.Error(w, "not found", 404)
|
||||
return
|
||||
}
|
||||
gotToken = r.Header.Get("PRIVATE-TOKEN")
|
||||
gotScopes = append(gotScopes, r.URL.Query().Get("scope"))
|
||||
gotSearches = append(gotSearches, r.URL.Query().Get("search"))
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_ = json.NewEncoder(w).Encode(blobs)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
s := &GitLabSource{
|
||||
Token: "glpat-secret",
|
||||
BaseURL: srv.URL,
|
||||
Registry: gitlabTestRegistry(),
|
||||
Limiters: recon.NewLimiterRegistry(),
|
||||
}
|
||||
|
||||
out := make(chan recon.Finding, 32)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
if err := s.Sweep(ctx, "", out); err != nil {
|
||||
t.Fatalf("sweep err: %v", err)
|
||||
}
|
||||
close(out)
|
||||
|
||||
if gotToken != "glpat-secret" {
|
||||
t.Fatalf("expected PRIVATE-TOKEN header, got %q", gotToken)
|
||||
}
|
||||
for _, sc := range gotScopes {
|
||||
if sc != "blobs" {
|
||||
t.Fatalf("expected scope=blobs, got %q", sc)
|
||||
}
|
||||
}
|
||||
// Two providers → two queries → two requests → 4 findings (2 blobs each).
|
||||
if len(gotSearches) != 2 {
|
||||
t.Fatalf("expected 2 search calls, got %d: %v", len(gotSearches), gotSearches)
|
||||
}
|
||||
|
||||
findings := gitlabDrain(out)
|
||||
if len(findings) != 4 {
|
||||
t.Fatalf("expected 4 findings (2 blobs × 2 queries), got %d", len(findings))
|
||||
}
|
||||
|
||||
var sawP42, sawP99 bool
|
||||
for _, f := range findings {
|
||||
if f.SourceType != "recon:gitlab" {
|
||||
t.Errorf("bad SourceType: %q", f.SourceType)
|
||||
}
|
||||
if f.Confidence != "low" {
|
||||
t.Errorf("bad confidence: %q", f.Confidence)
|
||||
}
|
||||
if strings.Contains(f.Source, "/projects/42/-/blob/main/app/config.env") {
|
||||
sawP42 = true
|
||||
}
|
||||
if strings.Contains(f.Source, "/projects/99/-/blob/master/docs/README.md") {
|
||||
sawP99 = true
|
||||
}
|
||||
}
|
||||
if !sawP42 || !sawP99 {
|
||||
t.Fatalf("expected both project URLs in Source fields: p42=%v p99=%v", sawP42, sawP99)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGitLabSource_Unauthorized(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(401)
|
||||
_, _ = w.Write([]byte(`{"message":"401 Unauthorized"}`))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
s := &GitLabSource{
|
||||
Token: "bad",
|
||||
BaseURL: srv.URL,
|
||||
Registry: gitlabTestRegistry(),
|
||||
Limiters: recon.NewLimiterRegistry(),
|
||||
}
|
||||
out := make(chan recon.Finding, 4)
|
||||
err := s.Sweep(context.Background(), "", out)
|
||||
close(out)
|
||||
if err == nil {
|
||||
t.Fatalf("expected error, got nil")
|
||||
}
|
||||
if !errors.Is(err, ErrUnauthorized) {
|
||||
t.Fatalf("expected ErrUnauthorized, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGitLabSource_CtxCancellation(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
select {
|
||||
case <-r.Context().Done():
|
||||
return
|
||||
case <-time.After(2 * time.Second):
|
||||
w.WriteHeader(200)
|
||||
_, _ = w.Write([]byte("[]"))
|
||||
}
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
s := &GitLabSource{
|
||||
Token: "glpat-x",
|
||||
BaseURL: srv.URL,
|
||||
Registry: gitlabTestRegistry(),
|
||||
Limiters: recon.NewLimiterRegistry(),
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
|
||||
defer cancel()
|
||||
out := make(chan recon.Finding, 4)
|
||||
err := s.Sweep(ctx, "", out)
|
||||
close(out)
|
||||
if err == nil {
|
||||
t.Fatalf("expected ctx error, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
// TestGitLabSource_InterfaceAssertion pins the ReconSource contract at
// compile time; the test body is intentionally empty.
func TestGitLabSource_InterfaceAssertion(t *testing.T) {
	var _ recon.ReconSource = (*GitLabSource)(nil)
}

// gitlabDrain collects every Finding from an already-closed channel.
func gitlabDrain(ch <-chan recon.Finding) []recon.Finding {
	var out []recon.Finding
	for f := range ch {
		out = append(out, f)
	}
	return out
}
|
||||
181
pkg/recon/sources/huggingface.go
Normal file
181
pkg/recon/sources/huggingface.go
Normal file
@@ -0,0 +1,181 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"time"
|
||||
|
||||
"golang.org/x/time/rate"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
// defaultHuggingFaceBaseURL is the public HF Hub API root.
const defaultHuggingFaceBaseURL = "https://huggingface.co"

// HuggingFaceConfig configures a HuggingFaceSource.
type HuggingFaceConfig struct {
	// Token is the Hugging Face access token. Optional — anonymous requests
	// are accepted but rate-limited more aggressively.
	Token string
	// BaseURL overrides the API root for tests. Defaults to
	// https://huggingface.co when empty.
	BaseURL string
	// Registry drives keyword generation via BuildQueries.
	Registry *providers.Registry
	// Limiters is the shared per-source limiter registry.
	Limiters *recon.LimiterRegistry
}

// HuggingFaceSource implements recon.ReconSource against the Hugging Face Hub
// API, sweeping both Spaces and model repositories for provider keywords.
//
// RECON-CODE-08: token optional; when empty the source still runs but applies
// a slower RateLimit to stay within anonymous quotas.
type HuggingFaceSource struct {
	Token    string                 // optional access token; see RateLimit
	BaseURL  string                 // API root; empty means defaultHuggingFaceBaseURL
	Registry *providers.Registry    // keyword source for BuildQueries
	Limiters *recon.LimiterRegistry // shared per-source limiter registry

	client *Client // retry-aware HTTP wrapper; lazily created in Sweep if nil
}

// NewHuggingFaceSource constructs a HuggingFaceSource with sensible defaults:
// the public API root when cfg.BaseURL is empty and a fresh HTTP client.
func NewHuggingFaceSource(cfg HuggingFaceConfig) *HuggingFaceSource {
	base := cfg.BaseURL
	if base == "" {
		base = defaultHuggingFaceBaseURL
	}
	return &HuggingFaceSource{
		Token:    cfg.Token,
		BaseURL:  base,
		Registry: cfg.Registry,
		Limiters: cfg.Limiters,
		client:   NewClient(),
	}
}
|
||||
|
||||
// Name returns the stable source identifier.
func (s *HuggingFaceSource) Name() string { return "huggingface" }

// RateLimit returns the per-source token bucket rate. Authenticated requests
// get ~1000/hour (one every 3.6s); unauthenticated requests are throttled to
// one every 10 seconds to stay conservative against the public quota.
func (s *HuggingFaceSource) RateLimit() rate.Limit {
	if s.Token != "" {
		return rate.Every(3600 * time.Millisecond)
	}
	return rate.Every(10 * time.Second)
}

// Burst returns the limiter burst capacity.
func (s *HuggingFaceSource) Burst() int { return 1 }

// RespectsRobots reports whether this source should honor robots.txt.
// The Hub API is a JSON endpoint, so robots.txt does not apply.
func (s *HuggingFaceSource) RespectsRobots() bool { return false }

// Enabled reports whether this source should run. HuggingFace runs even
// without a token — anonymous requests are permitted at a lower rate limit.
func (s *HuggingFaceSource) Enabled(_ recon.Config) bool { return true }

// hfItem is the minimal shape returned by /api/spaces and /api/models list
// endpoints. Both expose an `id` of the form "owner/name".
type hfItem struct {
	ID string `json:"id"`
}
|
||||
|
||||
// Sweep iterates provider keywords and queries both the Spaces and Models
|
||||
// search endpoints, emitting one Finding per result.
|
||||
func (s *HuggingFaceSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
|
||||
if s.client == nil {
|
||||
s.client = NewClient()
|
||||
}
|
||||
base := s.BaseURL
|
||||
if base == "" {
|
||||
base = defaultHuggingFaceBaseURL
|
||||
}
|
||||
|
||||
queries := BuildQueries(s.Registry, s.Name())
|
||||
if len(queries) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
endpoints := []struct {
|
||||
path string
|
||||
urlPrefix string // prefix applied to item.ID to form Finding.Source
|
||||
}{
|
||||
{"/api/spaces", "https://huggingface.co/spaces/"},
|
||||
{"/api/models", "https://huggingface.co/"},
|
||||
}
|
||||
|
||||
for _, q := range queries {
|
||||
for _, ep := range endpoints {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := s.sweepEndpoint(ctx, base, ep.path, ep.urlPrefix, q, out); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *HuggingFaceSource) sweepEndpoint(
|
||||
ctx context.Context,
|
||||
base, path, urlPrefix, query string,
|
||||
out chan<- recon.Finding,
|
||||
) error {
|
||||
if s.Limiters != nil {
|
||||
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
u := fmt.Sprintf("%s%s?search=%s&limit=50", base, path, url.QueryEscape(query))
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
|
||||
if err != nil {
|
||||
return fmt.Errorf("huggingface: build request: %w", err)
|
||||
}
|
||||
req.Header.Set("Accept", "application/json")
|
||||
if s.Token != "" {
|
||||
req.Header.Set("Authorization", "Bearer "+s.Token)
|
||||
}
|
||||
|
||||
resp, err := s.client.Do(ctx, req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("huggingface %s: %w", path, err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
var items []hfItem
|
||||
if err := json.NewDecoder(resp.Body).Decode(&items); err != nil {
|
||||
return fmt.Errorf("huggingface %s: decode: %w", path, err)
|
||||
}
|
||||
|
||||
for _, item := range items {
|
||||
if item.ID == "" {
|
||||
continue
|
||||
}
|
||||
finding := recon.Finding{
|
||||
Source: urlPrefix + item.ID,
|
||||
SourceType: "recon:huggingface",
|
||||
DetectedAt: time.Now().UTC(),
|
||||
}
|
||||
select {
|
||||
case out <- finding:
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Compile-time assertion that HuggingFaceSource satisfies recon.ReconSource.
|
||||
var _ recon.ReconSource = (*HuggingFaceSource)(nil)
|
||||
204
pkg/recon/sources/huggingface_test.go
Normal file
204
pkg/recon/sources/huggingface_test.go
Normal file
@@ -0,0 +1,204 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
// hfTestRegistry builds a minimal registry with two keywords so tests assert
|
||||
// an exact Finding count (2 endpoints × 2 keywords × 1 result = 4).
|
||||
func hfTestRegistry(t *testing.T) *providers.Registry {
|
||||
t.Helper()
|
||||
return providers.NewRegistryFromProviders([]providers.Provider{
|
||||
{Name: "OpenAI", Keywords: []string{"sk-proj"}},
|
||||
{Name: "Anthropic", Keywords: []string{"sk-ant"}},
|
||||
})
|
||||
}
|
||||
|
||||
// hfTestServer serves canned single-result fixtures for both HuggingFace
// search endpoints, counting hits via the supplied atomic counters. When
// authSeen is non-nil, the /api/spaces handler records the Authorization
// header it received so callers can assert on token propagation.
func hfTestServer(t *testing.T, spacesHits, modelsHits *int32, authSeen *string) *httptest.Server {
	t.Helper()
	mux := http.NewServeMux()
	mux.HandleFunc("/api/spaces", func(w http.ResponseWriter, r *http.Request) {
		atomic.AddInt32(spacesHits, 1)
		if authSeen != nil {
			*authSeen = r.Header.Get("Authorization")
		}
		// Echo the search term into the fixture ID so each query yields a
		// distinct Finding.Source.
		q := r.URL.Query().Get("search")
		payload := []map[string]string{
			{"id": fmt.Sprintf("acme/space-%s", q)},
		}
		w.Header().Set("Content-Type", "application/json")
		_ = json.NewEncoder(w).Encode(payload)
	})
	mux.HandleFunc("/api/models", func(w http.ResponseWriter, r *http.Request) {
		atomic.AddInt32(modelsHits, 1)
		q := r.URL.Query().Get("search")
		payload := []map[string]string{
			{"id": fmt.Sprintf("acme/model-%s", q)},
		}
		w.Header().Set("Content-Type", "application/json")
		_ = json.NewEncoder(w).Encode(payload)
	})
	return httptest.NewServer(mux)
}
|
||||
|
||||
func TestHuggingFaceEnabledAlwaysTrue(t *testing.T) {
|
||||
if !(&HuggingFaceSource{}).Enabled(recon.Config{}) {
|
||||
t.Fatal("HuggingFace should be enabled even without token")
|
||||
}
|
||||
if !(&HuggingFaceSource{Token: "hf_xxx"}).Enabled(recon.Config{}) {
|
||||
t.Fatal("HuggingFace should be enabled with token")
|
||||
}
|
||||
}
|
||||
|
||||
// TestHuggingFaceSweepHitsBothEndpoints runs a full sweep against the fixture
// server and asserts the exact fan-out: 2 keywords × 2 endpoints = 4 requests
// and 4 findings, split across spaces and models URLs.
func TestHuggingFaceSweepHitsBothEndpoints(t *testing.T) {
	var spacesHits, modelsHits int32
	ts := hfTestServer(t, &spacesHits, &modelsHits, nil)
	defer ts.Close()

	reg := hfTestRegistry(t)
	src := NewHuggingFaceSource(HuggingFaceConfig{
		Token:    "hf_test",
		BaseURL:  ts.URL,
		Registry: reg,
		Limiters: nil, // bypass rate limiter for tests
	})

	out := make(chan recon.Finding, 16)
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	if err := src.Sweep(ctx, "", out); err != nil {
		t.Fatalf("Sweep: %v", err)
	}
	close(out)

	findings := make([]recon.Finding, 0)
	for f := range out {
		findings = append(findings, f)
	}

	if len(findings) != 4 {
		t.Fatalf("expected 4 findings, got %d", len(findings))
	}
	if atomic.LoadInt32(&spacesHits) != 2 {
		t.Errorf("expected 2 /api/spaces hits, got %d", spacesHits)
	}
	if atomic.LoadInt32(&modelsHits) != 2 {
		t.Errorf("expected 2 /api/models hits, got %d", modelsHits)
	}

	var sawSpace, sawModel bool
	for _, f := range findings {
		if f.SourceType != "recon:huggingface" {
			t.Errorf("wrong SourceType: %q", f.SourceType)
		}
		// Case order matters: the spaces prefix is a superset of the model
		// prefix, so it must be matched first.
		switch {
		case strings.HasPrefix(f.Source, "https://huggingface.co/spaces/"):
			sawSpace = true
		case strings.HasPrefix(f.Source, "https://huggingface.co/"):
			sawModel = true
		default:
			t.Errorf("unexpected Source URL: %q", f.Source)
		}
	}
	if !sawSpace || !sawModel {
		t.Errorf("expected both space and model URLs; space=%v model=%v", sawSpace, sawModel)
	}
}
|
||||
|
||||
// TestHuggingFaceAuthorizationHeader asserts that a configured token is sent
// as a Bearer Authorization header, and that no header at all is sent when
// the token is empty.
func TestHuggingFaceAuthorizationHeader(t *testing.T) {
	var authSeen string
	var s, m int32
	ts := hfTestServer(t, &s, &m, &authSeen)
	defer ts.Close()

	reg := hfTestRegistry(t)
	src := NewHuggingFaceSource(HuggingFaceConfig{
		Token:    "hf_secret",
		BaseURL:  ts.URL,
		Registry: reg,
		Limiters: nil,
	})
	out := make(chan recon.Finding, 16)
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	if err := src.Sweep(ctx, "", out); err != nil {
		t.Fatalf("Sweep: %v", err)
	}
	close(out)
	// Drain so the channel doesn't leak findings into the next phase.
	for range out {
	}
	if authSeen != "Bearer hf_secret" {
		t.Errorf("expected 'Bearer hf_secret', got %q", authSeen)
	}

	// Without token
	authSeen = ""
	var s2, m2 int32
	ts2 := hfTestServer(t, &s2, &m2, &authSeen)
	defer ts2.Close()
	src2 := NewHuggingFaceSource(HuggingFaceConfig{
		BaseURL:  ts2.URL,
		Registry: reg,
		Limiters: nil,
	})
	out2 := make(chan recon.Finding, 16)
	if err := src2.Sweep(ctx, "", out2); err != nil {
		t.Fatalf("Sweep unauth: %v", err)
	}
	close(out2)
	for range out2 {
	}
	if authSeen != "" {
		t.Errorf("expected no Authorization header when token empty, got %q", authSeen)
	}
}
|
||||
|
||||
// TestHuggingFaceContextCancellation points the source at a handler that
// stalls past the 100ms context deadline and asserts Sweep surfaces an error
// instead of hanging.
func TestHuggingFaceContextCancellation(t *testing.T) {
	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// Hold the response until either the client gives up or 2s pass;
		// the request context unblocks the handler promptly on cancel.
		select {
		case <-r.Context().Done():
			return
		case <-time.After(2 * time.Second):
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte("[]"))
		}
	}))
	defer ts.Close()

	reg := hfTestRegistry(t)
	src := NewHuggingFaceSource(HuggingFaceConfig{
		BaseURL:  ts.URL,
		Registry: reg,
		Limiters: nil,
	})

	ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer cancel()
	out := make(chan recon.Finding, 16)
	if err := src.Sweep(ctx, "", out); err == nil {
		t.Fatal("expected error on cancelled context")
	}
}
|
||||
|
||||
func TestHuggingFaceRateLimitTokenMode(t *testing.T) {
|
||||
withTok := &HuggingFaceSource{Token: "hf_xxx"}
|
||||
noTok := &HuggingFaceSource{}
|
||||
if withTok.RateLimit() == noTok.RateLimit() {
|
||||
t.Fatal("rate limit should differ based on token presence")
|
||||
}
|
||||
if withTok.RateLimit() < noTok.RateLimit() {
|
||||
t.Fatalf("authenticated rate (%v) should be faster (larger) than unauth (%v)",
|
||||
withTok.RateLimit(), noTok.RateLimit())
|
||||
}
|
||||
}
|
||||
240
pkg/recon/sources/integration_test.go
Normal file
240
pkg/recon/sources/integration_test.go
Normal file
@@ -0,0 +1,240 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
// TestIntegration_AllSources_SweepAll spins up a single multiplexed httptest
// server that serves canned fixtures for every Phase 10 code-hosting source,
// registers the sources (with BaseURL overrides pointing at the test server)
// onto a fresh recon.Engine, runs SweepAll, and asserts at least one Finding
// was emitted per SourceType across all ten sources.
//
// RegisterAll cannot be used directly because it wires production URLs; the
// test exercises the same code paths by constructing each source identically
// to RegisterAll but with BaseURL/Platforms overrides.
func TestIntegration_AllSources_SweepAll(t *testing.T) {
	mux := http.NewServeMux()

	// ---- GitHub /search/code ----
	mux.HandleFunc("/search/code", func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		_ = json.NewEncoder(w).Encode(ghSearchResponse{
			Items: []ghCodeItem{
				{HTMLURL: "https://github.com/alice/leak/blob/main/.env"},
			},
		})
	})

	// ---- GitLab /api/v4/search ----
	mux.HandleFunc("/api/v4/search", func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write([]byte(`[{"basename":"keys","data":"sk-proj-abc","path":"keys.env","project_id":42,"ref":"main","startline":1}]`))
	})

	// ---- Bitbucket /2.0/workspaces/<ws>/search/code ----
	mux.HandleFunc("/2.0/workspaces/kh-test/search/code", func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write([]byte(`{"values":[{"content_match_count":1,"page_url":"https://bitbucket.org/kh-test/repo/src/main/keys.env","file":{"path":"keys.env","commit":{"hash":"deadbeef"}}}]}`))
	})

	// ---- Gist /gists/public + raw content ----
	// The gist fixture's raw_url must be absolute and point back at this
	// server, hence baseFromReq.
	mux.HandleFunc("/gists/public", func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		body := fmt.Sprintf(`[{"html_url":"https://gist.github.com/alice/gistleak","files":{"f.py":{"filename":"f.py","raw_url":"%s/raw/gist1"}}}]`, baseFromReq(r))
		_, _ = w.Write([]byte(body))
	})
	mux.HandleFunc("/raw/gist1", func(w http.ResponseWriter, r *http.Request) {
		_, _ = w.Write([]byte("api_key = sk-proj-ABCDEF"))
	})

	// ---- Codeberg /api/v1/repos/search ----
	mux.HandleFunc("/api/v1/repos/search", func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write([]byte(`{"ok":true,"data":[{"full_name":"bob/keys","html_url":"https://codeberg.org/bob/keys"}]}`))
	})

	// ---- HuggingFace /api/spaces + /api/models ----
	hfHandler := func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write([]byte(`[{"id":"alice/leaky-space"}]`))
	}
	mux.HandleFunc("/api/spaces", hfHandler)
	mux.HandleFunc("/api/models", hfHandler)

	// ---- Replit /search?q=...&type=repls (HTML) ----
	// ---- CodeSandbox /search?query=...&type=sandboxes (HTML) ----
	// Both hit the same /search path; distinguish on query params.
	mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/html")
		switch r.URL.Query().Get("type") {
		case "repls":
			_, _ = w.Write([]byte(`<html><body>
				<a href="/@alice/leaky-repl">hit</a>
				<a href="/other/path">skip</a>
				</body></html>`))
		case "sandboxes":
			_, _ = w.Write([]byte(`<html><body>
				<a href="/s/leaky-sandbox">hit</a>
				<a href="/other">skip</a>
				</body></html>`))
		default:
			w.WriteHeader(http.StatusNotFound)
		}
	})

	// ---- SandboxesSource sub-platforms ----
	mux.HandleFunc("/codepen-search", func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/html")
		_, _ = w.Write([]byte(`<html><body><a href="/alice/pen/AbCd1234">hit</a></body></html>`))
	})
	mux.HandleFunc("/jsfiddle-search", func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write([]byte(`{"results":[{"url":"https://jsfiddle.net/u/leaky/"}]}`))
	})

	// ---- Kaggle /api/v1/kernels/list ----
	mux.HandleFunc("/api/v1/kernels/list", func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write([]byte(`[{"ref":"alice/leaky-notebook"}]`))
	})

	srv := httptest.NewServer(mux)
	defer srv.Close()

	reg := providers.NewRegistryFromProviders([]providers.Provider{
		{Name: "openai", Keywords: []string{"sk-proj-"}},
	})
	lim := recon.NewLimiterRegistry()

	eng := recon.NewEngine()

	// GitHub — token + BaseURL override. Use the real constructor so `client`
	// is initialized, then retarget BaseURL at the test server.
	ghs := NewGitHubSource("ghp-test", reg, lim)
	ghs.BaseURL = srv.URL
	eng.Register(ghs)
	// GitLab
	eng.Register(&GitLabSource{
		Token:    "glpat-test",
		BaseURL:  srv.URL,
		Registry: reg,
		Limiters: lim,
	})
	// Bitbucket
	eng.Register(&BitbucketSource{
		Token:     "bb-test",
		Workspace: "kh-test",
		BaseURL:   srv.URL,
		Registry:  reg,
		Limiters:  lim,
	})
	// Gist — uses same BaseURL for /gists/public; raw URLs are absolute in fixture.
	eng.Register(&GistSource{
		Token:    "ghp-test",
		BaseURL:  srv.URL,
		Registry: reg,
		Limiters: lim,
	})
	// Codeberg
	eng.Register(&CodebergSource{
		BaseURL:  srv.URL,
		Registry: reg,
		Limiters: lim,
	})
	// HuggingFace
	eng.Register(NewHuggingFaceSource(HuggingFaceConfig{
		BaseURL:  srv.URL,
		Registry: reg,
		Limiters: lim,
	}))
	// Replit
	eng.Register(&ReplitSource{
		BaseURL:  srv.URL,
		Registry: reg,
		Limiters: lim,
	})
	// CodeSandbox
	eng.Register(&CodeSandboxSource{
		BaseURL:  srv.URL,
		Registry: reg,
		Limiters: lim,
	})
	// Sandboxes — inject test sub-platforms that hit srv.URL.
	eng.Register(&SandboxesSource{
		Platforms: []subPlatform{
			{Name: "codepen", SearchPath: "/codepen-search?q=%s", ResultLinkRegex: `^/[^/]+/pen/[a-zA-Z0-9]+$`, IsJSON: false},
			{Name: "jsfiddle", SearchPath: "/jsfiddle-search?q=%s", IsJSON: true, JSONItemsKey: "results", JSONURLKey: "url"},
		},
		Registry: reg,
		Limiters: lim,
		Client:   NewClient(),
		BaseURL:  srv.URL,
	})
	// Kaggle
	eng.Register(&KaggleSource{
		User:       "kh-user",
		Key:        "kh-key",
		BaseURL:    srv.URL,
		WebBaseURL: "https://www.kaggle.com",
		Registry:   reg,
		Limiters:   lim,
		client:     NewClient(),
	})

	// Sanity: all 10 sources registered.
	if n := len(eng.List()); n != 10 {
		t.Fatalf("expected 10 sources on engine, got %d: %v", n, eng.List())
	}

	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	findings, err := eng.SweepAll(ctx, recon.Config{Query: "ignored"})
	if err != nil {
		t.Fatalf("SweepAll returned error: %v", err)
	}

	// Group findings by SourceType and assert every expected bucket is present.
	byType := make(map[string]int)
	for _, f := range findings {
		byType[f.SourceType]++
	}

	wantTypes := []string{
		"recon:github",
		"recon:gitlab",
		"recon:bitbucket",
		"recon:gist",
		"recon:codeberg",
		"recon:huggingface",
		"recon:replit",
		"recon:codesandbox",
		"recon:sandboxes",
		"recon:kaggle",
	}
	for _, st := range wantTypes {
		if byType[st] == 0 {
			t.Errorf("expected at least one finding with SourceType=%q, got none\nall findings: %+v", st, findings)
		}
	}
}
|
||||
|
||||
// baseFromReq reconstructs the scheme+host of the inbound request so handlers
|
||||
// can build absolute raw URLs pointing back at the same httptest server.
|
||||
func baseFromReq(r *http.Request) string {
|
||||
scheme := "http"
|
||||
if r.TLS != nil {
|
||||
scheme = "https"
|
||||
}
|
||||
return scheme + "://" + r.Host
|
||||
}
|
||||
149
pkg/recon/sources/kaggle.go
Normal file
149
pkg/recon/sources/kaggle.go
Normal file
@@ -0,0 +1,149 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"time"
|
||||
|
||||
"golang.org/x/time/rate"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
// KaggleSource implements recon.ReconSource against the Kaggle public REST API.
//
// RECON-CODE-09: queries GET /api/v1/kernels/list?search=<q>&pageSize=50 with
// HTTP Basic authentication (username + API key from kaggle.json). Emits
// engine.Finding entries for every returned kernel ref, with Source pointing
// to https://www.kaggle.com/code/<ref>.
type KaggleSource struct {
	// User is the Basic-auth username; Key is the matching API key. Both
	// must be non-empty for Enabled() to report true.
	User       string
	Key        string
	BaseURL    string // API base, default https://www.kaggle.com
	WebBaseURL string // Web UI base for Finding URLs, default https://www.kaggle.com
	// Registry drives query generation via BuildQueries.
	Registry *providers.Registry
	// Limiters is the shared per-source rate-limiter registry (nil skips limiting).
	Limiters *recon.LimiterRegistry
	// client is the shared HTTP wrapper; set by NewKaggleSource.
	client *Client
}
|
||||
|
||||
// NewKaggleSource constructs a KaggleSource with default URLs and a shared Client.
// Empty user/key are accepted; the resulting source simply reports
// Enabled()==false and its Sweep is a no-op.
func NewKaggleSource(user, key string, reg *providers.Registry, lim *recon.LimiterRegistry) *KaggleSource {
	return &KaggleSource{
		User:       user,
		Key:        key,
		BaseURL:    "https://www.kaggle.com",
		WebBaseURL: "https://www.kaggle.com",
		Registry:   reg,
		Limiters:   lim,
		client:     NewClient(),
	}
}
|
||||
|
||||
// Name returns the stable source identifier.
func (s *KaggleSource) Name() string { return "kaggle" }

// RateLimit enforces Kaggle's documented 60 requests/minute ceiling
// (one request per second).
func (s *KaggleSource) RateLimit() rate.Limit { return rate.Every(1 * time.Second) }

// Burst returns the per-source burst capacity.
func (s *KaggleSource) Burst() int { return 1 }

// RespectsRobots is false — Kaggle exposes a public REST API, not scraped HTML.
func (s *KaggleSource) RespectsRobots() bool { return false }

// Enabled reports whether both User and Key credentials are present.
func (s *KaggleSource) Enabled(_ recon.Config) bool {
	return s.User != "" && s.Key != ""
}
|
||||
|
||||
// Sweep iterates provider keyword queries, calling the Kaggle kernels/list API
|
||||
// with Basic auth for each. For every returned kernel ref, a Finding is emitted
|
||||
// on out with SourceType "recon:kaggle" and Source pointing at the web UI URL.
|
||||
//
|
||||
// Missing credentials short-circuit to nil without issuing any HTTP calls.
|
||||
func (s *KaggleSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
|
||||
if s.User == "" || s.Key == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
base := s.BaseURL
|
||||
if base == "" {
|
||||
base = "https://www.kaggle.com"
|
||||
}
|
||||
web := s.WebBaseURL
|
||||
if web == "" {
|
||||
web = "https://www.kaggle.com"
|
||||
}
|
||||
|
||||
queries := BuildQueries(s.Registry, "kaggle")
|
||||
|
||||
for _, q := range queries {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
if s.Limiters != nil {
|
||||
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
endpoint := fmt.Sprintf("%s/api/v1/kernels/list?search=%s&pageSize=50", base, url.QueryEscape(q))
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
req.SetBasicAuth(s.User, s.Key)
|
||||
req.Header.Set("Accept", "application/json")
|
||||
|
||||
resp, err := s.client.Do(ctx, req)
|
||||
if err != nil {
|
||||
if errors.Is(err, ErrUnauthorized) {
|
||||
return err
|
||||
}
|
||||
// Sources downgrade on transient errors rather than aborting
|
||||
// the whole sweep — skip to the next query.
|
||||
continue
|
||||
}
|
||||
|
||||
var kernels []kaggleKernel
|
||||
decodeErr := json.NewDecoder(resp.Body).Decode(&kernels)
|
||||
resp.Body.Close()
|
||||
if decodeErr != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
for _, k := range kernels {
|
||||
if k.Ref == "" {
|
||||
continue
|
||||
}
|
||||
f := recon.Finding{
|
||||
Confidence: "low",
|
||||
Source: web + "/code/" + k.Ref,
|
||||
SourceType: "recon:kaggle",
|
||||
DetectedAt: time.Now(),
|
||||
}
|
||||
select {
|
||||
case out <- f:
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// kaggleKernel mirrors the subset of fields returned by /api/v1/kernels/list
// that this source consumes. Additional fields (title, author, language) are
// ignored on purpose to keep the decoder tolerant of future API changes.
type kaggleKernel struct {
	// Ref is the "<owner>/<slug>" kernel identifier used to build the web URL.
	Ref string `json:"ref"`
}

// Compile-time assertion that KaggleSource satisfies recon.ReconSource.
var _ recon.ReconSource = (*KaggleSource)(nil)
|
||||
204
pkg/recon/sources/kaggle_test.go
Normal file
204
pkg/recon/sources/kaggle_test.go
Normal file
@@ -0,0 +1,204 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"errors"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
func kaggleTestRegistry() *providers.Registry {
|
||||
return providers.NewRegistryFromProviders([]providers.Provider{
|
||||
{Name: "openai", Keywords: []string{"sk-proj-"}},
|
||||
})
|
||||
}
|
||||
|
||||
// newKaggleSource builds a KaggleSource via the real constructor (so the
// client is initialized), then retargets its API base at the test server
// while keeping the production web base for Finding URLs.
func newKaggleSource(t *testing.T, user, key, baseURL string) *KaggleSource {
	t.Helper()
	s := NewKaggleSource(user, key, kaggleTestRegistry(), recon.NewLimiterRegistry())
	s.BaseURL = baseURL
	s.WebBaseURL = "https://www.kaggle.com"
	return s
}
|
||||
|
||||
func TestKaggle_Enabled(t *testing.T) {
|
||||
reg := kaggleTestRegistry()
|
||||
lim := recon.NewLimiterRegistry()
|
||||
|
||||
cases := []struct {
|
||||
user, key string
|
||||
want bool
|
||||
}{
|
||||
{"", "", false},
|
||||
{"user", "", false},
|
||||
{"", "key", false},
|
||||
{"user", "key", true},
|
||||
}
|
||||
for _, c := range cases {
|
||||
s := NewKaggleSource(c.user, c.key, reg, lim)
|
||||
if got := s.Enabled(recon.Config{}); got != c.want {
|
||||
t.Errorf("Enabled(user=%q,key=%q) = %v, want %v", c.user, c.key, got, c.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestKaggle_Sweep_BasicAuthAndFindings asserts the full request/response
// path: Basic auth credentials on the wire, search/pageSize query params,
// and one Finding per kernel ref with the expected web-UI Source URLs.
func TestKaggle_Sweep_BasicAuthAndFindings(t *testing.T) {
	var gotAuth string
	var gotQuery string
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		gotAuth = r.Header.Get("Authorization")
		gotQuery = r.URL.Query().Get("search")
		if r.URL.Query().Get("pageSize") != "50" {
			t.Errorf("expected pageSize=50, got %q", r.URL.Query().Get("pageSize"))
		}
		w.Header().Set("Content-Type", "application/json")
		w.WriteHeader(200)
		_, _ = w.Write([]byte(`[{"ref":"alice/notebook-one","title":"one"},{"ref":"bob/notebook-two","title":"two"}]`))
	}))
	defer srv.Close()

	s := newKaggleSource(t, "testuser", "testkey", srv.URL)

	out := make(chan recon.Finding, 8)
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	if err := s.Sweep(ctx, "", out); err != nil {
		t.Fatalf("Sweep returned error: %v", err)
	}
	close(out)

	// Decode the Authorization header back into user:key to prove real
	// Basic auth was used, not a hand-rolled header.
	if !strings.HasPrefix(gotAuth, "Basic ") {
		t.Fatalf("expected Basic auth header, got %q", gotAuth)
	}
	decoded, err := base64.StdEncoding.DecodeString(strings.TrimPrefix(gotAuth, "Basic "))
	if err != nil {
		t.Fatalf("failed to decode Basic auth: %v", err)
	}
	if string(decoded) != "testuser:testkey" {
		t.Fatalf("expected credentials testuser:testkey, got %q", string(decoded))
	}

	if gotQuery != "sk-proj-" {
		t.Errorf("expected search=sk-proj-, got %q", gotQuery)
	}

	var findings []recon.Finding
	for f := range out {
		findings = append(findings, f)
	}
	if len(findings) != 2 {
		t.Fatalf("expected 2 findings, got %d", len(findings))
	}
	// Track both expected Sources; every finding must map to exactly one.
	wantSources := map[string]bool{
		"https://www.kaggle.com/code/alice/notebook-one": false,
		"https://www.kaggle.com/code/bob/notebook-two":   false,
	}
	for _, f := range findings {
		if f.SourceType != "recon:kaggle" {
			t.Errorf("expected SourceType recon:kaggle, got %q", f.SourceType)
		}
		if _, ok := wantSources[f.Source]; !ok {
			t.Errorf("unexpected Source: %q", f.Source)
		}
		wantSources[f.Source] = true
	}
	for src, seen := range wantSources {
		if !seen {
			t.Errorf("missing expected finding source: %s", src)
		}
	}
}
|
||||
|
||||
// TestKaggle_Sweep_MissingCredentials_NoHTTP verifies the credential
// short-circuit: with either User or Key absent, Sweep returns nil without
// issuing a single HTTP request.
func TestKaggle_Sweep_MissingCredentials_NoHTTP(t *testing.T) {
	var calls int32
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		atomic.AddInt32(&calls, 1)
		w.WriteHeader(200)
		_, _ = w.Write([]byte("[]"))
	}))
	defer srv.Close()

	// Missing key.
	s := newKaggleSource(t, "testuser", "", srv.URL)
	out := make(chan recon.Finding, 1)
	if err := s.Sweep(context.Background(), "", out); err != nil {
		t.Fatalf("expected nil error for missing key, got %v", err)
	}
	close(out)

	// Missing user.
	s2 := newKaggleSource(t, "", "testkey", srv.URL)
	out2 := make(chan recon.Finding, 1)
	if err := s2.Sweep(context.Background(), "", out2); err != nil {
		t.Fatalf("expected nil error for missing user, got %v", err)
	}
	close(out2)

	if n := atomic.LoadInt32(&calls); n != 0 {
		t.Fatalf("expected 0 HTTP calls when credentials missing, got %d", n)
	}
}
|
||||
|
||||
func TestKaggle_Sweep_Unauthorized(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(401)
|
||||
_, _ = w.Write([]byte("bad creds"))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
s := newKaggleSource(t, "testuser", "testkey", srv.URL)
|
||||
out := make(chan recon.Finding, 1)
|
||||
err := s.Sweep(context.Background(), "", out)
|
||||
if err == nil {
|
||||
t.Fatal("expected error on 401")
|
||||
}
|
||||
if !errors.Is(err, ErrUnauthorized) {
|
||||
t.Fatalf("expected ErrUnauthorized, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestKaggle_Sweep_CtxCancellation runs Sweep with an already-cancelled
// context and asserts it fails fast with context.Canceled (the slow handler
// must never be reached).
func TestKaggle_Sweep_CtxCancellation(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		time.Sleep(2 * time.Second)
		w.WriteHeader(200)
		_, _ = w.Write([]byte("[]"))
	}))
	defer srv.Close()

	s := newKaggleSource(t, "testuser", "testkey", srv.URL)
	// Cancel before sweeping so the very first ctx.Err() check fires.
	ctx, cancel := context.WithCancel(context.Background())
	cancel()

	out := make(chan recon.Finding, 1)
	err := s.Sweep(ctx, "", out)
	if err == nil {
		t.Fatal("expected error from cancelled context")
	}
	if !errors.Is(err, context.Canceled) {
		t.Fatalf("expected context.Canceled, got %v", err)
	}
}
|
||||
|
||||
func TestKaggle_ReconSourceInterface(t *testing.T) {
|
||||
var _ recon.ReconSource = (*KaggleSource)(nil)
|
||||
s := NewKaggleSource("u", "k", nil, nil)
|
||||
if s.Name() != "kaggle" {
|
||||
t.Errorf("Name = %q, want kaggle", s.Name())
|
||||
}
|
||||
if s.Burst() != 1 {
|
||||
t.Errorf("Burst = %d, want 1", s.Burst())
|
||||
}
|
||||
if s.RespectsRobots() {
|
||||
t.Error("RespectsRobots should be false")
|
||||
}
|
||||
if s.RateLimit() <= 0 {
|
||||
t.Error("RateLimit should be > 0")
|
||||
}
|
||||
}
|
||||
@@ -6,27 +6,93 @@ import (
|
||||
)
|
||||
|
||||
// SourcesConfig carries per-source credentials and shared dependencies read
// from viper/env by cmd/recon.go and handed to RegisterAll.
//
// Fields are populated from environment variables (GITHUB_TOKEN, GITLAB_TOKEN,
// ...) or viper config keys (recon.github.token, ...). Empty values are
// permitted: the corresponding source is still registered on the engine, but
// its Enabled() reports false so SweepAll skips it cleanly.
type SourcesConfig struct {
	// GitHub / Gist share the same token.
	GitHubToken string
	// GitLab personal access token.
	GitLabToken string
	// Bitbucket Cloud app password or OAuth token + required workspace slug.
	BitbucketToken     string
	BitbucketWorkspace string
	// Codeberg (Gitea) token — optional, raises rate limit when present.
	CodebergToken string
	// HuggingFace Hub token — optional, raises rate limit when present.
	HuggingFaceToken string
	// Kaggle Basic-auth username + API key (both required to enable the source).
	KaggleUser string
	KaggleKey  string

	// Registry drives query generation for every source via BuildQueries.
	Registry *providers.Registry
	// Limiters is the shared per-source rate-limiter registry.
	Limiters *recon.LimiterRegistry
}
|
||||
|
||||
// RegisterAll registers every Phase 10 code-hosting source on engine.
|
||||
// Wave 2 plans append their source constructors here via additional
|
||||
// registerXxx helpers in this file. Plan 10-09 writes the final list.
|
||||
//
|
||||
// All ten sources are registered unconditionally so that cmd/recon.go can
|
||||
// surface the full catalog via `keyhunter recon list` regardless of which
|
||||
// credentials are configured. Sources without required credentials return
|
||||
// Enabled()==false so SweepAll skips them without erroring.
|
||||
//
|
||||
// A nil engine is treated as a no-op (not an error) — callers in broken init
|
||||
// paths shouldn't panic.
|
||||
func RegisterAll(engine *recon.Engine, cfg SourcesConfig) {
|
||||
if engine == nil {
|
||||
return
|
||||
}
|
||||
_ = cfg // wired up in Wave 2 + Plan 10-09
|
||||
// Populated by Plan 10-09 (after Wave 2 lands individual source files).
|
||||
reg := cfg.Registry
|
||||
lim := cfg.Limiters
|
||||
|
||||
// API sources with constructors.
|
||||
engine.Register(NewGitHubSource(cfg.GitHubToken, reg, lim))
|
||||
engine.Register(NewKaggleSource(cfg.KaggleUser, cfg.KaggleKey, reg, lim))
|
||||
engine.Register(NewHuggingFaceSource(HuggingFaceConfig{
|
||||
Token: cfg.HuggingFaceToken,
|
||||
Registry: reg,
|
||||
Limiters: lim,
|
||||
}))
|
||||
|
||||
// API sources exposed as struct literals (no New* constructor in Wave 2).
|
||||
engine.Register(&GitLabSource{
|
||||
Token: cfg.GitLabToken,
|
||||
Registry: reg,
|
||||
Limiters: lim,
|
||||
})
|
||||
engine.Register(&BitbucketSource{
|
||||
Token: cfg.BitbucketToken,
|
||||
Workspace: cfg.BitbucketWorkspace,
|
||||
Registry: reg,
|
||||
Limiters: lim,
|
||||
})
|
||||
engine.Register(&GistSource{
|
||||
Token: cfg.GitHubToken,
|
||||
Registry: reg,
|
||||
Limiters: lim,
|
||||
})
|
||||
engine.Register(&CodebergSource{
|
||||
Token: cfg.CodebergToken,
|
||||
Registry: reg,
|
||||
Limiters: lim,
|
||||
})
|
||||
|
||||
// Scraping sources (credentialless).
|
||||
engine.Register(&ReplitSource{
|
||||
Registry: reg,
|
||||
Limiters: lim,
|
||||
})
|
||||
engine.Register(&CodeSandboxSource{
|
||||
Registry: reg,
|
||||
Limiters: lim,
|
||||
})
|
||||
engine.Register(&SandboxesSource{
|
||||
Registry: reg,
|
||||
Limiters: lim,
|
||||
})
|
||||
}
|
||||
|
||||
65
pkg/recon/sources/register_test.go
Normal file
65
pkg/recon/sources/register_test.go
Normal file
@@ -0,0 +1,65 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
// registerTestRegistry builds a minimal registry with one synthetic provider so
|
||||
// BuildQueries inside individual sources does not panic.
|
||||
func registerTestRegistry() *providers.Registry {
|
||||
return providers.NewRegistryFromProviders([]providers.Provider{
|
||||
{Name: "openai", Keywords: []string{"sk-proj-"}},
|
||||
})
|
||||
}
|
||||
|
||||
// TestRegisterAll_WiresAllTenSources asserts that RegisterAll registers every
|
||||
// Phase 10 code-hosting source by its stable name on a fresh engine.
|
||||
func TestRegisterAll_WiresAllTenSources(t *testing.T) {
|
||||
eng := recon.NewEngine()
|
||||
cfg := SourcesConfig{
|
||||
Registry: registerTestRegistry(),
|
||||
Limiters: recon.NewLimiterRegistry(),
|
||||
}
|
||||
RegisterAll(eng, cfg)
|
||||
|
||||
got := eng.List()
|
||||
want := []string{
|
||||
"bitbucket",
|
||||
"codeberg",
|
||||
"codesandbox",
|
||||
"gist",
|
||||
"github",
|
||||
"gitlab",
|
||||
"huggingface",
|
||||
"kaggle",
|
||||
"replit",
|
||||
"sandboxes",
|
||||
}
|
||||
if !reflect.DeepEqual(got, want) {
|
||||
t.Fatalf("RegisterAll names mismatch\n got: %v\nwant: %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRegisterAll_MissingCredsStillRegistered asserts that sources whose
|
||||
// credentials are absent are still registered (so eng.List() reports them),
|
||||
// but their Enabled() returns false. This keeps the CLI surface uniform
|
||||
// regardless of which tokens are configured.
|
||||
func TestRegisterAll_MissingCredsStillRegistered(t *testing.T) {
|
||||
eng := recon.NewEngine()
|
||||
RegisterAll(eng, SourcesConfig{
|
||||
Registry: registerTestRegistry(),
|
||||
Limiters: recon.NewLimiterRegistry(),
|
||||
})
|
||||
|
||||
if n := len(eng.List()); n != 10 {
|
||||
t.Fatalf("expected 10 sources registered, got %d: %v", n, eng.List())
|
||||
}
|
||||
|
||||
// SweepAll with an empty config should filter out cred-gated sources
|
||||
// (github, gitlab, bitbucket, gist, kaggle) and still run the credless
|
||||
// ones. We only check List() here; the integration test covers Sweep.
|
||||
}
|
||||
Reference in New Issue
Block a user