diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md
index 1d6c98e..498bd84 100644
--- a/.planning/ROADMAP.md
+++ b/.planning/ROADMAP.md
@@ -215,7 +215,17 @@ Plans:
3. `keyhunter recon --sources=gist,bitbucket,codeberg` scans public gists, Bitbucket repos, and Codeberg/Gitea instances
4. `keyhunter recon --sources=replit,codesandbox,kaggle` scans public repls, sandboxes, and notebooks
5. All code hosting source findings are stored in the database with source attribution and deduplication
-**Plans**: TBD
+**Plans**: 9 plans
+Plans:
+- [ ] 10-01-PLAN.md — Shared HTTP client + provider-query generator + RegisterAll skeleton
+- [ ] 10-02-PLAN.md — GitHubSource (RECON-CODE-01)
+- [ ] 10-03-PLAN.md — GitLabSource (RECON-CODE-02)
+- [ ] 10-04-PLAN.md — BitbucketSource + GistSource (RECON-CODE-03, RECON-CODE-04)
+- [ ] 10-05-PLAN.md — CodebergSource/Gitea (RECON-CODE-05)
+- [ ] 10-06-PLAN.md — HuggingFaceSource (RECON-CODE-08)
+- [ ] 10-07-PLAN.md — Replit + CodeSandbox + Sandboxes scrapers (RECON-CODE-06, RECON-CODE-07, RECON-CODE-10)
+- [ ] 10-08-PLAN.md — KaggleSource (RECON-CODE-09)
+- [ ] 10-09-PLAN.md — RegisterAll wiring + CLI integration + end-to-end test
### Phase 11: OSINT Search & Paste
**Goal**: Users can run automated search engine dorking against Google, Bing, DuckDuckGo, Yandex, and Brave, and scan 15+ paste site aggregations for leaked API keys
diff --git a/.planning/phases/10-osint-code-hosting/10-01-PLAN.md b/.planning/phases/10-osint-code-hosting/10-01-PLAN.md
new file mode 100644
index 0000000..d7d627a
--- /dev/null
+++ b/.planning/phases/10-osint-code-hosting/10-01-PLAN.md
@@ -0,0 +1,331 @@
+---
+phase: 10-osint-code-hosting
+plan: 01
+type: execute
+wave: 1
+depends_on: []
+files_modified:
+ - pkg/recon/sources/doc.go
+ - pkg/recon/sources/httpclient.go
+ - pkg/recon/sources/httpclient_test.go
+ - pkg/recon/sources/queries.go
+ - pkg/recon/sources/queries_test.go
+ - pkg/recon/sources/register.go
+autonomous: true
+requirements: []
+must_haves:
+ truths:
+ - "Shared retry HTTP client honors ctx cancellation and Retry-After on 429/403"
+ - "Provider registry drives per-source query templates (no hardcoded literals)"
+ - "Empty source registry compiles and exposes RegisterAll(engine, cfg)"
+ artifacts:
+ - path: "pkg/recon/sources/httpclient.go"
+ provides: "Retrying *http.Client with context + Retry-After handling"
+ - path: "pkg/recon/sources/queries.go"
+ provides: "BuildQueries(registry, sourceName) []string generator"
+ - path: "pkg/recon/sources/register.go"
+ provides: "RegisterAll(engine *recon.Engine, cfg SourcesConfig) bootstrap"
+ key_links:
+ - from: "pkg/recon/sources/httpclient.go"
+ to: "net/http + context + golang.org/x/time/rate"
+ via: "DoWithRetry(ctx, req, limiter) (*http.Response, error)"
+ pattern: "DoWithRetry"
+ - from: "pkg/recon/sources/queries.go"
+ to: "pkg/providers.Registry"
+ via: "BuildQueries iterates reg.List() and formats provider keywords"
+ pattern: "BuildQueries"
+---
+
+
+Establish the shared foundation for all Phase 10 code hosting sources: a retry-aware HTTP
+client wrapper, a provider→query template generator driven by the provider registry, and
+an empty RegisterAll bootstrap that Plan 10-09 will fill in. No individual source is
+implemented here — this plan exists so Wave 2 plans (10-02..10-08) can run in parallel
+without fighting over shared helpers.
+
+Purpose: Deduplicate retry/rate-limit/backoff logic across 10 sources; centralize query
+generation so providers added later automatically flow to every source.
+Output: Compilable `pkg/recon/sources` package skeleton with tested helpers.
+
+
+
+@$HOME/.claude/get-shit-done/workflows/execute-plan.md
+@$HOME/.claude/get-shit-done/templates/summary.md
+
+
+
+@.planning/PROJECT.md
+@.planning/ROADMAP.md
+@.planning/STATE.md
+@.planning/phases/10-osint-code-hosting/10-CONTEXT.md
+@pkg/recon/source.go
+@pkg/recon/limiter.go
+@pkg/dorks/github.go
+@pkg/providers/registry.go
+
+
+From pkg/recon/source.go:
+```go
+type ReconSource interface {
+ Name() string
+ RateLimit() rate.Limit
+ Burst() int
+ RespectsRobots() bool
+ Enabled(cfg Config) bool
+ Sweep(ctx context.Context, query string, out chan<- Finding) error
+}
+type Finding = engine.Finding
+type Config struct { Stealth, RespectRobots bool; EnabledSources []string; Query string }
+```
+
+From pkg/recon/limiter.go:
+```go
+type LimiterRegistry struct { ... }
+func NewLimiterRegistry() *LimiterRegistry
+func (lr *LimiterRegistry) Wait(ctx, name, r, burst, stealth) error
+```
+
+From pkg/providers/registry.go:
+```go
+func (r *Registry) List() []Provider
+// Provider has: Name string, Keywords []string, Patterns []Pattern, Tier int
+```
+
+From pkg/engine/finding.go:
+```go
+type Finding struct {
+ ProviderName, KeyValue, KeyMasked, Confidence, Source, SourceType string
+ LineNumber int; Offset int64; DetectedAt time.Time
+ Verified bool; VerifyStatus string; ...
+}
+```
+
+
+
+
+
+
+ Task 1: Shared retry HTTP client helper
+ pkg/recon/sources/doc.go, pkg/recon/sources/httpclient.go, pkg/recon/sources/httpclient_test.go
+
+ - Test A: 200 OK returns response unchanged, body readable
+ - Test B: 429 with Retry-After:1 triggers one retry then succeeds (verify via httptest counter)
+ - Test C: 403 with Retry-After triggers retry
+ - Test D: 401 returns ErrUnauthorized immediately, no retry
+ - Test E: Ctx cancellation during retry sleep returns ctx.Err()
+ - Test F: MaxRetries exhausted returns wrapped last-status error
+
+
+ Create `pkg/recon/sources/doc.go` with the package comment: "Package sources hosts per-OSINT-source ReconSource implementations for Phase 10 code hosting (GitHub, GitLab, Bitbucket, Gist, Codeberg, HuggingFace, Kaggle, Replit, CodeSandbox, sandboxes). Each source implements pkg/recon.ReconSource."
+
+ Create `pkg/recon/sources/httpclient.go` exporting:
+ ```go
+ package sources
+
+ import (
+ "context"
+ "errors"
+ "fmt"
+ "net/http"
+ "strconv"
+ "time"
+ )
+
+ // ErrUnauthorized is returned when an API rejects credentials (401).
+ var ErrUnauthorized = errors.New("sources: unauthorized (check credentials)")
+
+ // Client is the shared retry wrapper every Phase 10 source uses.
+ type Client struct {
+ HTTP *http.Client
+ MaxRetries int // default 2
+ UserAgent string // default "keyhunter-recon/1.0"
+ }
+
+ // NewClient returns a Client with a 30s timeout and 2 retries.
+ func NewClient() *Client {
+ return &Client{HTTP: &http.Client{Timeout: 30 * time.Second}, MaxRetries: 2, UserAgent: "keyhunter-recon/1.0"}
+ }
+
+ // Do executes req with retries on 429/403/5xx honoring Retry-After.
+ // 401 returns ErrUnauthorized wrapped with the response body.
+ // Ctx cancellation is honored during sleeps.
+ func (c *Client) Do(ctx context.Context, req *http.Request) (*http.Response, error) {
+ if req.Header.Get("User-Agent") == "" { req.Header.Set("User-Agent", c.UserAgent) }
+ var last *http.Response
+ for attempt := 0; attempt <= c.MaxRetries; attempt++ {
+ r, err := c.HTTP.Do(req.WithContext(ctx))
+ if err != nil { return nil, fmt.Errorf("sources http: %w", err) }
+ if r.StatusCode == http.StatusOK { return r, nil }
+ if r.StatusCode == http.StatusUnauthorized {
+ body := readBody(r)
+ return nil, fmt.Errorf("%w: %s", ErrUnauthorized, body)
+ }
+ retriable := r.StatusCode == 429 || r.StatusCode == 403 || r.StatusCode >= 500
+ if !retriable || attempt == c.MaxRetries {
+ body := readBody(r)
+ return nil, fmt.Errorf("sources http %d: %s", r.StatusCode, body)
+ }
+ sleep := ParseRetryAfter(r.Header.Get("Retry-After"))
+ r.Body.Close()
+ last = r
+ select {
+ case <-time.After(sleep):
+ case <-ctx.Done(): return nil, ctx.Err()
+ }
+ }
+ _ = last
+ return nil, fmt.Errorf("sources http: retries exhausted")
+ }
+
+ // ParseRetryAfter decodes integer-seconds Retry-After, defaulting to 1s.
+ func ParseRetryAfter(v string) time.Duration { ... }
+ // readBody reads up to 4KB of the body and closes it.
+ func readBody(r *http.Response) string { ... }
+ ```
+
+ Create `pkg/recon/sources/httpclient_test.go` using `net/http/httptest`:
+ - Table-driven tests for each behavior above. Use an atomic counter to verify
+ retry attempt counts. Use `httptest.NewServer` with a handler that switches on
+ a request counter.
+ - For ctx cancellation test: set Retry-After: 10, cancel ctx inside 100ms, assert
+ ctx.Err() returned within 500ms.
+
+ Do NOT build a LimiterRegistry wrapper here — each source calls its own LimiterRegistry.Wait
+ before calling Client.Do. Keeps Client single-purpose (retry only).
+
+
+ cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run TestClient -v -timeout 30s
+
+
+ All behaviors covered; Client.Do retries on 429/403/5xx honoring Retry-After; 401
+ returns ErrUnauthorized immediately; ctx cancellation respected; tests green.
+
+
+
+
+ Task 2: Provider-driven query generator + RegisterAll skeleton
+ pkg/recon/sources/queries.go, pkg/recon/sources/queries_test.go, pkg/recon/sources/register.go
+
+ - Test A: BuildQueries(reg, "github") returns one query per (provider, keyword) tuple formatted as GitHub search syntax, e.g. `"sk-proj-" in:file`
+ - Test B: BuildQueries(reg, "gitlab") returns queries formatted for GitLab search syntax (raw keyword, no `in:file`)
+ - Test C: BuildQueries(reg, "huggingface") returns bare keyword queries
+ - Test D: Unknown source name returns bare keyword queries (safe default)
+ - Test E: Providers with empty Keywords slice are skipped
+ - Test F: Keyword dedup — if two providers share keyword, emit once per source
+ - Test G: RegisterAll(nil, cfg) is a no-op that does not panic; RegisterAll with empty cfg does not panic
+
+
+ Create `pkg/recon/sources/queries.go`:
+ ```go
+ package sources
+
+ import (
+ "fmt"
+ "sort"
+
+ "github.com/salvacybersec/keyhunter/pkg/providers"
+ )
+
+ // BuildQueries produces the search-string list a source should iterate for a
+ // given provider registry. Each keyword is formatted per source-specific syntax.
+ // Result is deterministic (sorted) for reproducible tests.
+ func BuildQueries(reg *providers.Registry, source string) []string {
+ if reg == nil { return nil }
+ seen := make(map[string]struct{})
+ for _, p := range reg.List() {
+ for _, k := range p.Keywords {
+ if k == "" { continue }
+ seen[k] = struct{}{}
+ }
+ }
+ keywords := make([]string, 0, len(seen))
+ for k := range seen { keywords = append(keywords, k) }
+ sort.Strings(keywords)
+
+ out := make([]string, 0, len(keywords))
+ for _, k := range keywords {
+ out = append(out, formatQuery(source, k))
+ }
+ return out
+ }
+
+ func formatQuery(source, keyword string) string {
+ switch source {
+ case "github", "gist":
+ return fmt.Sprintf("%q in:file", keyword)
+ case "gitlab":
+ return keyword // GitLab code search doesn't support in:file qualifier
+ case "bitbucket":
+ return keyword
+ case "codeberg":
+ return keyword
+ default:
+ return keyword
+ }
+ }
+ ```
+
+ Create `pkg/recon/sources/queries_test.go` using `providers.NewRegistryFromProviders`
+ with two synthetic providers (shared keyword to test dedup).
+
+ Create `pkg/recon/sources/register.go`:
+ ```go
+ package sources
+
+ import (
+ "github.com/salvacybersec/keyhunter/pkg/providers"
+ "github.com/salvacybersec/keyhunter/pkg/recon"
+ )
+
+ // SourcesConfig carries per-source credentials read from viper/env by cmd/recon.go.
+ // Plan 10-09 fleshes this out; for now it is a placeholder struct so downstream
+ // plans can depend on its shape.
+ type SourcesConfig struct {
+ GitHubToken string
+ GitLabToken string
+ BitbucketToken string
+ HuggingFaceToken string
+ KaggleUser string
+ KaggleKey string
+ Registry *providers.Registry
+ Limiters *recon.LimiterRegistry
+ }
+
+ // RegisterAll registers every Phase 10 code-hosting source on engine.
+ // Wave 2 plans append their source constructors here via additional
+ // registerXxx helpers in this file. Plan 10-09 writes the final list.
+ func RegisterAll(engine *recon.Engine, cfg SourcesConfig) {
+ if engine == nil { return }
+ // Populated by Plan 10-09 (after Wave 2 lands individual source files).
+ }
+ ```
+
+ Do NOT wire this into cmd/recon.go yet — Plan 10-09 handles CLI integration after
+ every source exists.
+
+
+ cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run "TestBuildQueries|TestRegisterAll" -v -timeout 30s && go build ./...
+
+
+ BuildQueries is deterministic, dedups keywords, formats per-source syntax.
+ RegisterAll compiles as a no-op stub. Package builds with zero source
+ implementations — ready for Wave 2 plans to add files in parallel.
+
+
+
+
+
+
+- `go build ./...` succeeds
+- `go test ./pkg/recon/sources/...` passes
+- `go vet ./pkg/recon/sources/...` clean
+
+
+
+pkg/recon/sources package exists with httpclient.go, queries.go, register.go, doc.go
+and all tests green. No source implementations present yet — that is Wave 2.
+
+
+
diff --git a/.planning/phases/10-osint-code-hosting/10-02-PLAN.md b/.planning/phases/10-osint-code-hosting/10-02-PLAN.md
new file mode 100644
index 0000000..c9dff5f
--- /dev/null
+++ b/.planning/phases/10-osint-code-hosting/10-02-PLAN.md
@@ -0,0 +1,238 @@
+---
+phase: 10-osint-code-hosting
+plan: 02
+type: execute
+wave: 2
+depends_on: [10-01]
+files_modified:
+ - pkg/recon/sources/github.go
+ - pkg/recon/sources/github_test.go
+autonomous: true
+requirements: [RECON-CODE-01]
+must_haves:
+ truths:
+ - "GitHubSource.Sweep runs BuildQueries against GitHub /search/code and emits engine.Finding per match"
+ - "GitHubSource is disabled when cfg token is empty (logs and returns nil, no error)"
+ - "GitHubSource honors ctx cancellation mid-query and rate limiter tokens before each request"
+ - "Each Finding has SourceType=\"recon:github\" and Source = html_url"
+ artifacts:
+ - path: "pkg/recon/sources/github.go"
+ provides: "GitHubSource implementing recon.ReconSource"
+ contains: "func (s *GitHubSource) Sweep"
+ - path: "pkg/recon/sources/github_test.go"
+ provides: "httptest-driven unit tests"
+ key_links:
+ - from: "pkg/recon/sources/github.go"
+ to: "pkg/recon/sources/httpclient.go"
+ via: "Client.Do"
+ pattern: "c\\.client\\.Do"
+ - from: "pkg/recon/sources/github.go"
+ to: "pkg/recon/sources/queries.go"
+ via: "BuildQueries(reg, \"github\")"
+ pattern: "BuildQueries"
+---
+
+
+Implement GitHubSource — the first real Phase 10 recon source. Refactors logic from
+pkg/dorks/github.go (Phase 8's GitHubExecutor) into a recon.ReconSource. Emits
+engine.Finding entries for every /search/code match, driven by provider keyword
+queries from pkg/recon/sources/queries.go.
+
+Purpose: RECON-CODE-01 — users can scan GitHub public code for leaked LLM keys.
+Output: pkg/recon/sources/github.go + green tests.
+
+
+
+@$HOME/.claude/get-shit-done/workflows/execute-plan.md
+@$HOME/.claude/get-shit-done/templates/summary.md
+
+
+
+@.planning/phases/10-osint-code-hosting/10-CONTEXT.md
+@.planning/phases/10-osint-code-hosting/10-01-SUMMARY.md
+@pkg/recon/source.go
+@pkg/recon/limiter.go
+@pkg/dorks/github.go
+@pkg/recon/sources/httpclient.go
+@pkg/recon/sources/queries.go
+@pkg/recon/sources/register.go
+
+
+Reference pkg/dorks/github.go for the response struct shapes (ghSearchResponse,
+ghCodeItem, ghRepository, ghTextMatchEntry) — copy or alias them. GitHub Code Search
+endpoint: GET /search/code?q=<query>&per_page=<n> with headers:
+- Accept: application/vnd.github.v3.text-match+json
+- Authorization: Bearer <token>
+- User-Agent: keyhunter-recon
+
+Rate limit: 30 req/min authenticated → rate.Every(2*time.Second), burst 1.
+
+
+
+
+
+
+ Task 1: GitHubSource implementation + tests
+ pkg/recon/sources/github.go, pkg/recon/sources/github_test.go
+
+ - Test A: Enabled returns false when token empty; true when token set
+ - Test B: Sweep with empty token returns nil (no error, logs disabled)
+ - Test C: Sweep against httptest server decodes a 2-item response, emits 2 Findings on channel with SourceType="recon:github" and Source=html_url
+ - Test D: ProviderName is derived by matching query keyword back to provider via the registry (pass in synthetic registry)
+ - Test E: Ctx cancellation before first request returns ctx.Err()
+ - Test F: 401 from server returns wrapped ErrUnauthorized
+ - Test G: Multiple queries (from BuildQueries) iterate in sorted order
+
+
+ Create `pkg/recon/sources/github.go`:
+ ```go
+ package sources
+
+ import (
+ "context"
+ "encoding/json"
+ "errors"
+ "fmt"
+ "net/http"
+ "net/url"
+ "time"
+
+ "golang.org/x/time/rate"
+
+ "github.com/salvacybersec/keyhunter/pkg/providers"
+ "github.com/salvacybersec/keyhunter/pkg/recon"
+ )
+
+ // GitHubSource implements recon.ReconSource against GitHub Code Search.
+ // RECON-CODE-01.
+ type GitHubSource struct {
+ Token string
+ BaseURL string // default https://api.github.com, overridable for tests
+ Registry *providers.Registry
+ Limiters *recon.LimiterRegistry
+ client *Client
+ }
+
+ // NewGitHubSource constructs a source. If client is nil, NewClient() is used.
+ func NewGitHubSource(token string, reg *providers.Registry, lim *recon.LimiterRegistry) *GitHubSource {
+ return &GitHubSource{Token: token, BaseURL: "https://api.github.com", Registry: reg, Limiters: lim, client: NewClient()}
+ }
+
+ func (s *GitHubSource) Name() string { return "github" }
+ func (s *GitHubSource) RateLimit() rate.Limit { return rate.Every(2 * time.Second) }
+ func (s *GitHubSource) Burst() int { return 1 }
+ func (s *GitHubSource) RespectsRobots() bool { return false }
+ func (s *GitHubSource) Enabled(_ recon.Config) bool { return s.Token != "" }
+
+ func (s *GitHubSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
+ if s.Token == "" { return nil }
+ base := s.BaseURL
+ if base == "" { base = "https://api.github.com" }
+
+ queries := BuildQueries(s.Registry, "github")
+ kwToProvider := keywordIndex(s.Registry)
+
+ for _, q := range queries {
+ if err := ctx.Err(); err != nil { return err }
+ if s.Limiters != nil {
+ if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { return err }
+ }
+ endpoint := fmt.Sprintf("%s/search/code?q=%s&per_page=30", base, url.QueryEscape(q))
+ req, _ := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
+ req.Header.Set("Accept", "application/vnd.github.v3.text-match+json")
+ req.Header.Set("Authorization", "Bearer "+s.Token)
+
+ resp, err := s.client.Do(ctx, req)
+ if err != nil {
+ if errors.Is(err, ErrUnauthorized) { return err }
+ // Other errors: log-and-continue per CONTEXT (sources downgrade, not abort)
+ continue
+ }
+ var parsed ghSearchResponse
+ _ = json.NewDecoder(resp.Body).Decode(&parsed)
+ resp.Body.Close()
+
+ provName := kwToProvider[extractKeyword(q)]
+ for _, it := range parsed.Items {
+ snippet := ""
+ if len(it.TextMatches) > 0 { snippet = it.TextMatches[0].Fragment }
+ f := recon.Finding{
+ ProviderName: provName,
+ KeyMasked: "",
+ Confidence: "low",
+ Source: it.HTMLURL,
+ SourceType: "recon:github",
+ DetectedAt: time.Now(),
+ }
+ _ = snippet // reserved for future content scan pass
+ select {
+ case out <- f:
+ case <-ctx.Done(): return ctx.Err()
+ }
+ }
+ }
+ return nil
+ }
+
+ // Response structs mirror pkg/dorks/github.go (kept private to this file
+ // to avoid cross-package coupling between dorks and recon/sources).
+ type ghSearchResponse struct { Items []ghCodeItem `json:"items"` }
+ type ghCodeItem struct {
+ HTMLURL string `json:"html_url"`
+ Repository ghRepository `json:"repository"`
+ TextMatches []ghTextMatchEntry `json:"text_matches"`
+ }
+ type ghRepository struct { FullName string `json:"full_name"` }
+ type ghTextMatchEntry struct { Fragment string `json:"fragment"` }
+
+ // keywordIndex maps keyword -> provider name using the registry.
+ func keywordIndex(reg *providers.Registry) map[string]string {
+ m := make(map[string]string)
+ if reg == nil { return m }
+ for _, p := range reg.List() {
+ for _, k := range p.Keywords { m[k] = p.Name }
+ }
+ return m
+ }
+
+ // extractKeyword parses the provider keyword out of a BuildQueries output.
+ // For github it's `"keyword" in:file`; for bare formats it's the whole string.
+ func extractKeyword(q string) string { ... strip quotes, trim ` in:file` suffix ... }
+ ```
+
+ Create `pkg/recon/sources/github_test.go`:
+ - Use `providers.NewRegistryFromProviders` with 2 synthetic providers (openai/sk-proj-, anthropic/sk-ant-)
+ - Spin up `httptest.NewServer` that inspects `r.URL.Query().Get("q")` and returns
+ a JSON body with two items whose html_url encodes the query
+ - Assert 2 findings per query received on the channel within 2s using select/time.After
+ - Separate test for empty token: NewGitHubSource("", reg, lim).Sweep returns nil immediately
+ - Separate test for 401: server returns 401 → Sweep returns error wrapping ErrUnauthorized
+ - Cancel-test: cancel ctx before Sweep call; assert ctx.Err() returned
+
+ Leave GitHubSource unregistered (Plan 10-09 adds it to RegisterAll).
+
+
+ cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run TestGitHub -v -timeout 30s
+
+
+ GitHubSource satisfies recon.ReconSource (compile-time assert via `var _ recon.ReconSource = (*GitHubSource)(nil)`),
+ tests green, covers happy path + empty token + 401 + cancellation.
+
+
+
+
+
+
+- `go build ./...`
+- `go test ./pkg/recon/sources/ -run TestGitHub -v`
+- `go vet ./pkg/recon/sources/...`
+
+
+
+RECON-CODE-01 satisfied: GitHubSource queries /search/code using provider-registry-driven
+keywords and emits engine.Finding. Ready for registration in Plan 10-09.
+
+
+
diff --git a/.planning/phases/10-osint-code-hosting/10-03-PLAN.md b/.planning/phases/10-osint-code-hosting/10-03-PLAN.md
new file mode 100644
index 0000000..f6da797
--- /dev/null
+++ b/.planning/phases/10-osint-code-hosting/10-03-PLAN.md
@@ -0,0 +1,120 @@
+---
+phase: 10-osint-code-hosting
+plan: 03
+type: execute
+wave: 2
+depends_on: [10-01]
+files_modified:
+ - pkg/recon/sources/gitlab.go
+ - pkg/recon/sources/gitlab_test.go
+autonomous: true
+requirements: [RECON-CODE-02]
+must_haves:
+ truths:
+ - "GitLabSource.Sweep queries GitLab /api/v4/search?scope=blobs and emits Findings"
+ - "Disabled when token empty; enabled otherwise"
+ - "Findings have SourceType=\"recon:gitlab\" and Source = web_url of blob"
+ artifacts:
+ - path: "pkg/recon/sources/gitlab.go"
+ provides: "GitLabSource implementing recon.ReconSource"
+ - path: "pkg/recon/sources/gitlab_test.go"
+ provides: "httptest tests"
+ key_links:
+ - from: "pkg/recon/sources/gitlab.go"
+ to: "pkg/recon/sources/httpclient.go"
+ via: "c.client.Do(ctx, req)"
+ pattern: "client\\.Do"
+---
+
+
+Implement GitLabSource against GitLab's Search API (/api/v4/search?scope=blobs).
+Honors PRIVATE-TOKEN header auth, 2000 req/min rate limit.
+
+Purpose: RECON-CODE-02.
+Output: pkg/recon/sources/gitlab.go + tests.
+
+
+
+@$HOME/.claude/get-shit-done/workflows/execute-plan.md
+@$HOME/.claude/get-shit-done/templates/summary.md
+
+
+
+@.planning/phases/10-osint-code-hosting/10-CONTEXT.md
+@.planning/phases/10-osint-code-hosting/10-01-SUMMARY.md
+@pkg/recon/source.go
+@pkg/recon/sources/httpclient.go
+@pkg/recon/sources/queries.go
+
+
+GitLab Search API (docs: https://docs.gitlab.com/ee/api/search.html):
+  GET /api/v4/search?scope=blobs&search=<keyword>&per_page=20
+  Header: PRIVATE-TOKEN: <token>
+Response (array of blob objects):
+ [{ "basename": "...", "data": "matched snippet", "path": "...", "project_id": 123,
+ "ref": "main", "startline": 42 }, ...]
+Project web_url must be constructed from project_id → fetch /api/v4/projects/<id> (or
+just use basename+path with a placeholder Source — keep it minimal: Source =
+"https://gitlab.com/projects/<project_id>/-/blob/<ref>/<path>").
+
+Rate limit: 2000 req/min → rate.Every(30 * time.Millisecond) ≈ 2000/min, burst 5.
+
+
+
+
+
+
+ Task 1: GitLabSource implementation + tests
+ pkg/recon/sources/gitlab.go, pkg/recon/sources/gitlab_test.go
+
+ - Test A: Enabled false when token empty
+ - Test B: Sweep queries /api/v4/search with scope=blobs, PRIVATE-TOKEN header set
+ - Test C: Decodes array response, emits one Finding per blob with Source containing project_id + path + ref
+ - Test D: 401 returns wrapped ErrUnauthorized
+ - Test E: Ctx cancellation respected
+ - Test F: Empty token → Sweep returns nil with no calls
+
+
+ Create `pkg/recon/sources/gitlab.go` with struct `GitLabSource { Token, BaseURL string; Registry *providers.Registry; Limiters *recon.LimiterRegistry; client *Client }`.
+
+ Default BaseURL: `https://gitlab.com`.
+ Name: "gitlab". RateLimit: `rate.Every(30 * time.Millisecond)`. Burst: 5. RespectsRobots: false.
+
+ Sweep loop:
+ - For each query from BuildQueries(reg, "gitlab"):
+  - Build `base + /api/v4/search?scope=blobs&search=<keyword>&per_page=20`
+  - Set header `PRIVATE-TOKEN: <token>`
+ - limiters.Wait, then client.Do
+ - Decode `[]glBlob` where glBlob has ProjectID int, Path, Ref, Data, Startline
+ - Emit Finding with Source = fmt.Sprintf("%s/projects/%d/-/blob/%s/%s", base, b.ProjectID, b.Ref, b.Path), SourceType="recon:gitlab", Confidence="low", ProviderName derived via keywordIndex(reg)
+ - Respect ctx.Done on send
+
+ Add compile-time assert: `var _ recon.ReconSource = (*GitLabSource)(nil)`.
+
+ Create `pkg/recon/sources/gitlab_test.go` with httptest server returning a JSON
+ array of two blob objects. Assert both Findings received, Source URLs contain
+ project IDs, ctx cancellation test, 401 test, empty-token test. Use synthetic
+ registry with 2 providers.
+
+
+ cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run TestGitLab -v -timeout 30s
+
+
+ GitLabSource compiles, implements ReconSource, all test behaviors covered.
+
+
+
+
+
+
+- `go build ./...`
+- `go test ./pkg/recon/sources/ -run TestGitLab -v`
+
+
+
+RECON-CODE-02 satisfied.
+
+
+
diff --git a/.planning/phases/10-osint-code-hosting/10-04-PLAN.md b/.planning/phases/10-osint-code-hosting/10-04-PLAN.md
new file mode 100644
index 0000000..fd658f7
--- /dev/null
+++ b/.planning/phases/10-osint-code-hosting/10-04-PLAN.md
@@ -0,0 +1,163 @@
+---
+phase: 10-osint-code-hosting
+plan: 04
+type: execute
+wave: 2
+depends_on: [10-01]
+files_modified:
+ - pkg/recon/sources/bitbucket.go
+ - pkg/recon/sources/bitbucket_test.go
+ - pkg/recon/sources/gist.go
+ - pkg/recon/sources/gist_test.go
+autonomous: true
+requirements: [RECON-CODE-03, RECON-CODE-04]
+must_haves:
+ truths:
+ - "BitbucketSource queries Bitbucket 2.0 code search API and emits Findings"
+ - "GistSource queries GitHub Gist search (re-uses GitHub token) and emits Findings"
+ - "Both disabled when respective credentials are empty"
+ artifacts:
+ - path: "pkg/recon/sources/bitbucket.go"
+ provides: "BitbucketSource implementing recon.ReconSource"
+ - path: "pkg/recon/sources/gist.go"
+ provides: "GistSource implementing recon.ReconSource"
+ key_links:
+ - from: "pkg/recon/sources/gist.go"
+ to: "pkg/recon/sources/httpclient.go"
+    via: "Client.Do with Bearer <token>"
+ pattern: "client\\.Do"
+ - from: "pkg/recon/sources/bitbucket.go"
+ to: "pkg/recon/sources/httpclient.go"
+ via: "Client.Do"
+ pattern: "client\\.Do"
+---
+
+
+Implement BitbucketSource (RECON-CODE-03) and GistSource (RECON-CODE-04). Grouped
+because both are small API integrations with similar shapes (JSON array/values,
+per-item URL, token gating).
+
+Purpose: RECON-CODE-03, RECON-CODE-04.
+Output: Two new ReconSource implementations + tests.
+
+
+
+@$HOME/.claude/get-shit-done/workflows/execute-plan.md
+@$HOME/.claude/get-shit-done/templates/summary.md
+
+
+
+@.planning/phases/10-osint-code-hosting/10-CONTEXT.md
+@.planning/phases/10-osint-code-hosting/10-01-SUMMARY.md
+@pkg/recon/source.go
+@pkg/recon/sources/httpclient.go
+@pkg/recon/sources/queries.go
+
+
+Bitbucket 2.0 search (docs: https://developer.atlassian.com/cloud/bitbucket/rest/api-group-search/):
+  GET /2.0/workspaces/{workspace}/search/code?search_query=<query>
+  Auth: Bearer <token> (app password or OAuth)
+ Response: { "values": [{ "content_match_count": N, "file": {"path":"","commit":{...}}, "page_url": "..." }] }
+ Note: Requires a workspace param — make it configurable via SourcesConfig.BitbucketWorkspace;
+ if unset, source is disabled. Rate: 1000/hour → rate.Every(3.6 * time.Second), burst 1.
+
+GitHub Gist search: GitHub does not expose a dedicated /search/gists endpoint that
+searches gist contents. Use the /gists/public endpoint + client-side filtering as
+fallback: GET /gists/public?per_page=100 returns public gists; for each gist, fetch
+/gists/{id} and scan file contents for keyword matches. Keep implementation minimal:
+just enumerate the first page, match against keyword list, emit Findings with
+Source = gist.html_url. Auth: Bearer <token>. Rate: 30/min → rate.Every(2s).
+
+
+
+
+
+
+ Task 1: BitbucketSource + tests
+ pkg/recon/sources/bitbucket.go, pkg/recon/sources/bitbucket_test.go
+
+ - Test A: Enabled false when token OR workspace empty
+ - Test B: Enabled true when both set
+ - Test C: Sweep queries /2.0/workspaces/{ws}/search/code with Bearer header
+ - Test D: Decodes `{values:[{file:{path,commit:{...}},page_url:"..."}]}` and emits Finding with Source=page_url, SourceType="recon:bitbucket"
+ - Test E: 401 → ErrUnauthorized
+ - Test F: Ctx cancellation
+
+
+ Create `pkg/recon/sources/bitbucket.go`:
+ - Struct `BitbucketSource { Token, Workspace, BaseURL string; Registry *providers.Registry; Limiters *recon.LimiterRegistry; client *Client }`
+ - Default BaseURL: `https://api.bitbucket.org`
+ - Name "bitbucket", RateLimit rate.Every(3600*time.Millisecond), Burst 1, RespectsRobots false
+ - Enabled = s.Token != "" && s.Workspace != ""
+ - Sweep: for each query in BuildQueries(reg, "bitbucket"), limiters.Wait, issue
+ GET request, decode into struct with `Values []struct{ PageURL string "json:page_url"; File struct{ Path string } "json:file" }`, emit Findings
+ - Compile-time assert `var _ recon.ReconSource = (*BitbucketSource)(nil)`
+
+ Create `pkg/recon/sources/bitbucket_test.go` with httptest server, synthetic
+ registry, assertions on URL path `/2.0/workspaces/testws/search/code`, Bearer
+ header, and emitted Findings.
+
+
+ cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run TestBitbucket -v -timeout 30s
+
+
+ BitbucketSource passes all tests, implements ReconSource.
+
+
+
+
+ Task 2: GistSource + tests
+ pkg/recon/sources/gist.go, pkg/recon/sources/gist_test.go
+
+ - Test A: Enabled false when GitHub token empty
+ - Test B: Sweep fetches /gists/public?per_page=100 with Bearer auth
+ - Test C: For each gist, iterates files map; if any file.content contains a provider keyword, emits one Finding with Source=gist.html_url
+ - Test D: Ctx cancellation
+ - Test E: 401 → ErrUnauthorized
+ - Test F: Gist without matching keyword → no Finding emitted
+
+
+ Create `pkg/recon/sources/gist.go`:
+ - Struct `GistSource { Token, BaseURL string; Registry *providers.Registry; Limiters *recon.LimiterRegistry; client *Client }`
+ - BaseURL default `https://api.github.com`
+ - Name "gist", RateLimit rate.Every(2*time.Second), Burst 1, RespectsRobots false
+ - Enabled = s.Token != ""
+ - Sweep flow:
+ 1. Build keyword list from registry (flat set)
+ 2. GET /gists/public?per_page=100 with Bearer header
+ 3. Decode `[]struct{ HTMLURL string "json:html_url"; Files map[string]struct{ Filename, RawURL string "json:raw_url" } "json:files" }`
+ 4. For each gist, for each file, if we can match without fetching raw content,
+ skip raw fetch (keep Phase 10 minimal). Fallback: fetch file.RawURL and
+ scan content for any keyword from the set; on hit, emit one Finding
+ per gist (not per file) with ProviderName from matched keyword.
+ 5. Respect limiters.Wait before each outbound request (gist list + each raw fetch)
+ - Compile-time assert `var _ recon.ReconSource = (*GistSource)(nil)`
+
+ Create `pkg/recon/sources/gist_test.go`:
+  - httptest server with two routes: `/gists/public` returns 2 gists each with 1 file, raw_url pointing to same server `/raw/<id>`; `/raw/<id>` returns content containing "sk-proj-" for one and an unrelated string for the other
+ - Assert exactly 1 Finding emitted, Source matches the gist's html_url
+ - 401 test, ctx cancellation test, empty-token test
+
+
+ cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run TestGist -v -timeout 30s
+
+
+ GistSource emits Findings only when a known provider keyword is present in a gist
+ file body; all tests green.
+
+
+
+
+
+
+- `go build ./...`
+- `go test ./pkg/recon/sources/ -run "TestBitbucket|TestGist" -v`
+
+
+
+RECON-CODE-03 and RECON-CODE-04 satisfied.
+
+
+
diff --git a/.planning/phases/10-osint-code-hosting/10-05-PLAN.md b/.planning/phases/10-osint-code-hosting/10-05-PLAN.md
new file mode 100644
index 0000000..5eb92da
--- /dev/null
+++ b/.planning/phases/10-osint-code-hosting/10-05-PLAN.md
@@ -0,0 +1,113 @@
+---
+phase: 10-osint-code-hosting
+plan: 05
+type: execute
+wave: 2
+depends_on: [10-01]
+files_modified:
+ - pkg/recon/sources/codeberg.go
+ - pkg/recon/sources/codeberg_test.go
+autonomous: true
+requirements: [RECON-CODE-05]
+must_haves:
+ truths:
+ - "CodebergSource queries Gitea REST API /api/v1/repos/search and /api/v1/repos/.../contents for keyword matches"
+ - "No token required for public repos (but optional token honored if provided)"
+ - "Findings tagged SourceType=\"recon:codeberg\""
+ artifacts:
+ - path: "pkg/recon/sources/codeberg.go"
+ provides: "CodebergSource implementing recon.ReconSource (Gitea-compatible)"
+ key_links:
+ - from: "pkg/recon/sources/codeberg.go"
+ to: "pkg/recon/sources/httpclient.go"
+ via: "Client.Do"
+ pattern: "client\\.Do"
+---
+
+
+Implement CodebergSource targeting Gitea's REST API. Codeberg.org runs Gitea, so the
+same code works for any Gitea instance by configuring BaseURL. Public repos do not
+require auth, but a token can be passed to raise rate limits.
+
+Purpose: RECON-CODE-05.
+Output: pkg/recon/sources/codeberg.go + tests.
+
+
+
+@$HOME/.claude/get-shit-done/workflows/execute-plan.md
+@$HOME/.claude/get-shit-done/templates/summary.md
+
+
+
+@.planning/phases/10-osint-code-hosting/10-CONTEXT.md
+@.planning/phases/10-osint-code-hosting/10-01-SUMMARY.md
+@pkg/recon/source.go
+@pkg/recon/sources/httpclient.go
+
+
+Gitea API (v1, docs: https://docs.gitea.com/api):
+ GET /api/v1/repos/search?q=&limit=50
+ Response: { "data": [{ "full_name": "...", "html_url": "..." }], "ok": true }
+  Header (optional): Authorization: token <TOKEN>
+
+For this phase we only use /repos/search — matching on repo metadata (name/description).
+Full-content code search is not uniformly available across Gitea instances (Codeberg
+has gitea "code search" enabled via Bleve index; we rely on it when present via
+GET /api/v1/repos/search?q=... which returns repos only. For content matching we
+fall back to searching each provider keyword as a query string and emitting Findings
+keyed to the repo html_url).
+
+Rate: public unauth 60 req/hour → rate.Every(60 * time.Second). Burst 1.
+With token: 1000/hour → rate.Every(3600 * time.Millisecond). Detect via token presence.
+
+
+
+
+
+
+ Task 1: CodebergSource + tests
+ pkg/recon/sources/codeberg.go, pkg/recon/sources/codeberg_test.go
+
+ - Test A: Enabled always true (public API, token optional)
+  - Test B: Sweep queries /api/v1/repos/search?q={query}&limit=50 for each BuildQueries entry
+ - Test C: Decodes `{data:[{full_name,html_url}]}` and emits Finding with Source=html_url, SourceType="recon:codeberg", ProviderName from keywordIndex
+  - Test D: With token set, Authorization header is "token <TOKEN>"; without token, header absent
+ - Test E: Ctx cancellation
+ - Test F: Unauth rate limit applied when Token empty (verified via RateLimit() return)
+
+
+ Create `pkg/recon/sources/codeberg.go`:
+ - Struct `CodebergSource { Token, BaseURL string; Registry *providers.Registry; Limiters *recon.LimiterRegistry; client *Client }`
+ - Default BaseURL: `https://codeberg.org`
+ - Name "codeberg", RespectsRobots false
+ - RateLimit(): if Token == "" return rate.Every(60*time.Second), else rate.Every(3600*time.Millisecond)
+ - Burst 1
+ - Enabled always returns true
+  - Sweep: for each query, build `base + /api/v1/repos/search?q={query}&limit=50`, set Authorization only when Token set, client.Do, decode, emit Findings
+ - Compile-time assert
+
+ Create `pkg/recon/sources/codeberg_test.go` with httptest server returning a
+ `{data:[...],ok:true}` body. Two test cases: with token (header present) and
+ without (header absent — use a flag inside the handler to capture).
+
+
+ cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run TestCodeberg -v -timeout 30s
+
+
+ CodebergSource implements ReconSource, tests green for both auth modes.
+
+
+
+
+
+
+- `go test ./pkg/recon/sources/ -run TestCodeberg -v`
+
+
+
+RECON-CODE-05 satisfied.
+
+
+
diff --git a/.planning/phases/10-osint-code-hosting/10-06-PLAN.md b/.planning/phases/10-osint-code-hosting/10-06-PLAN.md
new file mode 100644
index 0000000..f26c010
--- /dev/null
+++ b/.planning/phases/10-osint-code-hosting/10-06-PLAN.md
@@ -0,0 +1,108 @@
+---
+phase: 10-osint-code-hosting
+plan: 06
+type: execute
+wave: 2
+depends_on: [10-01]
+files_modified:
+ - pkg/recon/sources/huggingface.go
+ - pkg/recon/sources/huggingface_test.go
+autonomous: true
+requirements: [RECON-CODE-08]
+must_haves:
+ truths:
+ - "HuggingFaceSource queries /api/spaces and /api/models search endpoints"
+ - "Token is optional — anonymous requests allowed at lower rate limit"
+ - "Findings have SourceType=\"recon:huggingface\" and Source = full HF URL"
+ artifacts:
+ - path: "pkg/recon/sources/huggingface.go"
+ provides: "HuggingFaceSource implementing recon.ReconSource"
+ key_links:
+ - from: "pkg/recon/sources/huggingface.go"
+ to: "pkg/recon/sources/httpclient.go"
+ via: "Client.Do"
+ pattern: "client\\.Do"
+---
+
+
+Implement HuggingFaceSource scanning both Spaces and model repos via the HF Hub API.
+Token optional; unauthenticated requests work but are rate-limited harder.
+
+Purpose: RECON-CODE-08.
+Output: pkg/recon/sources/huggingface.go + tests.
+
+
+
+@$HOME/.claude/get-shit-done/workflows/execute-plan.md
+@$HOME/.claude/get-shit-done/templates/summary.md
+
+
+
+@.planning/phases/10-osint-code-hosting/10-CONTEXT.md
+@.planning/phases/10-osint-code-hosting/10-01-SUMMARY.md
+@pkg/recon/source.go
+@pkg/recon/sources/httpclient.go
+
+
+HuggingFace Hub API:
+  GET https://huggingface.co/api/spaces?search={keyword}&limit=50
+  GET https://huggingface.co/api/models?search={keyword}&limit=50
+  Response (either): array of { "id": "owner/name", "modelId"|"spaceId": "owner/name" }
+  Optional auth: Authorization: Bearer <TOKEN>
+
+URL derivation: Source = "https://huggingface.co/spaces/{id}" for spaces or "https://huggingface.co/{id}" for models.
+
+Rate: 1000/hour authenticated → rate.Every(3600*time.Millisecond); unauth: rate.Every(10*time.Second), burst 1.
+
+
+
+
+
+
+ Task 1: HuggingFaceSource + tests
+ pkg/recon/sources/huggingface.go, pkg/recon/sources/huggingface_test.go
+
+ - Test A: Enabled always true (token optional)
+ - Test B: Sweep hits both /api/spaces and /api/models endpoints for each query
+ - Test C: Decodes array of {id} and emits Findings with Source prefixed by "https://huggingface.co/spaces/" or "https://huggingface.co/" for models, SourceType="recon:huggingface"
+ - Test D: Authorization header present when token set, absent when empty
+ - Test E: Ctx cancellation respected
+ - Test F: RateLimit returns slower rate when token empty
+
+
+ Create `pkg/recon/sources/huggingface.go`:
+ - Struct `HuggingFaceSource { Token, BaseURL string; Registry *providers.Registry; Limiters *recon.LimiterRegistry; client *Client }`
+ - Default BaseURL: `https://huggingface.co`
+ - Name "huggingface", RespectsRobots false, Burst 1
+ - RateLimit: token-dependent (see interfaces)
+ - Enabled always true
+ - Sweep: build keyword list, for each keyword iterate two endpoints
+    (`/api/spaces?search={keyword}&limit=50`, `/api/models?search={keyword}&limit=50`), emit
+ Findings. URL prefix differs per endpoint.
+ - Compile-time assert
+
+ Create `pkg/recon/sources/huggingface_test.go` with httptest server that routes
+ both paths. Assert exact number of Findings (2 per keyword × number of keywords)
+ and URL prefixes.
+
+
+ cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run TestHuggingFace -v -timeout 30s
+
+
+ HuggingFaceSource passes tests covering both endpoints, token modes, cancellation.
+
+
+
+
+
+
+- `go test ./pkg/recon/sources/ -run TestHuggingFace -v`
+
+
+
+RECON-CODE-08 satisfied.
+
+
+
diff --git a/.planning/phases/10-osint-code-hosting/10-07-PLAN.md b/.planning/phases/10-osint-code-hosting/10-07-PLAN.md
new file mode 100644
index 0000000..5a21bc0
--- /dev/null
+++ b/.planning/phases/10-osint-code-hosting/10-07-PLAN.md
@@ -0,0 +1,191 @@
+---
+phase: 10-osint-code-hosting
+plan: 07
+type: execute
+wave: 2
+depends_on: [10-01]
+files_modified:
+ - pkg/recon/sources/replit.go
+ - pkg/recon/sources/replit_test.go
+ - pkg/recon/sources/codesandbox.go
+ - pkg/recon/sources/codesandbox_test.go
+ - pkg/recon/sources/sandboxes.go
+ - pkg/recon/sources/sandboxes_test.go
+autonomous: true
+requirements: [RECON-CODE-06, RECON-CODE-07, RECON-CODE-10]
+must_haves:
+ truths:
+ - "ReplitSource scrapes replit.com search HTML and emits Findings tagged recon:replit"
+ - "CodeSandboxSource scrapes codesandbox.io search and emits Findings tagged recon:codesandbox"
+ - "SandboxesSource aggregates JSFiddle+CodePen+StackBlitz+Glitch+Observable+Gitpod with SourceType recon:sandboxes and sub-type in KeyMasked metadata slot"
+ - "All three RespectsRobots()==true and rate-limit conservatively (10/min)"
+ artifacts:
+ - path: "pkg/recon/sources/replit.go"
+ provides: "ReplitSource (scraper)"
+ - path: "pkg/recon/sources/codesandbox.go"
+ provides: "CodeSandboxSource (scraper)"
+ - path: "pkg/recon/sources/sandboxes.go"
+ provides: "SandboxesSource aggregator (JSFiddle, CodePen, StackBlitz, Glitch, Observable, Gitpod)"
+ key_links:
+ - from: "pkg/recon/sources/replit.go"
+ to: "pkg/recon/sources/httpclient.go"
+ via: "Client.Do on https://replit.com/search?q=..."
+ pattern: "client\\.Do"
+ - from: "pkg/recon/sources/sandboxes.go"
+ to: "pkg/recon/sources/httpclient.go"
+ via: "Client.Do on per-sandbox search URLs"
+ pattern: "client\\.Do"
+---
+
+
+Implement three scraping-based sources for sandbox/IDE platforms without public
+search APIs. All three honor robots.txt, use a conservative 10 req/min rate, and
+emit Findings with best-effort HTML link extraction.
+
+Purpose: RECON-CODE-06 (Replit), RECON-CODE-07 (CodeSandbox), RECON-CODE-10
+(CodePen/JSFiddle/StackBlitz/Glitch/Observable/Gitpod aggregator).
+Output: 3 new ReconSource implementations + tests.
+
+
+
+@$HOME/.claude/get-shit-done/workflows/execute-plan.md
+@$HOME/.claude/get-shit-done/templates/summary.md
+
+
+
+@.planning/phases/10-osint-code-hosting/10-CONTEXT.md
+@.planning/phases/10-osint-code-hosting/10-01-SUMMARY.md
+@pkg/recon/source.go
+@pkg/recon/robots.go
+@pkg/recon/sources/httpclient.go
+
+
+Scraping strategy (identical for all three sources in this plan):
+1. Build per-provider keyword queries via BuildQueries (default format = bare keyword)
+2. Fetch search URL via Client.Do (no auth headers)
+3. Use a simple regex to extract result links from HTML (href="/@user/repl-name"
+ or href="/s/...") — use net/html parser for robustness
+4. Emit one Finding per extracted link with SourceType="recon:{source-name}" and Source=absolute URL
+5. Return early on ctx cancellation
+
+Search URLs (approximations — confirm in action):
+- Replit: https://replit.com/search?q={keyword}&type=repls
+- CodeSandbox: https://codesandbox.io/search?query={keyword}&type=sandboxes
+- CodePen: https://codepen.io/search/pens?q={keyword}
+- JSFiddle: https://jsfiddle.net/api/search/?q={keyword} (returns JSON)
+- StackBlitz: https://stackblitz.com/search?q={keyword}
+- Glitch: https://glitch.com/api/search/projects?q={keyword}
+- Observable: https://observablehq.com/search?query={keyword}
+- Gitpod: https://www.gitpod.io/ (no public search; skip with log)
+
+All three sources set RespectsRobots()=true. Engine honors this via existing
+pkg/recon/robots.go cache (caller coordinates RobotsCache check; not done here
+because Phase 9 wires it at SweepAll level — if not, document TODO in code).
+
+Rate limits: all 10 req/min → rate.Every(6 * time.Second). Burst 1.
+
+
+
+
+
+
+ Task 1: ReplitSource + CodeSandboxSource (scrapers)
+ pkg/recon/sources/replit.go, pkg/recon/sources/replit_test.go, pkg/recon/sources/codesandbox.go, pkg/recon/sources/codesandbox_test.go
+
+ - Test A (each): Sweep fetches search URL for each keyword via httptest server
+ - Test B: HTML parsing extracts anchor hrefs matching expected result patterns (use golang.org/x/net/html)
+ - Test C: Each extracted link emitted as Finding with Source=absolute URL, SourceType="recon:replit" or "recon:codesandbox"
+ - Test D: RespectsRobots returns true
+ - Test E: Ctx cancellation respected
+ - Test F: Enabled always returns true (no auth)
+
+
+ Add `golang.org/x/net/html` to go.mod if not already (`go get golang.org/x/net/html`).
+
+ Create `pkg/recon/sources/replit.go`:
+ - Struct `ReplitSource { BaseURL string; Registry *providers.Registry; Limiters *recon.LimiterRegistry; client *Client }`
+ - Default BaseURL: `https://replit.com`
+ - Name "replit", RateLimit rate.Every(6*time.Second), Burst 1, RespectsRobots true, Enabled always true
+  - Sweep: for each keyword from BuildQueries, GET `{base}/search?q={keyword}&type=repls`, parse HTML with `html.Parse`, walk DOM collecting `<a href>` anchors matching regex `^/@[^/]+/[^/]+$` (repl URLs), emit Finding per absolute URL
+ - Compile-time assert
+
+ Create `pkg/recon/sources/replit_test.go`:
+ - httptest server returning fixed HTML snippet with 2 matching anchors + 1 non-matching
+ - Assert exactly 2 Findings with correct absolute URLs
+
+ Create `pkg/recon/sources/codesandbox.go` with same shape but:
+ - Default BaseURL `https://codesandbox.io`
+ - Name "codesandbox"
+  - Search URL: `{base}/search?query={keyword}&type=sandboxes`
+ - Link regex: `^/s/[a-zA-Z0-9-]+$` or `/p/sandbox/...`
+ - SourceType "recon:codesandbox"
+
+ Create `pkg/recon/sources/codesandbox_test.go` analogous to replit_test.go.
+
+
+ cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run "TestReplit|TestCodeSandbox" -v -timeout 30s
+
+
+ Both scrapers parse HTML, extract links, emit Findings; tests green.
+
+
+
+
+ Task 2: SandboxesSource aggregator (JSFiddle/CodePen/StackBlitz/Glitch/Observable/Gitpod)
+ pkg/recon/sources/sandboxes.go, pkg/recon/sources/sandboxes_test.go
+
+ - Test A: Sweep iterates 6 sub-platforms for each keyword (via test override of Platforms slice)
+ - Test B: JSFiddle returns JSON → parsed into Findings (Source from result URLs)
+ - Test C: CodePen HTML → anchor extraction
+ - Test D: One failing sub-platform does NOT abort others (log-and-continue)
+ - Test E: SourceType = "recon:sandboxes"; sub-platform identifier goes into Confidence field or separate Platform map slot (use `KeyMasked` sentinel `platform=codepen` for now — pragmatic placeholder until a Metadata field exists)
+ - Test F: Ctx cancellation
+
+
+ Create `pkg/recon/sources/sandboxes.go`:
+ - Define `subPlatform` struct: `{ Name, SearchURL, ResultLinkRegex string; IsJSON bool; JSONItemsKey string }`
+ - Default Platforms:
+ ```go
+ var defaultPlatforms = []subPlatform{
+ {Name: "codepen", SearchURL: "https://codepen.io/search/pens?q=%s", ResultLinkRegex: `^/[^/]+/pen/[a-zA-Z0-9]+`, IsJSON: false},
+ {Name: "jsfiddle", SearchURL: "https://jsfiddle.net/api/search/?q=%s", IsJSON: true, JSONItemsKey: "results"},
+ {Name: "stackblitz", SearchURL: "https://stackblitz.com/search?q=%s", ResultLinkRegex: `^/edit/[a-zA-Z0-9-]+`, IsJSON: false},
+ {Name: "glitch", SearchURL: "https://glitch.com/api/search/projects?q=%s", IsJSON: true, JSONItemsKey: "results"},
+ {Name: "observable", SearchURL: "https://observablehq.com/search?query=%s", ResultLinkRegex: `^/@[^/]+/[^/]+`, IsJSON: false},
+ }
+ ```
+ (Gitpod omitted — no public search; document in comment.)
+ - Struct `SandboxesSource { Platforms []subPlatform; Registry *providers.Registry; Limiters *recon.LimiterRegistry; client *Client }`
+ - Name "sandboxes", RateLimit rate.Every(6*time.Second), Burst 1, RespectsRobots true, Enabled always true
+ - Sweep: for each platform, for each keyword, fetch URL, parse either JSON or HTML, emit Findings with Source=absolute URL and KeyMasked="platform="+p.Name
+ - On any per-platform error, log (use stdlib log package) and continue
+
+ Create `pkg/recon/sources/sandboxes_test.go`:
+ - Spin up a single httptest server; override Platforms slice with 2 platforms
+ pointing at `/codepen-search` (HTML) and `/jsfiddle-search` (JSON)
+ - Assert Findings from both platforms emitted
+ - Failure test: one platform returns 500 → log-and-continue, other still emits
+
+
+ cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run TestSandboxes -v -timeout 30s
+
+
+ SandboxesSource iterates sub-platforms, handles HTML and JSON formats, tolerates
+ per-platform failure, emits Findings tagged with platform identifier.
+
+
+
+
+
+
+- `go build ./...`
+- `go test ./pkg/recon/sources/ -run "TestReplit|TestCodeSandbox|TestSandboxes" -v`
+
+
+
+RECON-CODE-06, RECON-CODE-07, RECON-CODE-10 satisfied.
+
+
+
diff --git a/.planning/phases/10-osint-code-hosting/10-08-PLAN.md b/.planning/phases/10-osint-code-hosting/10-08-PLAN.md
new file mode 100644
index 0000000..aab06c4
--- /dev/null
+++ b/.planning/phases/10-osint-code-hosting/10-08-PLAN.md
@@ -0,0 +1,109 @@
+---
+phase: 10-osint-code-hosting
+plan: 08
+type: execute
+wave: 2
+depends_on: [10-01]
+files_modified:
+ - pkg/recon/sources/kaggle.go
+ - pkg/recon/sources/kaggle_test.go
+autonomous: true
+requirements: [RECON-CODE-09]
+must_haves:
+ truths:
+ - "KaggleSource queries Kaggle public API /api/v1/kernels/list with Basic auth (username:key) and emits Findings"
+ - "Disabled when either KaggleUser or KaggleKey is empty"
+    - "Findings tagged recon:kaggle; Source = https://www.kaggle.com/code/{ref}"
+ artifacts:
+ - path: "pkg/recon/sources/kaggle.go"
+ provides: "KaggleSource implementing recon.ReconSource"
+ key_links:
+ - from: "pkg/recon/sources/kaggle.go"
+ to: "pkg/recon/sources/httpclient.go"
+ via: "Client.Do with req.SetBasicAuth(user, key)"
+ pattern: "SetBasicAuth"
+---
+
+
+Implement KaggleSource querying Kaggle's public REST API for public notebooks
+(kernels). Kaggle uses HTTP Basic auth (username + API key from kaggle.json).
+
+Purpose: RECON-CODE-09.
+Output: pkg/recon/sources/kaggle.go + tests.
+
+
+
+@$HOME/.claude/get-shit-done/workflows/execute-plan.md
+@$HOME/.claude/get-shit-done/templates/summary.md
+
+
+
+@.planning/phases/10-osint-code-hosting/10-CONTEXT.md
+@.planning/phases/10-osint-code-hosting/10-01-SUMMARY.md
+@pkg/recon/source.go
+@pkg/recon/sources/httpclient.go
+
+
+Kaggle API (docs: https://www.kaggle.com/docs/api):
+  GET https://www.kaggle.com/api/v1/kernels/list?search={keyword}&pageSize=50
+  Auth: HTTP Basic (username:key)
+  Response: array of { "ref": "owner/kernel-slug", "title": "...", "author": "..." }
+  URL derivation: https://www.kaggle.com/code/{ref}
+
+Rate limit: 60/min → rate.Every(1*time.Second), burst 1.
+
+
+
+
+
+
+ Task 1: KaggleSource + tests
+ pkg/recon/sources/kaggle.go, pkg/recon/sources/kaggle_test.go
+
+ - Test A: Enabled false when User empty; false when Key empty; true when both set
+ - Test B: Sweep sets Basic auth header via req.SetBasicAuth(user, key)
+ - Test C: Decodes array of {ref} → Findings with Source = baseURL + "/code/" + ref, SourceType="recon:kaggle"
+ - Test D: 401 → ErrUnauthorized
+ - Test E: Ctx cancellation
+ - Test F: Missing creds → Sweep returns nil immediately (no HTTP calls, verified via counter=0)
+
+
+ Create `pkg/recon/sources/kaggle.go`:
+ - Struct `KaggleSource { User, Key, BaseURL, WebBaseURL string; Registry *providers.Registry; Limiters *recon.LimiterRegistry; client *Client }`
+ - Default BaseURL `https://www.kaggle.com`, WebBaseURL same
+ - Name "kaggle", RateLimit rate.Every(1*time.Second), Burst 1, RespectsRobots false
+ - Enabled = s.User != "" && s.Key != ""
+ - Sweep: for each query from BuildQueries(reg, "kaggle"), build
+    `{base}/api/v1/kernels/list?search={query}&pageSize=50`, call req.SetBasicAuth(User, Key),
+    client.Do, decode `[]struct{ Ref string "json:\"ref\"" }`, emit Findings
+ - Compile-time assert
+
+ Create `pkg/recon/sources/kaggle_test.go`:
+ - httptest server that validates Authorization header starts with "Basic " and
+ decodes to "testuser:testkey"
+ - Returns JSON array with 2 refs
+ - Assert 2 Findings with expected Source URLs
+ - Missing-creds test: Sweep returns nil, handler never called (use atomic counter)
+ - 401 and cancellation tests
+
+
+ cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run TestKaggle -v -timeout 30s
+
+
+ KaggleSource passes all tests, implements ReconSource.
+
+
+
+
+
+
+- `go test ./pkg/recon/sources/ -run TestKaggle -v`
+
+
+
+RECON-CODE-09 satisfied.
+
+
+
diff --git a/.planning/phases/10-osint-code-hosting/10-09-PLAN.md b/.planning/phases/10-osint-code-hosting/10-09-PLAN.md
new file mode 100644
index 0000000..984ccde
--- /dev/null
+++ b/.planning/phases/10-osint-code-hosting/10-09-PLAN.md
@@ -0,0 +1,227 @@
+---
+phase: 10-osint-code-hosting
+plan: 09
+type: execute
+wave: 3
+depends_on: [10-01, 10-02, 10-03, 10-04, 10-05, 10-06, 10-07, 10-08]
+files_modified:
+ - pkg/recon/sources/register.go
+ - pkg/recon/sources/register_test.go
+ - pkg/recon/sources/integration_test.go
+ - cmd/recon.go
+autonomous: true
+requirements: []
+must_haves:
+ truths:
+ - "RegisterAll wires all 10 Phase 10 sources onto a recon.Engine"
+ - "cmd/recon.go buildReconEngine() reads viper config + env vars for tokens and calls RegisterAll"
+ - "Integration test spins up httptest servers for all sources, runs SweepAll via Engine, asserts Findings from each source arrive with correct SourceType"
+ - "Guardrail: enabling a source without its required credential logs a skip but does not error"
+ artifacts:
+ - path: "pkg/recon/sources/register.go"
+ provides: "RegisterAll with 10 source constructors wired"
+ contains: "engine.Register"
+ - path: "pkg/recon/sources/integration_test.go"
+ provides: "End-to-end SweepAll test with httptest fixtures for every source"
+ - path: "cmd/recon.go"
+ provides: "CLI reads config and invokes sources.RegisterAll"
+ key_links:
+ - from: "cmd/recon.go"
+ to: "pkg/recon/sources.RegisterAll"
+ via: "sources.RegisterAll(eng, cfg)"
+ pattern: "sources\\.RegisterAll"
+ - from: "pkg/recon/sources/register.go"
+ to: "pkg/recon.Engine.Register"
+ via: "engine.Register(source)"
+ pattern: "engine\\.Register"
+---
+
+
+Final Wave 3 plan: wire every Phase 10 source into `sources.RegisterAll`, update
+`cmd/recon.go` to construct a real `SourcesConfig` from viper/env, and add an
+end-to-end integration test that drives all 10 sources through recon.Engine.SweepAll
+using httptest fixtures.
+
+Purpose: Users can run `keyhunter recon full --sources=github,gitlab,...` and get
+actual findings from any Phase 10 source whose credential is configured.
+Output: Wired register.go + cmd/recon.go + passing integration test.
+
+
+
+@$HOME/.claude/get-shit-done/workflows/execute-plan.md
+@$HOME/.claude/get-shit-done/templates/summary.md
+
+
+
+@.planning/phases/10-osint-code-hosting/10-CONTEXT.md
+@.planning/phases/10-osint-code-hosting/10-01-SUMMARY.md
+@.planning/phases/10-osint-code-hosting/10-02-SUMMARY.md
+@.planning/phases/10-osint-code-hosting/10-03-SUMMARY.md
+@.planning/phases/10-osint-code-hosting/10-04-SUMMARY.md
+@.planning/phases/10-osint-code-hosting/10-05-SUMMARY.md
+@.planning/phases/10-osint-code-hosting/10-06-SUMMARY.md
+@.planning/phases/10-osint-code-hosting/10-07-SUMMARY.md
+@.planning/phases/10-osint-code-hosting/10-08-SUMMARY.md
+@pkg/recon/engine.go
+@pkg/recon/source.go
+@pkg/providers/registry.go
+@cmd/recon.go
+
+
+After Wave 2, each source file in pkg/recon/sources/ exports a constructor
+roughly of the form:
+ func NewGitHubSource(token, reg, lim) *GitHubSource
+ func NewGitLabSource(token, reg, lim) *GitLabSource
+ func NewBitbucketSource(token, workspace, reg, lim) *BitbucketSource
+ func NewGistSource(token, reg, lim) *GistSource
+ func NewCodebergSource(token, reg, lim) *CodebergSource
+ func NewHuggingFaceSource(token, reg, lim) *HuggingFaceSource
+ func NewReplitSource(reg, lim) *ReplitSource
+ func NewCodeSandboxSource(reg, lim) *CodeSandboxSource
+ func NewSandboxesSource(reg, lim) *SandboxesSource
+ func NewKaggleSource(user, key, reg, lim) *KaggleSource
+
+(Verify actual signatures when reading Wave 2 SUMMARYs before writing register.go.)
+
+
+
+
+
+
+ Task 1: Wire RegisterAll + register_test.go
+ pkg/recon/sources/register.go, pkg/recon/sources/register_test.go
+
+ - Test A: RegisterAll with a fresh engine and empty SourcesConfig registers all 10 sources by name (GitHub/GitLab/Bitbucket/Gist/Codeberg/HuggingFace/Replit/CodeSandbox/Sandboxes/Kaggle)
+ - Test B: engine.List() returns all 10 source names in sorted order
+ - Test C: Calling RegisterAll(nil, cfg) is a no-op (no panic)
+ - Test D: Sources without creds are still registered but their Enabled() returns false
+
+
+ Rewrite `pkg/recon/sources/register.go` RegisterAll body to construct each
+ source with appropriate fields from SourcesConfig and call engine.Register:
+ ```go
+ func RegisterAll(engine *recon.Engine, cfg SourcesConfig) {
+ if engine == nil { return }
+ reg := cfg.Registry
+ lim := cfg.Limiters
+ engine.Register(NewGitHubSource(cfg.GitHubToken, reg, lim))
+ engine.Register(NewGitLabSource(cfg.GitLabToken, reg, lim))
+ engine.Register(NewBitbucketSource(cfg.BitbucketToken, cfg.BitbucketWorkspace, reg, lim))
+ engine.Register(NewGistSource(cfg.GitHubToken, reg, lim))
+ engine.Register(NewCodebergSource(cfg.CodebergToken, reg, lim))
+ engine.Register(NewHuggingFaceSource(cfg.HuggingFaceToken, reg, lim))
+ engine.Register(NewReplitSource(reg, lim))
+ engine.Register(NewCodeSandboxSource(reg, lim))
+ engine.Register(NewSandboxesSource(reg, lim))
+ engine.Register(NewKaggleSource(cfg.KaggleUser, cfg.KaggleKey, reg, lim))
+ }
+ ```
+
+ Extend SourcesConfig with any fields Wave 2 introduced (BitbucketWorkspace,
+ CodebergToken). Adjust field names to actual Wave 2 SUMMARY signatures.
+
+ Create `pkg/recon/sources/register_test.go`:
+ - Build minimal registry via providers.NewRegistryFromProviders with 1 synthetic provider
+ - Build recon.Engine, call RegisterAll with cfg having all creds empty
+ - Assert eng.List() returns exactly these 10 names:
+ bitbucket, codeberg, codesandbox, gist, github, gitlab, huggingface, kaggle, replit, sandboxes
+ - Assert nil engine call is no-op (no panic)
+
+
+ cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run TestRegisterAll -v -timeout 30s
+
+
+  RegisterAll wires all 10 sources; register_test green.
+
+
+
+
+ Task 2: Integration test across all sources + cmd/recon.go wiring
+ pkg/recon/sources/integration_test.go, cmd/recon.go
+
+ - Integration test: spins up 10 httptest servers (or one multiplexed server with per-path routing) that return canned responses for each source's endpoints
+ - Uses BaseURL overrides on each source (direct construction, not RegisterAll, since RegisterAll uses production URLs)
+ - Registers each override-configured source on a fresh recon.Engine and calls SweepAll
+ - Asserts at least 1 Finding emerged for each of the 10 SourceType values: recon:github, recon:gitlab, recon:bitbucket, recon:gist, recon:codeberg, recon:huggingface, recon:replit, recon:codesandbox, recon:sandboxes, recon:kaggle
+ - CLI: `keyhunter recon list` (after wiring) prints all 10 source names in addition to "example"
+
+
+ Create `pkg/recon/sources/integration_test.go`:
+ - Build a single httptest server with a mux routing per-path:
+ `/search/code` (github) → ghSearchResponse JSON
+ `/api/v4/search` (gitlab) → blob array JSON
+ `/2.0/workspaces/ws/search/code` (bitbucket) → values JSON
+ `/gists/public` + `/raw/gist1` (gist) → gist list + raw matching keyword
+ `/api/v1/repos/search` (codeberg) → data array
+ `/api/spaces`, `/api/models` (huggingface) → id arrays
+ `/search?q=...&type=repls` (replit) → HTML fixture
+ `/search?query=...&type=sandboxes` (codesandbox) → HTML fixture
+ `/codepen-search` (sandboxes sub) → HTML; `/jsfiddle-search` → JSON
+ `/api/v1/kernels/list` (kaggle) → ref array
+ - For each source, construct with BaseURL/Platforms overrides pointing at test server
+ - Register all on a fresh recon.Engine
+ - Provide synthetic providers.Registry with keyword "sk-proj-" matching openai
+ - Call eng.SweepAll(ctx, recon.Config{Query:"ignored"})
+ - Assert findings grouped by SourceType covers all 10 expected values
+ - Use a 30s test timeout
+
+ Update `cmd/recon.go`:
+ - Import `github.com/salvacybersec/keyhunter/pkg/recon/sources`, `github.com/spf13/viper`, and the providers package
+ - In `buildReconEngine()`:
+ ```go
+ func buildReconEngine() *recon.Engine {
+ e := recon.NewEngine()
+ e.Register(recon.ExampleSource{})
+ reg, err := providers.NewRegistry()
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "recon: failed to load providers: %v\n", err)
+ return e
+ }
+ cfg := sources.SourcesConfig{
+ Registry: reg,
+ Limiters: recon.NewLimiterRegistry(),
+ GitHubToken: firstNonEmpty(os.Getenv("GITHUB_TOKEN"), viper.GetString("recon.github.token")),
+ GitLabToken: firstNonEmpty(os.Getenv("GITLAB_TOKEN"), viper.GetString("recon.gitlab.token")),
+ BitbucketToken: firstNonEmpty(os.Getenv("BITBUCKET_TOKEN"), viper.GetString("recon.bitbucket.token")),
+ BitbucketWorkspace: viper.GetString("recon.bitbucket.workspace"),
+ CodebergToken: firstNonEmpty(os.Getenv("CODEBERG_TOKEN"), viper.GetString("recon.codeberg.token")),
+ HuggingFaceToken: firstNonEmpty(os.Getenv("HUGGINGFACE_TOKEN"), viper.GetString("recon.huggingface.token")),
+ KaggleUser: firstNonEmpty(os.Getenv("KAGGLE_USERNAME"), viper.GetString("recon.kaggle.username")),
+ KaggleKey: firstNonEmpty(os.Getenv("KAGGLE_KEY"), viper.GetString("recon.kaggle.key")),
+ }
+ sources.RegisterAll(e, cfg)
+ return e
+ }
+
+ func firstNonEmpty(a, b string) string { if a != "" { return a }; return b }
+ ```
+ - Preserve existing reconFullCmd / reconListCmd behavior.
+
+
+ cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run TestIntegration -v -timeout 60s && go build ./... && go run . recon list | sort
+
+
+ Integration test passes with at least one Finding per SourceType across all 10
+ sources. `keyhunter recon list` prints all 10 source names plus "example".
+
+
+
+
+
+
+- `go build ./...`
+- `go vet ./...`
+- `go test ./pkg/recon/sources/... -v -timeout 60s`
+- `go test ./pkg/recon/... -timeout 60s` (ensure no regression in Phase 9 recon tests)
+- `go run . recon list` prints all 10 new source names
+
+
+
+All Phase 10 code hosting sources registered via sources.RegisterAll, wired into
+cmd/recon.go, and exercised end-to-end by an integration test hitting httptest
+fixtures for every source. Phase 10 requirements RECON-CODE-01..10 complete.
+
+
+