diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md index 1d6c98e..498bd84 100644 --- a/.planning/ROADMAP.md +++ b/.planning/ROADMAP.md @@ -215,7 +215,17 @@ Plans: 3. `keyhunter recon --sources=gist,bitbucket,codeberg` scans public gists, Bitbucket repos, and Codeberg/Gitea instances 4. `keyhunter recon --sources=replit,codesandbox,kaggle` scans public repls, sandboxes, and notebooks 5. All code hosting source findings are stored in the database with source attribution and deduplication -**Plans**: TBD +**Plans**: 9 plans +Plans: +- [ ] 10-01-PLAN.md — Shared HTTP client + provider-query generator + RegisterAll skeleton +- [ ] 10-02-PLAN.md — GitHubSource (RECON-CODE-01) +- [ ] 10-03-PLAN.md — GitLabSource (RECON-CODE-02) +- [ ] 10-04-PLAN.md — BitbucketSource + GistSource (RECON-CODE-03, RECON-CODE-04) +- [ ] 10-05-PLAN.md — CodebergSource/Gitea (RECON-CODE-05) +- [ ] 10-06-PLAN.md — HuggingFaceSource (RECON-CODE-08) +- [ ] 10-07-PLAN.md — Replit + CodeSandbox + Sandboxes scrapers (RECON-CODE-06, RECON-CODE-07, RECON-CODE-10) +- [ ] 10-08-PLAN.md — KaggleSource (RECON-CODE-09) +- [ ] 10-09-PLAN.md — RegisterAll wiring + CLI integration + end-to-end test ### Phase 11: OSINT Search & Paste **Goal**: Users can run automated search engine dorking against Google, Bing, DuckDuckGo, Yandex, and Brave, and scan 15+ paste site aggregations for leaked API keys diff --git a/.planning/phases/10-osint-code-hosting/10-01-PLAN.md b/.planning/phases/10-osint-code-hosting/10-01-PLAN.md new file mode 100644 index 0000000..d7d627a --- /dev/null +++ b/.planning/phases/10-osint-code-hosting/10-01-PLAN.md @@ -0,0 +1,331 @@ +--- +phase: 10-osint-code-hosting +plan: 01 +type: execute +wave: 1 +depends_on: [] +files_modified: + - pkg/recon/sources/doc.go + - pkg/recon/sources/httpclient.go + - pkg/recon/sources/httpclient_test.go + - pkg/recon/sources/queries.go + - pkg/recon/sources/queries_test.go + - pkg/recon/sources/register.go +autonomous: true +requirements: [] +must_haves: + 
truths: + - "Shared retry HTTP client honors ctx cancellation and Retry-After on 429/403" + - "Provider registry drives per-source query templates (no hardcoded literals)" + - "Empty source registry compiles and exposes RegisterAll(engine, cfg)" + artifacts: + - path: "pkg/recon/sources/httpclient.go" + provides: "Retrying *http.Client with context + Retry-After handling" + - path: "pkg/recon/sources/queries.go" + provides: "BuildQueries(registry, sourceName) []string generator" + - path: "pkg/recon/sources/register.go" + provides: "RegisterAll(engine *recon.Engine, cfg SourcesConfig) bootstrap" + key_links: + - from: "pkg/recon/sources/httpclient.go" + to: "net/http + context + golang.org/x/time/rate" + via: "DoWithRetry(ctx, req, limiter) (*http.Response, error)" + pattern: "DoWithRetry" + - from: "pkg/recon/sources/queries.go" + to: "pkg/providers.Registry" + via: "BuildQueries iterates reg.List() and formats provider keywords" + pattern: "BuildQueries" +--- + + +Establish the shared foundation for all Phase 10 code hosting sources: a retry-aware HTTP +client wrapper, a provider→query template generator driven by the provider registry, and +an empty RegisterAll bootstrap that Plan 10-09 will fill in. No individual source is +implemented here — this plan exists so Wave 2 plans (10-02..10-08) can run in parallel +without fighting over shared helpers. + +Purpose: Deduplicate retry/rate-limit/backoff logic across 10 sources; centralize query +generation so providers added later automatically flow to every source. +Output: Compilable `pkg/recon/sources` package skeleton with tested helpers. 
+ + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@.planning/PROJECT.md +@.planning/ROADMAP.md +@.planning/STATE.md +@.planning/phases/10-osint-code-hosting/10-CONTEXT.md +@pkg/recon/source.go +@pkg/recon/limiter.go +@pkg/dorks/github.go +@pkg/providers/registry.go + + +From pkg/recon/source.go: +```go +type ReconSource interface { + Name() string + RateLimit() rate.Limit + Burst() int + RespectsRobots() bool + Enabled(cfg Config) bool + Sweep(ctx context.Context, query string, out chan<- Finding) error +} +type Finding = engine.Finding +type Config struct { Stealth, RespectRobots bool; EnabledSources []string; Query string } +``` + +From pkg/recon/limiter.go: +```go +type LimiterRegistry struct { ... } +func NewLimiterRegistry() *LimiterRegistry +func (lr *LimiterRegistry) Wait(ctx, name, r, burst, stealth) error +``` + +From pkg/providers/registry.go: +```go +func (r *Registry) List() []Provider +// Provider has: Name string, Keywords []string, Patterns []Pattern, Tier int +``` + +From pkg/engine/finding.go: +```go +type Finding struct { + ProviderName, KeyValue, KeyMasked, Confidence, Source, SourceType string + LineNumber int; Offset int64; DetectedAt time.Time + Verified bool; VerifyStatus string; ... 
+} +``` + + + + + + + Task 1: Shared retry HTTP client helper + pkg/recon/sources/doc.go, pkg/recon/sources/httpclient.go, pkg/recon/sources/httpclient_test.go + + - Test A: 200 OK returns response unchanged, body readable + - Test B: 429 with Retry-After:1 triggers one retry then succeeds (verify via httptest counter) + - Test C: 403 with Retry-After triggers retry + - Test D: 401 returns ErrUnauthorized immediately, no retry + - Test E: Ctx cancellation during retry sleep returns ctx.Err() + - Test F: MaxRetries exhausted returns wrapped last-status error + + + Create `pkg/recon/sources/doc.go` with the package comment: "Package sources hosts per-OSINT-source ReconSource implementations for Phase 10 code hosting (GitHub, GitLab, Bitbucket, Gist, Codeberg, HuggingFace, Kaggle, Replit, CodeSandbox, sandboxes). Each source implements pkg/recon.ReconSource." + + Create `pkg/recon/sources/httpclient.go` exporting: + ```go + package sources + + import ( + "context" + "errors" + "fmt" + "net/http" + "strconv" + "time" + ) + + // ErrUnauthorized is returned when an API rejects credentials (401). + var ErrUnauthorized = errors.New("sources: unauthorized (check credentials)") + + // Client is the shared retry wrapper every Phase 10 source uses. + type Client struct { + HTTP *http.Client + MaxRetries int // default 2 + UserAgent string // default "keyhunter-recon/1.0" + } + + // NewClient returns a Client with a 30s timeout and 2 retries. + func NewClient() *Client { + return &Client{HTTP: &http.Client{Timeout: 30 * time.Second}, MaxRetries: 2, UserAgent: "keyhunter-recon/1.0"} + } + + // Do executes req with retries on 429/403/5xx honoring Retry-After. + // 401 returns ErrUnauthorized wrapped with the response body. + // Ctx cancellation is honored during sleeps. 
+ func (c *Client) Do(ctx context.Context, req *http.Request) (*http.Response, error) { + if req.Header.Get("User-Agent") == "" { req.Header.Set("User-Agent", c.UserAgent) } + var last *http.Response + for attempt := 0; attempt <= c.MaxRetries; attempt++ { + r, err := c.HTTP.Do(req.WithContext(ctx)) + if err != nil { return nil, fmt.Errorf("sources http: %w", err) } + if r.StatusCode == http.StatusOK { return r, nil } + if r.StatusCode == http.StatusUnauthorized { + body := readBody(r) + return nil, fmt.Errorf("%w: %s", ErrUnauthorized, body) + } + retriable := r.StatusCode == 429 || r.StatusCode == 403 || r.StatusCode >= 500 + if !retriable || attempt == c.MaxRetries { + body := readBody(r) + return nil, fmt.Errorf("sources http %d: %s", r.StatusCode, body) + } + sleep := ParseRetryAfter(r.Header.Get("Retry-After")) + r.Body.Close() + last = r + select { + case <-time.After(sleep): + case <-ctx.Done(): return nil, ctx.Err() + } + } + _ = last + return nil, fmt.Errorf("sources http: retries exhausted") + } + + // ParseRetryAfter decodes integer-seconds Retry-After, defaulting to 1s. + func ParseRetryAfter(v string) time.Duration { ... } + // readBody reads up to 4KB of the body and closes it. + func readBody(r *http.Response) string { ... } + ``` + + Create `pkg/recon/sources/httpclient_test.go` using `net/http/httptest`: + - Table-driven tests for each behavior above. Use an atomic counter to verify + retry attempt counts. Use `httptest.NewServer` with a handler that switches on + a request counter. + - For ctx cancellation test: set Retry-After: 10, cancel ctx inside 100ms, assert + ctx.Err() returned within 500ms. + + Do NOT build a LimiterRegistry wrapper here — each source calls its own LimiterRegistry.Wait + before calling Client.Do. Keeps Client single-purpose (retry only). 
+ + + cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run TestClient -v -timeout 30s + + + All behaviors covered; Client.Do retries on 429/403/5xx honoring Retry-After; 401 + returns ErrUnauthorized immediately; ctx cancellation respected; tests green. + + + + + Task 2: Provider-driven query generator + RegisterAll skeleton + pkg/recon/sources/queries.go, pkg/recon/sources/queries_test.go, pkg/recon/sources/register.go + + - Test A: BuildQueries(reg, "github") returns one query per (provider, keyword) tuple formatted as GitHub search syntax, e.g. `"sk-proj-" in:file` + - Test B: BuildQueries(reg, "gitlab") returns queries formatted for GitLab search syntax (raw keyword, no `in:file`) + - Test C: BuildQueries(reg, "huggingface") returns bare keyword queries + - Test D: Unknown source name returns bare keyword queries (safe default) + - Test E: Providers with empty Keywords slice are skipped + - Test F: Keyword dedup — if two providers share keyword, emit once per source + - Test G: RegisterAll(nil, cfg) is a no-op that does not panic; RegisterAll with empty cfg does not panic + + + Create `pkg/recon/sources/queries.go`: + ```go + package sources + + import ( + "fmt" + "sort" + + "github.com/salvacybersec/keyhunter/pkg/providers" + ) + + // BuildQueries produces the search-string list a source should iterate for a + // given provider registry. Each keyword is formatted per source-specific syntax. + // Result is deterministic (sorted) for reproducible tests. 
+ func BuildQueries(reg *providers.Registry, source string) []string { + if reg == nil { return nil } + seen := make(map[string]struct{}) + for _, p := range reg.List() { + for _, k := range p.Keywords { + if k == "" { continue } + seen[k] = struct{}{} + } + } + keywords := make([]string, 0, len(seen)) + for k := range seen { keywords = append(keywords, k) } + sort.Strings(keywords) + + out := make([]string, 0, len(keywords)) + for _, k := range keywords { + out = append(out, formatQuery(source, k)) + } + return out + } + + func formatQuery(source, keyword string) string { + switch source { + case "github", "gist": + return fmt.Sprintf("%q in:file", keyword) + case "gitlab": + return keyword // GitLab code search doesn't support in:file qualifier + case "bitbucket": + return keyword + case "codeberg": + return keyword + default: + return keyword + } + } + ``` + + Create `pkg/recon/sources/queries_test.go` using `providers.NewRegistryFromProviders` + with two synthetic providers (shared keyword to test dedup). + + Create `pkg/recon/sources/register.go`: + ```go + package sources + + import ( + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" + ) + + // SourcesConfig carries per-source credentials read from viper/env by cmd/recon.go. + // Plan 10-09 fleshes this out; for now it is a placeholder struct so downstream + // plans can depend on its shape. + type SourcesConfig struct { + GitHubToken string + GitLabToken string + BitbucketToken string + HuggingFaceToken string + KaggleUser string + KaggleKey string + Registry *providers.Registry + Limiters *recon.LimiterRegistry + } + + // RegisterAll registers every Phase 10 code-hosting source on engine. + // Wave 2 plans append their source constructors here via additional + // registerXxx helpers in this file. Plan 10-09 writes the final list. 
+ func RegisterAll(engine *recon.Engine, cfg SourcesConfig) { + if engine == nil { return } + // Populated by Plan 10-09 (after Wave 2 lands individual source files). + } + ``` + + Do NOT wire this into cmd/recon.go yet — Plan 10-09 handles CLI integration after + every source exists. + + + cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run "TestBuildQueries|TestRegisterAll" -v -timeout 30s && go build ./... + + + BuildQueries is deterministic, dedups keywords, formats per-source syntax. + RegisterAll compiles as a no-op stub. Package builds with zero source + implementations — ready for Wave 2 plans to add files in parallel. + + + + + + +- `go build ./...` succeeds +- `go test ./pkg/recon/sources/...` passes +- `go vet ./pkg/recon/sources/...` clean + + + +pkg/recon/sources package exists with httpclient.go, queries.go, register.go, doc.go +and all tests green. No source implementations present yet — that is Wave 2. + + + +After completion, create `.planning/phases/10-osint-code-hosting/10-01-SUMMARY.md`. 
+ diff --git a/.planning/phases/10-osint-code-hosting/10-02-PLAN.md b/.planning/phases/10-osint-code-hosting/10-02-PLAN.md new file mode 100644 index 0000000..c9dff5f --- /dev/null +++ b/.planning/phases/10-osint-code-hosting/10-02-PLAN.md @@ -0,0 +1,238 @@ +--- +phase: 10-osint-code-hosting +plan: 02 +type: execute +wave: 2 +depends_on: [10-01] +files_modified: + - pkg/recon/sources/github.go + - pkg/recon/sources/github_test.go +autonomous: true +requirements: [RECON-CODE-01] +must_haves: + truths: + - "GitHubSource.Sweep runs BuildQueries against GitHub /search/code and emits engine.Finding per match" + - "GitHubSource is disabled when cfg token is empty (logs and returns nil, no error)" + - "GitHubSource honors ctx cancellation mid-query and rate limiter tokens before each request" + - "Each Finding has SourceType=\"recon:github\" and Source = html_url" + artifacts: + - path: "pkg/recon/sources/github.go" + provides: "GitHubSource implementing recon.ReconSource" + contains: "func (s *GitHubSource) Sweep" + - path: "pkg/recon/sources/github_test.go" + provides: "httptest-driven unit tests" + key_links: + - from: "pkg/recon/sources/github.go" + to: "pkg/recon/sources/httpclient.go" + via: "Client.Do" + pattern: "c\\.client\\.Do" + - from: "pkg/recon/sources/github.go" + to: "pkg/recon/sources/queries.go" + via: "BuildQueries(reg, \"github\")" + pattern: "BuildQueries" +--- + + +Implement GitHubSource — the first real Phase 10 recon source. Refactors logic from +pkg/dorks/github.go (Phase 8's GitHubExecutor) into a recon.ReconSource. Emits +engine.Finding entries for every /search/code match, driven by provider keyword +queries from pkg/recon/sources/queries.go. + +Purpose: RECON-CODE-01 — users can scan GitHub public code for leaked LLM keys. +Output: pkg/recon/sources/github.go + green tests. 
+ + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@.planning/phases/10-osint-code-hosting/10-CONTEXT.md +@.planning/phases/10-osint-code-hosting/10-01-SUMMARY.md +@pkg/recon/source.go +@pkg/recon/limiter.go +@pkg/dorks/github.go +@pkg/recon/sources/httpclient.go +@pkg/recon/sources/queries.go +@pkg/recon/sources/register.go + + +Reference pkg/dorks/github.go for the response struct shapes (ghSearchResponse, +ghCodeItem, ghRepository, ghTextMatchEntry) — copy or alias them. GitHub Code Search +endpoint: GET /search/code?q=&per_page= with headers: +- Accept: application/vnd.github.v3.text-match+json +- Authorization: Bearer +- User-Agent: keyhunter-recon + +Rate limit: 30 req/min authenticated → rate.Every(2*time.Second), burst 1. + + + + + + + Task 1: GitHubSource implementation + tests + pkg/recon/sources/github.go, pkg/recon/sources/github_test.go + + - Test A: Enabled returns false when token empty; true when token set + - Test B: Sweep with empty token returns nil (no error, logs disabled) + - Test C: Sweep against httptest server decodes a 2-item response, emits 2 Findings on channel with SourceType="recon:github" and Source=html_url + - Test D: ProviderName is derived by matching query keyword back to provider via the registry (pass in synthetic registry) + - Test E: Ctx cancellation before first request returns ctx.Err() + - Test F: 401 from server returns wrapped ErrUnauthorized + - Test G: Multiple queries (from BuildQueries) iterate in sorted order + + + Create `pkg/recon/sources/github.go`: + ```go + package sources + + import ( + "context" + "encoding/json" + "errors" + "fmt" + "net/http" + "net/url" + "time" + + "golang.org/x/time/rate" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" + ) + + // GitHubSource implements recon.ReconSource against GitHub Code Search. + // RECON-CODE-01. 
+ type GitHubSource struct { + Token string + BaseURL string // default https://api.github.com, overridable for tests + Registry *providers.Registry + Limiters *recon.LimiterRegistry + client *Client + } + + // NewGitHubSource constructs a source. If client is nil, NewClient() is used. + func NewGitHubSource(token string, reg *providers.Registry, lim *recon.LimiterRegistry) *GitHubSource { + return &GitHubSource{Token: token, BaseURL: "https://api.github.com", Registry: reg, Limiters: lim, client: NewClient()} + } + + func (s *GitHubSource) Name() string { return "github" } + func (s *GitHubSource) RateLimit() rate.Limit { return rate.Every(2 * time.Second) } + func (s *GitHubSource) Burst() int { return 1 } + func (s *GitHubSource) RespectsRobots() bool { return false } + func (s *GitHubSource) Enabled(_ recon.Config) bool { return s.Token != "" } + + func (s *GitHubSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { + if s.Token == "" { return nil } + base := s.BaseURL + if base == "" { base = "https://api.github.com" } + + queries := BuildQueries(s.Registry, "github") + kwToProvider := keywordIndex(s.Registry) + + for _, q := range queries { + if err := ctx.Err(); err != nil { return err } + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { return err } + } + endpoint := fmt.Sprintf("%s/search/code?q=%s&per_page=30", base, url.QueryEscape(q)) + req, _ := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) + req.Header.Set("Accept", "application/vnd.github.v3.text-match+json") + req.Header.Set("Authorization", "Bearer "+s.Token) + + resp, err := s.client.Do(ctx, req) + if err != nil { + if errors.Is(err, ErrUnauthorized) { return err } + // Other errors: log-and-continue per CONTEXT (sources downgrade, not abort) + continue + } + var parsed ghSearchResponse + _ = json.NewDecoder(resp.Body).Decode(&parsed) + resp.Body.Close() + + provName := 
kwToProvider[extractKeyword(q)] + for _, it := range parsed.Items { + snippet := "" + if len(it.TextMatches) > 0 { snippet = it.TextMatches[0].Fragment } + f := recon.Finding{ + ProviderName: provName, + KeyMasked: "", + Confidence: "low", + Source: it.HTMLURL, + SourceType: "recon:github", + DetectedAt: time.Now(), + } + _ = snippet // reserved for future content scan pass + select { + case out <- f: + case <-ctx.Done(): return ctx.Err() + } + } + } + return nil + } + + // Response structs mirror pkg/dorks/github.go (kept private to this file + // to avoid cross-package coupling between dorks and recon/sources). + type ghSearchResponse struct { Items []ghCodeItem `json:"items"` } + type ghCodeItem struct { + HTMLURL string `json:"html_url"` + Repository ghRepository `json:"repository"` + TextMatches []ghTextMatchEntry `json:"text_matches"` + } + type ghRepository struct { FullName string `json:"full_name"` } + type ghTextMatchEntry struct { Fragment string `json:"fragment"` } + + // keywordIndex maps keyword -> provider name using the registry. + func keywordIndex(reg *providers.Registry) map[string]string { + m := make(map[string]string) + if reg == nil { return m } + for _, p := range reg.List() { + for _, k := range p.Keywords { m[k] = p.Name } + } + return m + } + + // extractKeyword parses the provider keyword out of a BuildQueries output. + // For github it's `"keyword" in:file`; for bare formats it's the whole string. + func extractKeyword(q string) string { ... strip quotes, trim ` in:file` suffix ... 
} + ``` + + Create `pkg/recon/sources/github_test.go`: + - Use `providers.NewRegistryFromProviders` with 2 synthetic providers (openai/sk-proj-, anthropic/sk-ant-) + - Spin up `httptest.NewServer` that inspects `r.URL.Query().Get("q")` and returns + a JSON body with two items whose html_url encodes the query + - Assert 2 findings per query received on the channel within 2s using select/time.After + - Separate test for empty token: NewGitHubSource("", reg, lim).Sweep returns nil immediately + - Separate test for 401: server returns 401 → Sweep returns error wrapping ErrUnauthorized + - Cancel-test: cancel ctx before Sweep call; assert ctx.Err() returned + + Leave GitHubSource unregistered (Plan 10-09 adds it to RegisterAll). + + + cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run TestGitHub -v -timeout 30s + + + GitHubSource satisfies recon.ReconSource (compile-time assert via `var _ recon.ReconSource = (*GitHubSource)(nil)`), + tests green, covers happy path + empty token + 401 + cancellation. + + + + + + +- `go build ./...` +- `go test ./pkg/recon/sources/ -run TestGitHub -v` +- `go vet ./pkg/recon/sources/...` + + + +RECON-CODE-01 satisfied: GitHubSource queries /search/code using provider-registry-driven +keywords and emits engine.Finding. Ready for registration in Plan 10-09. + + + +After completion, create `.planning/phases/10-osint-code-hosting/10-02-SUMMARY.md`. 
+ diff --git a/.planning/phases/10-osint-code-hosting/10-03-PLAN.md b/.planning/phases/10-osint-code-hosting/10-03-PLAN.md new file mode 100644 index 0000000..f6da797 --- /dev/null +++ b/.planning/phases/10-osint-code-hosting/10-03-PLAN.md @@ -0,0 +1,120 @@ +--- +phase: 10-osint-code-hosting +plan: 03 +type: execute +wave: 2 +depends_on: [10-01] +files_modified: + - pkg/recon/sources/gitlab.go + - pkg/recon/sources/gitlab_test.go +autonomous: true +requirements: [RECON-CODE-02] +must_haves: + truths: + - "GitLabSource.Sweep queries GitLab /api/v4/search?scope=blobs and emits Findings" + - "Disabled when token empty; enabled otherwise" + - "Findings have SourceType=\"recon:gitlab\" and Source = web_url of blob" + artifacts: + - path: "pkg/recon/sources/gitlab.go" + provides: "GitLabSource implementing recon.ReconSource" + - path: "pkg/recon/sources/gitlab_test.go" + provides: "httptest tests" + key_links: + - from: "pkg/recon/sources/gitlab.go" + to: "pkg/recon/sources/httpclient.go" + via: "c.client.Do(ctx, req)" + pattern: "client\\.Do" +--- + + +Implement GitLabSource against GitLab's Search API (/api/v4/search?scope=blobs). +Honors PRIVATE-TOKEN header auth, 2000 req/min rate limit. + +Purpose: RECON-CODE-02. +Output: pkg/recon/sources/gitlab.go + tests. + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@.planning/phases/10-osint-code-hosting/10-CONTEXT.md +@.planning/phases/10-osint-code-hosting/10-01-SUMMARY.md +@pkg/recon/source.go +@pkg/recon/sources/httpclient.go +@pkg/recon/sources/queries.go + + +GitLab Search API (docs: https://docs.gitlab.com/ee/api/search.html): + GET /api/v4/search?scope=blobs&search=&per_page=20 + Header: PRIVATE-TOKEN: +Response (array of blob objects): + [{ "basename": "...", "data": "matched snippet", "path": "...", "project_id": 123, + "ref": "main", "startline": 42 }, ...] 
+Project web_url must be constructed from project_id → fetch /api/v4/projects/{project_id} (or +just use basename+path with a placeholder Source — keep it minimal: Source = +"https://gitlab.com/projects/{project_id}/-/blob/{ref}/{path}"). + +Rate limit: 2000 req/min → rate.Every(30 * time.Millisecond) ≈ 2000/min, burst 5. + + + + + + + Task 1: GitLabSource implementation + tests + pkg/recon/sources/gitlab.go, pkg/recon/sources/gitlab_test.go + + - Test A: Enabled false when token empty + - Test B: Sweep queries /api/v4/search with scope=blobs, PRIVATE-TOKEN header set + - Test C: Decodes array response, emits one Finding per blob with Source containing project_id + path + ref + - Test D: 401 returns wrapped ErrUnauthorized + - Test E: Ctx cancellation respected + - Test F: Empty token → Sweep returns nil with no calls + + + Create `pkg/recon/sources/gitlab.go` with struct `GitLabSource { Token, BaseURL string; Registry *providers.Registry; Limiters *recon.LimiterRegistry; client *Client }`. + + Default BaseURL: `https://gitlab.com`. + Name: "gitlab". RateLimit: `rate.Every(30 * time.Millisecond)`. Burst: 5. RespectsRobots: false. + + Sweep loop: + - For each query from BuildQueries(reg, "gitlab"): + - Build `base + /api/v4/search?scope=blobs&search={query}&per_page=20` + - Set header `PRIVATE-TOKEN: {token}` + - limiters.Wait, then client.Do + - Decode `[]glBlob` where glBlob has ProjectID int, Path, Ref, Data, Startline + - Emit Finding with Source = fmt.Sprintf("%s/projects/%d/-/blob/%s/%s", base, b.ProjectID, b.Ref, b.Path), SourceType="recon:gitlab", Confidence="low", ProviderName derived via keywordIndex(reg) + - Respect ctx.Done on send + + Add compile-time assert: `var _ recon.ReconSource = (*GitLabSource)(nil)`. + + Create `pkg/recon/sources/gitlab_test.go` with httptest server returning a JSON + array of two blob objects. Assert both Findings received, Source URLs contain + project IDs, ctx cancellation test, 401 test, empty-token test. Use synthetic
+ + + cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run TestGitLab -v -timeout 30s + + + GitLabSource compiles, implements ReconSource, all test behaviors covered. + + + + + + +- `go build ./...` +- `go test ./pkg/recon/sources/ -run TestGitLab -v` + + + +RECON-CODE-02 satisfied. + + + +After completion, create `.planning/phases/10-osint-code-hosting/10-03-SUMMARY.md`. + diff --git a/.planning/phases/10-osint-code-hosting/10-04-PLAN.md b/.planning/phases/10-osint-code-hosting/10-04-PLAN.md new file mode 100644 index 0000000..fd658f7 --- /dev/null +++ b/.planning/phases/10-osint-code-hosting/10-04-PLAN.md @@ -0,0 +1,163 @@ +--- +phase: 10-osint-code-hosting +plan: 04 +type: execute +wave: 2 +depends_on: [10-01] +files_modified: + - pkg/recon/sources/bitbucket.go + - pkg/recon/sources/bitbucket_test.go + - pkg/recon/sources/gist.go + - pkg/recon/sources/gist_test.go +autonomous: true +requirements: [RECON-CODE-03, RECON-CODE-04] +must_haves: + truths: + - "BitbucketSource queries Bitbucket 2.0 code search API and emits Findings" + - "GistSource queries GitHub Gist search (re-uses GitHub token) and emits Findings" + - "Both disabled when respective credentials are empty" + artifacts: + - path: "pkg/recon/sources/bitbucket.go" + provides: "BitbucketSource implementing recon.ReconSource" + - path: "pkg/recon/sources/gist.go" + provides: "GistSource implementing recon.ReconSource" + key_links: + - from: "pkg/recon/sources/gist.go" + to: "pkg/recon/sources/httpclient.go" + via: "Client.Do with Bearer " + pattern: "client\\.Do" + - from: "pkg/recon/sources/bitbucket.go" + to: "pkg/recon/sources/httpclient.go" + via: "Client.Do" + pattern: "client\\.Do" +--- + + +Implement BitbucketSource (RECON-CODE-03) and GistSource (RECON-CODE-04). Grouped +because both are small API integrations with similar shapes (JSON array/values, +per-item URL, token gating). + +Purpose: RECON-CODE-03, RECON-CODE-04. +Output: Two new ReconSource implementations + tests. 
+ + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@.planning/phases/10-osint-code-hosting/10-CONTEXT.md +@.planning/phases/10-osint-code-hosting/10-01-SUMMARY.md +@pkg/recon/source.go +@pkg/recon/sources/httpclient.go +@pkg/recon/sources/queries.go + + +Bitbucket 2.0 search (docs: https://developer.atlassian.com/cloud/bitbucket/rest/api-group-search/): + GET /2.0/workspaces/{workspace}/search/code?search_query={query} + Auth: Bearer {token} (app password or OAuth) + Response: { "values": [{ "content_match_count": N, "file": {"path":"","commit":{...}}, "page_url": "..." }] } + Note: Requires a workspace param — make it configurable via SourcesConfig.BitbucketWorkspace; + if unset, source is disabled. Rate: 1000/hour → rate.Every(3.6 * time.Second), burst 1. + +GitHub Gist search: GitHub does not expose a dedicated /search/gists endpoint that +searches gist contents. Use the /gists/public endpoint + client-side filtering as +fallback: GET /gists/public?per_page=100 returns public gists; for each gist, fetch +/gists/{id} and scan file contents for keyword matches. Keep implementation minimal: +just enumerate the first page, match against keyword list, emit Findings with +Source = gist.html_url. Auth: Bearer {github token}. Rate: 30/min → rate.Every(2s).
+ + + + + + + Task 1: BitbucketSource + tests + pkg/recon/sources/bitbucket.go, pkg/recon/sources/bitbucket_test.go + + - Test A: Enabled false when token OR workspace empty + - Test B: Enabled true when both set + - Test C: Sweep queries /2.0/workspaces/{ws}/search/code with Bearer header + - Test D: Decodes `{values:[{file:{path,commit:{...}},page_url:"..."}]}` and emits Finding with Source=page_url, SourceType="recon:bitbucket" + - Test E: 401 → ErrUnauthorized + - Test F: Ctx cancellation + + + Create `pkg/recon/sources/bitbucket.go`: + - Struct `BitbucketSource { Token, Workspace, BaseURL string; Registry *providers.Registry; Limiters *recon.LimiterRegistry; client *Client }` + - Default BaseURL: `https://api.bitbucket.org` + - Name "bitbucket", RateLimit rate.Every(3600*time.Millisecond), Burst 1, RespectsRobots false + - Enabled = s.Token != "" && s.Workspace != "" + - Sweep: for each query in BuildQueries(reg, "bitbucket"), limiters.Wait, issue + GET request, decode into struct with `Values []struct{ PageURL string "json:page_url"; File struct{ Path string } "json:file" }`, emit Findings + - Compile-time assert `var _ recon.ReconSource = (*BitbucketSource)(nil)` + + Create `pkg/recon/sources/bitbucket_test.go` with httptest server, synthetic + registry, assertions on URL path `/2.0/workspaces/testws/search/code`, Bearer + header, and emitted Findings. + + + cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run TestBitbucket -v -timeout 30s + + + BitbucketSource passes all tests, implements ReconSource. 
+ + + + + Task 2: GistSource + tests + pkg/recon/sources/gist.go, pkg/recon/sources/gist_test.go + + - Test A: Enabled false when GitHub token empty + - Test B: Sweep fetches /gists/public?per_page=100 with Bearer auth + - Test C: For each gist, iterates files map; if any file.content contains a provider keyword, emits one Finding with Source=gist.html_url + - Test D: Ctx cancellation + - Test E: 401 → ErrUnauthorized + - Test F: Gist without matching keyword → no Finding emitted + + + Create `pkg/recon/sources/gist.go`: + - Struct `GistSource { Token, BaseURL string; Registry *providers.Registry; Limiters *recon.LimiterRegistry; client *Client }` + - BaseURL default `https://api.github.com` + - Name "gist", RateLimit rate.Every(2*time.Second), Burst 1, RespectsRobots false + - Enabled = s.Token != "" + - Sweep flow: + 1. Build keyword list from registry (flat set) + 2. GET /gists/public?per_page=100 with Bearer header + 3. Decode `[]struct{ HTMLURL string "json:html_url"; Files map[string]struct{ Filename, RawURL string "json:raw_url" } "json:files" }` + 4. For each gist, for each file, if we can match without fetching raw content, + skip raw fetch (keep Phase 10 minimal). Fallback: fetch file.RawURL and + scan content for any keyword from the set; on hit, emit one Finding + per gist (not per file) with ProviderName from matched keyword. + 5. 
Respect limiters.Wait before each outbound request (gist list + each raw fetch) + - Compile-time assert `var _ recon.ReconSource = (*GistSource)(nil)` + + Create `pkg/recon/sources/gist_test.go`: + - httptest server with two routes: `/gists/public` returns 2 gists each with 1 file, raw_url pointing to same server `/raw/`; `/raw/` returns content containing "sk-proj-" for one and an unrelated string for the other + - Assert exactly 1 Finding emitted, Source matches the gist's html_url + - 401 test, ctx cancellation test, empty-token test + + + cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run TestGist -v -timeout 30s + + + GistSource emits Findings only when a known provider keyword is present in a gist + file body; all tests green. + + + + + + +- `go build ./...` +- `go test ./pkg/recon/sources/ -run "TestBitbucket|TestGist" -v` + + + +RECON-CODE-03 and RECON-CODE-04 satisfied. + + + +After completion, create `.planning/phases/10-osint-code-hosting/10-04-SUMMARY.md`. + diff --git a/.planning/phases/10-osint-code-hosting/10-05-PLAN.md b/.planning/phases/10-osint-code-hosting/10-05-PLAN.md new file mode 100644 index 0000000..5eb92da --- /dev/null +++ b/.planning/phases/10-osint-code-hosting/10-05-PLAN.md @@ -0,0 +1,113 @@ +--- +phase: 10-osint-code-hosting +plan: 05 +type: execute +wave: 2 +depends_on: [10-01] +files_modified: + - pkg/recon/sources/codeberg.go + - pkg/recon/sources/codeberg_test.go +autonomous: true +requirements: [RECON-CODE-05] +must_haves: + truths: + - "CodebergSource queries Gitea REST API /api/v1/repos/search and /api/v1/repos/.../contents for keyword matches" + - "No token required for public repos (but optional token honored if provided)" + - "Findings tagged SourceType=\"recon:codeberg\"" + artifacts: + - path: "pkg/recon/sources/codeberg.go" + provides: "CodebergSource implementing recon.ReconSource (Gitea-compatible)" + key_links: + - from: "pkg/recon/sources/codeberg.go" + to: "pkg/recon/sources/httpclient.go" + via: 
"Client.Do" + pattern: "client\\.Do" +--- + + +Implement CodebergSource targeting Gitea's REST API. Codeberg.org runs Gitea, so the +same code works for any Gitea instance by configuring BaseURL. Public repos do not +require auth, but a token can be passed to raise rate limits. + +Purpose: RECON-CODE-05. +Output: pkg/recon/sources/codeberg.go + tests. + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@.planning/phases/10-osint-code-hosting/10-CONTEXT.md +@.planning/phases/10-osint-code-hosting/10-01-SUMMARY.md +@pkg/recon/source.go +@pkg/recon/sources/httpclient.go + + +Gitea API (v1, docs: https://docs.gitea.com/api): + GET /api/v1/repos/search?q=&limit=50 + Response: { "data": [{ "full_name": "...", "html_url": "..." }], "ok": true } + Header (optional): Authorization: token + +For this phase we only use /repos/search — matching on repo metadata (name/description). +Full-content code search is not uniformly available across Gitea instances (Codeberg +has gitea "code search" enabled via Bleve index; we rely on it when present via +GET /api/v1/repos/search?q=... which returns repos only. For content matching we +fall back to searching each provider keyword as a query string and emitting Findings +keyed to the repo html_url). + +Rate: public unauth 60 req/hour → rate.Every(60 * time.Second). Burst 1. +With token: 1000/hour → rate.Every(3600 * time.Millisecond). Detect via token presence. 
+ + + + + + + Task 1: CodebergSource + tests + pkg/recon/sources/codeberg.go, pkg/recon/sources/codeberg_test.go + + - Test A: Enabled always true (public API, token optional) + - Test B: Sweep queries /api/v1/repos/search?q=&limit=50 for each BuildQueries entry + - Test C: Decodes `{data:[{full_name,html_url}]}` and emits Finding with Source=html_url, SourceType="recon:codeberg", ProviderName from keywordIndex + - Test D: With token set, Authorization header is "token "; without token, header absent + - Test E: Ctx cancellation + - Test F: Unauth rate limit applied when Token empty (verified via RateLimit() return) + + + Create `pkg/recon/sources/codeberg.go`: + - Struct `CodebergSource { Token, BaseURL string; Registry *providers.Registry; Limiters *recon.LimiterRegistry; client *Client }` + - Default BaseURL: `https://codeberg.org` + - Name "codeberg", RespectsRobots false + - RateLimit(): if Token == "" return rate.Every(60*time.Second), else rate.Every(3600*time.Millisecond) + - Burst 1 + - Enabled always returns true + - Sweep: for each query, build `base + /api/v1/repos/search?q=&limit=50`, set Authorization only when Token set, client.Do, decode, emit Findings + - Compile-time assert + + Create `pkg/recon/sources/codeberg_test.go` with httptest server returning a + `{data:[...],ok:true}` body. Two test cases: with token (header present) and + without (header absent — use a flag inside the handler to capture). + + + cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run TestCodeberg -v -timeout 30s + + + CodebergSource implements ReconSource, tests green for both auth modes. + + + + + + +- `go test ./pkg/recon/sources/ -run TestCodeberg -v` + + + +RECON-CODE-05 satisfied. + + + +After completion, create `.planning/phases/10-osint-code-hosting/10-05-SUMMARY.md`. 
+ diff --git a/.planning/phases/10-osint-code-hosting/10-06-PLAN.md b/.planning/phases/10-osint-code-hosting/10-06-PLAN.md new file mode 100644 index 0000000..f26c010 --- /dev/null +++ b/.planning/phases/10-osint-code-hosting/10-06-PLAN.md @@ -0,0 +1,108 @@ +--- +phase: 10-osint-code-hosting +plan: 06 +type: execute +wave: 2 +depends_on: [10-01] +files_modified: + - pkg/recon/sources/huggingface.go + - pkg/recon/sources/huggingface_test.go +autonomous: true +requirements: [RECON-CODE-08] +must_haves: + truths: + - "HuggingFaceSource queries /api/spaces and /api/models search endpoints" + - "Token is optional — anonymous requests allowed at lower rate limit" + - "Findings have SourceType=\"recon:huggingface\" and Source = full HF URL" + artifacts: + - path: "pkg/recon/sources/huggingface.go" + provides: "HuggingFaceSource implementing recon.ReconSource" + key_links: + - from: "pkg/recon/sources/huggingface.go" + to: "pkg/recon/sources/httpclient.go" + via: "Client.Do" + pattern: "client\\.Do" +--- + + +Implement HuggingFaceSource scanning both Spaces and model repos via the HF Hub API. +Token optional; unauthenticated requests work but are rate-limited harder. + +Purpose: RECON-CODE-08. +Output: pkg/recon/sources/huggingface.go + tests. + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@.planning/phases/10-osint-code-hosting/10-CONTEXT.md +@.planning/phases/10-osint-code-hosting/10-01-SUMMARY.md +@pkg/recon/source.go +@pkg/recon/sources/httpclient.go + + +HuggingFace Hub API: + GET https://huggingface.co/api/spaces?search=&limit=50 + GET https://huggingface.co/api/models?search=&limit=50 + Response (either): array of { "id": "owner/name", "modelId"|"spaceId": "owner/name" } + Optional auth: Authorization: Bearer + +URL derivation: Source = "https://huggingface.co/spaces/" or ".../" for models. 
+ +Rate: 1000/hour authenticated → rate.Every(3600*time.Millisecond); unauth: rate.Every(10*time.Second), burst 1. + + + + + + + Task 1: HuggingFaceSource + tests + pkg/recon/sources/huggingface.go, pkg/recon/sources/huggingface_test.go + + - Test A: Enabled always true (token optional) + - Test B: Sweep hits both /api/spaces and /api/models endpoints for each query + - Test C: Decodes array of {id} and emits Findings with Source prefixed by "https://huggingface.co/spaces/" or "https://huggingface.co/" for models, SourceType="recon:huggingface" + - Test D: Authorization header present when token set, absent when empty + - Test E: Ctx cancellation respected + - Test F: RateLimit returns slower rate when token empty + + + Create `pkg/recon/sources/huggingface.go`: + - Struct `HuggingFaceSource { Token, BaseURL string; Registry *providers.Registry; Limiters *recon.LimiterRegistry; client *Client }` + - Default BaseURL: `https://huggingface.co` + - Name "huggingface", RespectsRobots false, Burst 1 + - RateLimit: token-dependent (see interfaces) + - Enabled always true + - Sweep: build keyword list, for each keyword iterate two endpoints + (`/api/spaces?search=&limit=50`, `/api/models?search=&limit=50`), emit + Findings. URL prefix differs per endpoint. + - Compile-time assert + + Create `pkg/recon/sources/huggingface_test.go` with httptest server that routes + both paths. Assert exact number of Findings (2 per keyword × number of keywords) + and URL prefixes. + + + cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run TestHuggingFace -v -timeout 30s + + + HuggingFaceSource passes tests covering both endpoints, token modes, cancellation. + + + + + + +- `go test ./pkg/recon/sources/ -run TestHuggingFace -v` + + + +RECON-CODE-08 satisfied. + + + +After completion, create `.planning/phases/10-osint-code-hosting/10-06-SUMMARY.md`. 
+ diff --git a/.planning/phases/10-osint-code-hosting/10-07-PLAN.md b/.planning/phases/10-osint-code-hosting/10-07-PLAN.md new file mode 100644 index 0000000..5a21bc0 --- /dev/null +++ b/.planning/phases/10-osint-code-hosting/10-07-PLAN.md @@ -0,0 +1,191 @@ +--- +phase: 10-osint-code-hosting +plan: 07 +type: execute +wave: 2 +depends_on: [10-01] +files_modified: + - pkg/recon/sources/replit.go + - pkg/recon/sources/replit_test.go + - pkg/recon/sources/codesandbox.go + - pkg/recon/sources/codesandbox_test.go + - pkg/recon/sources/sandboxes.go + - pkg/recon/sources/sandboxes_test.go +autonomous: true +requirements: [RECON-CODE-06, RECON-CODE-07, RECON-CODE-10] +must_haves: + truths: + - "ReplitSource scrapes replit.com search HTML and emits Findings tagged recon:replit" + - "CodeSandboxSource scrapes codesandbox.io search and emits Findings tagged recon:codesandbox" + - "SandboxesSource aggregates JSFiddle+CodePen+StackBlitz+Glitch+Observable+Gitpod with SourceType recon:sandboxes and sub-type in KeyMasked metadata slot" + - "All three RespectsRobots()==true and rate-limit conservatively (10/min)" + artifacts: + - path: "pkg/recon/sources/replit.go" + provides: "ReplitSource (scraper)" + - path: "pkg/recon/sources/codesandbox.go" + provides: "CodeSandboxSource (scraper)" + - path: "pkg/recon/sources/sandboxes.go" + provides: "SandboxesSource aggregator (JSFiddle, CodePen, StackBlitz, Glitch, Observable, Gitpod)" + key_links: + - from: "pkg/recon/sources/replit.go" + to: "pkg/recon/sources/httpclient.go" + via: "Client.Do on https://replit.com/search?q=..." + pattern: "client\\.Do" + - from: "pkg/recon/sources/sandboxes.go" + to: "pkg/recon/sources/httpclient.go" + via: "Client.Do on per-sandbox search URLs" + pattern: "client\\.Do" +--- + + +Implement three scraping-based sources for sandbox/IDE platforms without public +search APIs. All three honor robots.txt, use a conservative 10 req/min rate, and +emit Findings with best-effort HTML link extraction. 
+ +Purpose: RECON-CODE-06 (Replit), RECON-CODE-07 (CodeSandbox), RECON-CODE-10 +(CodePen/JSFiddle/StackBlitz/Glitch/Observable/Gitpod aggregator). +Output: 3 new ReconSource implementations + tests. + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@.planning/phases/10-osint-code-hosting/10-CONTEXT.md +@.planning/phases/10-osint-code-hosting/10-01-SUMMARY.md +@pkg/recon/source.go +@pkg/recon/robots.go +@pkg/recon/sources/httpclient.go + + +Scraping strategy (identical for all three sources in this plan): +1. Build per-provider keyword queries via BuildQueries (default format = bare keyword) +2. Fetch search URL via Client.Do (no auth headers) +3. Use a simple regex to extract result links from HTML (href="/@user/repl-name" + or href="/s/...") — use net/html parser for robustness +4. Emit one Finding per extracted link with SourceType="recon:" and Source=absolute URL +5. Return early on ctx cancellation + +Search URLs (approximations — confirm in action): +- Replit: https://replit.com/search?q=&type=repls +- CodeSandbox: https://codesandbox.io/search?query=&type=sandboxes +- CodePen: https://codepen.io/search/pens?q= +- JSFiddle: https://jsfiddle.net/api/search/?q= (returns JSON) +- StackBlitz: https://stackblitz.com/search?q= +- Glitch: https://glitch.com/api/search/projects?q= +- Observable: https://observablehq.com/search?query= +- Gitpod: https://www.gitpod.io/ (no public search; skip with log) + +All three sources set RespectsRobots()=true. Engine honors this via existing +pkg/recon/robots.go cache (caller coordinates RobotsCache check; not done here +because Phase 9 wires it at SweepAll level — if not, document TODO in code). + +Rate limits: all 10 req/min → rate.Every(6 * time.Second). Burst 1. 
+ + + + + + + Task 1: ReplitSource + CodeSandboxSource (scrapers) + pkg/recon/sources/replit.go, pkg/recon/sources/replit_test.go, pkg/recon/sources/codesandbox.go, pkg/recon/sources/codesandbox_test.go + + - Test A (each): Sweep fetches search URL for each keyword via httptest server + - Test B: HTML parsing extracts anchor hrefs matching expected result patterns (use golang.org/x/net/html) + - Test C: Each extracted link emitted as Finding with Source=absolute URL, SourceType="recon:replit" or "recon:codesandbox" + - Test D: RespectsRobots returns true + - Test E: Ctx cancellation respected + - Test F: Enabled always returns true (no auth) + + + Add `golang.org/x/net/html` to go.mod if not already (`go get golang.org/x/net/html`). + + Create `pkg/recon/sources/replit.go`: + - Struct `ReplitSource { BaseURL string; Registry *providers.Registry; Limiters *recon.LimiterRegistry; client *Client }` + - Default BaseURL: `https://replit.com` + - Name "replit", RateLimit rate.Every(6*time.Second), Burst 1, RespectsRobots true, Enabled always true + - Sweep: for each keyword from BuildQueries, GET `{base}/search?q={keyword}&type=repls`, parse HTML with `html.Parse`, walk DOM collecting `` matching regex `^/@[^/]+/[^/]+$` (repl URLs), emit Finding per absolute URL + - Compile-time assert + + Create `pkg/recon/sources/replit_test.go`: + - httptest server returning fixed HTML snippet with 2 matching anchors + 1 non-matching + - Assert exactly 2 Findings with correct absolute URLs + + Create `pkg/recon/sources/codesandbox.go` with same shape but: + - Default BaseURL `https://codesandbox.io` + - Name "codesandbox" + - Search URL: `{base}/search?query=&type=sandboxes` + - Link regex: `^/s/[a-zA-Z0-9-]+$` or `/p/sandbox/...` + - SourceType "recon:codesandbox" + + Create `pkg/recon/sources/codesandbox_test.go` analogous to replit_test.go. 
+ + + cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run "TestReplit|TestCodeSandbox" -v -timeout 30s + + + Both scrapers parse HTML, extract links, emit Findings; tests green. + + + + + Task 2: SandboxesSource aggregator (JSFiddle/CodePen/StackBlitz/Glitch/Observable/Gitpod) + pkg/recon/sources/sandboxes.go, pkg/recon/sources/sandboxes_test.go + + - Test A: Sweep iterates 6 sub-platforms for each keyword (via test override of Platforms slice) + - Test B: JSFiddle returns JSON → parsed into Findings (Source from result URLs) + - Test C: CodePen HTML → anchor extraction + - Test D: One failing sub-platform does NOT abort others (log-and-continue) + - Test E: SourceType = "recon:sandboxes"; sub-platform identifier goes into Confidence field or separate Platform map slot (use `KeyMasked` sentinel `platform=codepen` for now — pragmatic placeholder until a Metadata field exists) + - Test F: Ctx cancellation + + + Create `pkg/recon/sources/sandboxes.go`: + - Define `subPlatform` struct: `{ Name, SearchURL, ResultLinkRegex string; IsJSON bool; JSONItemsKey string }` + - Default Platforms: + ```go + var defaultPlatforms = []subPlatform{ + {Name: "codepen", SearchURL: "https://codepen.io/search/pens?q=%s", ResultLinkRegex: `^/[^/]+/pen/[a-zA-Z0-9]+`, IsJSON: false}, + {Name: "jsfiddle", SearchURL: "https://jsfiddle.net/api/search/?q=%s", IsJSON: true, JSONItemsKey: "results"}, + {Name: "stackblitz", SearchURL: "https://stackblitz.com/search?q=%s", ResultLinkRegex: `^/edit/[a-zA-Z0-9-]+`, IsJSON: false}, + {Name: "glitch", SearchURL: "https://glitch.com/api/search/projects?q=%s", IsJSON: true, JSONItemsKey: "results"}, + {Name: "observable", SearchURL: "https://observablehq.com/search?query=%s", ResultLinkRegex: `^/@[^/]+/[^/]+`, IsJSON: false}, + } + ``` + (Gitpod omitted — no public search; document in comment.) 
+ - Struct `SandboxesSource { Platforms []subPlatform; Registry *providers.Registry; Limiters *recon.LimiterRegistry; client *Client }` + - Name "sandboxes", RateLimit rate.Every(6*time.Second), Burst 1, RespectsRobots true, Enabled always true + - Sweep: for each platform, for each keyword, fetch URL, parse either JSON or HTML, emit Findings with Source=absolute URL and KeyMasked="platform="+p.Name + - On any per-platform error, log (use stdlib log package) and continue + + Create `pkg/recon/sources/sandboxes_test.go`: + - Spin up a single httptest server; override Platforms slice with 2 platforms + pointing at `/codepen-search` (HTML) and `/jsfiddle-search` (JSON) + - Assert Findings from both platforms emitted + - Failure test: one platform returns 500 → log-and-continue, other still emits + + + cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run TestSandboxes -v -timeout 30s + + + SandboxesSource iterates sub-platforms, handles HTML and JSON formats, tolerates + per-platform failure, emits Findings tagged with platform identifier. + + + + + + +- `go build ./...` +- `go test ./pkg/recon/sources/ -run "TestReplit|TestCodeSandbox|TestSandboxes" -v` + + + +RECON-CODE-06, RECON-CODE-07, RECON-CODE-10 satisfied. + + + +After completion, create `.planning/phases/10-osint-code-hosting/10-07-SUMMARY.md`. 
+ diff --git a/.planning/phases/10-osint-code-hosting/10-08-PLAN.md b/.planning/phases/10-osint-code-hosting/10-08-PLAN.md new file mode 100644 index 0000000..aab06c4 --- /dev/null +++ b/.planning/phases/10-osint-code-hosting/10-08-PLAN.md @@ -0,0 +1,109 @@ +--- +phase: 10-osint-code-hosting +plan: 08 +type: execute +wave: 2 +depends_on: [10-01] +files_modified: + - pkg/recon/sources/kaggle.go + - pkg/recon/sources/kaggle_test.go +autonomous: true +requirements: [RECON-CODE-09] +must_haves: + truths: + - "KaggleSource queries Kaggle public API /api/v1/kernels/list with Basic auth (username:key) and emits Findings" + - "Disabled when either KaggleUser or KaggleKey is empty" + - "Findings tagged recon:kaggle; Source = https://www.kaggle.com/code/" + artifacts: + - path: "pkg/recon/sources/kaggle.go" + provides: "KaggleSource implementing recon.ReconSource" + key_links: + - from: "pkg/recon/sources/kaggle.go" + to: "pkg/recon/sources/httpclient.go" + via: "Client.Do with req.SetBasicAuth(user, key)" + pattern: "SetBasicAuth" +--- + + +Implement KaggleSource querying Kaggle's public REST API for public notebooks +(kernels). Kaggle uses HTTP Basic auth (username + API key from kaggle.json). + +Purpose: RECON-CODE-09. +Output: pkg/recon/sources/kaggle.go + tests. + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@.planning/phases/10-osint-code-hosting/10-CONTEXT.md +@.planning/phases/10-osint-code-hosting/10-01-SUMMARY.md +@pkg/recon/source.go +@pkg/recon/sources/httpclient.go + + +Kaggle API (docs: https://www.kaggle.com/docs/api): + GET https://www.kaggle.com/api/v1/kernels/list?search=&pageSize=50 + Auth: HTTP Basic (username:key) + Response: array of { "ref": "owner/kernel-slug", "title": "...", "author": "..." } + URL derivation: https://www.kaggle.com/code/ + +Rate limit: 60/min → rate.Every(1*time.Second), burst 1. 
+ + + + + + + Task 1: KaggleSource + tests + pkg/recon/sources/kaggle.go, pkg/recon/sources/kaggle_test.go + + - Test A: Enabled false when User empty; false when Key empty; true when both set + - Test B: Sweep sets Basic auth header via req.SetBasicAuth(user, key) + - Test C: Decodes array of {ref} → Findings with Source = baseURL + "/code/" + ref, SourceType="recon:kaggle" + - Test D: 401 → ErrUnauthorized + - Test E: Ctx cancellation + - Test F: Missing creds → Sweep returns nil immediately (no HTTP calls, verified via counter=0) + + + Create `pkg/recon/sources/kaggle.go`: + - Struct `KaggleSource { User, Key, BaseURL, WebBaseURL string; Registry *providers.Registry; Limiters *recon.LimiterRegistry; client *Client }` + - Default BaseURL `https://www.kaggle.com`, WebBaseURL same + - Name "kaggle", RateLimit rate.Every(1*time.Second), Burst 1, RespectsRobots false + - Enabled = s.User != "" && s.Key != "" + - Sweep: for each query from BuildQueries(reg, "kaggle"), build + `{base}/api/v1/kernels/list?search=&pageSize=50`, call req.SetBasicAuth(User, Key), + client.Do, decode `[]struct{ Ref string "json:ref" }`, emit Findings + - Compile-time assert + + Create `pkg/recon/sources/kaggle_test.go`: + - httptest server that validates Authorization header starts with "Basic " and + decodes to "testuser:testkey" + - Returns JSON array with 2 refs + - Assert 2 Findings with expected Source URLs + - Missing-creds test: Sweep returns nil, handler never called (use atomic counter) + - 401 and cancellation tests + + + cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run TestKaggle -v -timeout 30s + + + KaggleSource passes all tests, implements ReconSource. + + + + + + +- `go test ./pkg/recon/sources/ -run TestKaggle -v` + + + +RECON-CODE-09 satisfied. + + + +After completion, create `.planning/phases/10-osint-code-hosting/10-08-SUMMARY.md`. 
+ diff --git a/.planning/phases/10-osint-code-hosting/10-09-PLAN.md b/.planning/phases/10-osint-code-hosting/10-09-PLAN.md new file mode 100644 index 0000000..984ccde --- /dev/null +++ b/.planning/phases/10-osint-code-hosting/10-09-PLAN.md @@ -0,0 +1,227 @@ +--- +phase: 10-osint-code-hosting +plan: 09 +type: execute +wave: 3 +depends_on: [10-01, 10-02, 10-03, 10-04, 10-05, 10-06, 10-07, 10-08] +files_modified: + - pkg/recon/sources/register.go + - pkg/recon/sources/register_test.go + - pkg/recon/sources/integration_test.go + - cmd/recon.go +autonomous: true +requirements: [] +must_haves: + truths: + - "RegisterAll wires all 10 Phase 10 sources onto a recon.Engine" + - "cmd/recon.go buildReconEngine() reads viper config + env vars for tokens and calls RegisterAll" + - "Integration test spins up httptest servers for all sources, runs SweepAll via Engine, asserts Findings from each source arrive with correct SourceType" + - "Guardrail: enabling a source without its required credential logs a skip but does not error" + artifacts: + - path: "pkg/recon/sources/register.go" + provides: "RegisterAll with 10 source constructors wired" + contains: "engine.Register" + - path: "pkg/recon/sources/integration_test.go" + provides: "End-to-end SweepAll test with httptest fixtures for every source" + - path: "cmd/recon.go" + provides: "CLI reads config and invokes sources.RegisterAll" + key_links: + - from: "cmd/recon.go" + to: "pkg/recon/sources.RegisterAll" + via: "sources.RegisterAll(eng, cfg)" + pattern: "sources\\.RegisterAll" + - from: "pkg/recon/sources/register.go" + to: "pkg/recon.Engine.Register" + via: "engine.Register(source)" + pattern: "engine\\.Register" +--- + + +Final Wave 3 plan: wire every Phase 10 source into `sources.RegisterAll`, update +`cmd/recon.go` to construct a real `SourcesConfig` from viper/env, and add an +end-to-end integration test that drives all 10 sources through recon.Engine.SweepAll +using httptest fixtures. 
+ +Purpose: Users can run `keyhunter recon full --sources=github,gitlab,...` and get +actual findings from any Phase 10 source whose credential is configured. +Output: Wired register.go + cmd/recon.go + passing integration test. + + + +@$HOME/.claude/get-shit-done/workflows/execute-plan.md +@$HOME/.claude/get-shit-done/templates/summary.md + + + +@.planning/phases/10-osint-code-hosting/10-CONTEXT.md +@.planning/phases/10-osint-code-hosting/10-01-SUMMARY.md +@.planning/phases/10-osint-code-hosting/10-02-SUMMARY.md +@.planning/phases/10-osint-code-hosting/10-03-SUMMARY.md +@.planning/phases/10-osint-code-hosting/10-04-SUMMARY.md +@.planning/phases/10-osint-code-hosting/10-05-SUMMARY.md +@.planning/phases/10-osint-code-hosting/10-06-SUMMARY.md +@.planning/phases/10-osint-code-hosting/10-07-SUMMARY.md +@.planning/phases/10-osint-code-hosting/10-08-SUMMARY.md +@pkg/recon/engine.go +@pkg/recon/source.go +@pkg/providers/registry.go +@cmd/recon.go + + +After Wave 2, each source file in pkg/recon/sources/ exports a constructor +roughly of the form: + func NewGitHubSource(token, reg, lim) *GitHubSource + func NewGitLabSource(token, reg, lim) *GitLabSource + func NewBitbucketSource(token, workspace, reg, lim) *BitbucketSource + func NewGistSource(token, reg, lim) *GistSource + func NewCodebergSource(token, reg, lim) *CodebergSource + func NewHuggingFaceSource(token, reg, lim) *HuggingFaceSource + func NewReplitSource(reg, lim) *ReplitSource + func NewCodeSandboxSource(reg, lim) *CodeSandboxSource + func NewSandboxesSource(reg, lim) *SandboxesSource + func NewKaggleSource(user, key, reg, lim) *KaggleSource + +(Verify actual signatures when reading Wave 2 SUMMARYs before writing register.go.) 
+ + + + + + + Task 1: Wire RegisterAll + register_test.go + pkg/recon/sources/register.go, pkg/recon/sources/register_test.go + + - Test A: RegisterAll with a fresh engine and empty SourcesConfig registers all 10 sources by name (GitHub/GitLab/Bitbucket/Gist/Codeberg/HuggingFace/Replit/CodeSandbox/Sandboxes/Kaggle) + - Test B: engine.List() returns all 10 source names in sorted order + - Test C: Calling RegisterAll(nil, cfg) is a no-op (no panic) + - Test D: Sources without creds are still registered but their Enabled() returns false + + + Rewrite `pkg/recon/sources/register.go` RegisterAll body to construct each + source with appropriate fields from SourcesConfig and call engine.Register: + ```go + func RegisterAll(engine *recon.Engine, cfg SourcesConfig) { + if engine == nil { return } + reg := cfg.Registry + lim := cfg.Limiters + engine.Register(NewGitHubSource(cfg.GitHubToken, reg, lim)) + engine.Register(NewGitLabSource(cfg.GitLabToken, reg, lim)) + engine.Register(NewBitbucketSource(cfg.BitbucketToken, cfg.BitbucketWorkspace, reg, lim)) + engine.Register(NewGistSource(cfg.GitHubToken, reg, lim)) + engine.Register(NewCodebergSource(cfg.CodebergToken, reg, lim)) + engine.Register(NewHuggingFaceSource(cfg.HuggingFaceToken, reg, lim)) + engine.Register(NewReplitSource(reg, lim)) + engine.Register(NewCodeSandboxSource(reg, lim)) + engine.Register(NewSandboxesSource(reg, lim)) + engine.Register(NewKaggleSource(cfg.KaggleUser, cfg.KaggleKey, reg, lim)) + } + ``` + + Extend SourcesConfig with any fields Wave 2 introduced (BitbucketWorkspace, + CodebergToken). Adjust field names to actual Wave 2 SUMMARY signatures. 
+ + Create `pkg/recon/sources/register_test.go`: + - Build minimal registry via providers.NewRegistryFromProviders with 1 synthetic provider + - Build recon.Engine, call RegisterAll with cfg having all creds empty + - Assert eng.List() returns exactly these 10 names: + bitbucket, codeberg, codesandbox, gist, github, gitlab, huggingface, kaggle, replit, sandboxes + - Assert nil engine call is no-op (no panic) + + + cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run TestRegisterAll -v -timeout 30s + + + RegisterAll wires all 10 sources; registry_test green. + + + + + Task 2: Integration test across all sources + cmd/recon.go wiring + pkg/recon/sources/integration_test.go, cmd/recon.go + + - Integration test: spins up 10 httptest servers (or one multiplexed server with per-path routing) that return canned responses for each source's endpoints + - Uses BaseURL overrides on each source (direct construction, not RegisterAll, since RegisterAll uses production URLs) + - Registers each override-configured source on a fresh recon.Engine and calls SweepAll + - Asserts at least 1 Finding emerged for each of the 10 SourceType values: recon:github, recon:gitlab, recon:bitbucket, recon:gist, recon:codeberg, recon:huggingface, recon:replit, recon:codesandbox, recon:sandboxes, recon:kaggle + - CLI: `keyhunter recon list` (after wiring) prints all 10 source names in addition to "example" + + + Create `pkg/recon/sources/integration_test.go`: + - Build a single httptest server with a mux routing per-path: + `/search/code` (github) → ghSearchResponse JSON + `/api/v4/search` (gitlab) → blob array JSON + `/2.0/workspaces/ws/search/code` (bitbucket) → values JSON + `/gists/public` + `/raw/gist1` (gist) → gist list + raw matching keyword + `/api/v1/repos/search` (codeberg) → data array + `/api/spaces`, `/api/models` (huggingface) → id arrays + `/search?q=...&type=repls` (replit) → HTML fixture + `/search?query=...&type=sandboxes` (codesandbox) → HTML fixture + 
`/codepen-search` (sandboxes sub) → HTML; `/jsfiddle-search` → JSON + `/api/v1/kernels/list` (kaggle) → ref array + - For each source, construct with BaseURL/Platforms overrides pointing at test server + - Register all on a fresh recon.Engine + - Provide synthetic providers.Registry with keyword "sk-proj-" matching openai + - Call eng.SweepAll(ctx, recon.Config{Query:"ignored"}) + - Assert findings grouped by SourceType covers all 10 expected values + - Use a 30s test timeout + + Update `cmd/recon.go`: + - Import `github.com/salvacybersec/keyhunter/pkg/recon/sources`, `github.com/spf13/viper`, and the providers package + - In `buildReconEngine()`: + ```go + func buildReconEngine() *recon.Engine { + e := recon.NewEngine() + e.Register(recon.ExampleSource{}) + reg, err := providers.NewRegistry() + if err != nil { + fmt.Fprintf(os.Stderr, "recon: failed to load providers: %v\n", err) + return e + } + cfg := sources.SourcesConfig{ + Registry: reg, + Limiters: recon.NewLimiterRegistry(), + GitHubToken: firstNonEmpty(os.Getenv("GITHUB_TOKEN"), viper.GetString("recon.github.token")), + GitLabToken: firstNonEmpty(os.Getenv("GITLAB_TOKEN"), viper.GetString("recon.gitlab.token")), + BitbucketToken: firstNonEmpty(os.Getenv("BITBUCKET_TOKEN"), viper.GetString("recon.bitbucket.token")), + BitbucketWorkspace: viper.GetString("recon.bitbucket.workspace"), + CodebergToken: firstNonEmpty(os.Getenv("CODEBERG_TOKEN"), viper.GetString("recon.codeberg.token")), + HuggingFaceToken: firstNonEmpty(os.Getenv("HUGGINGFACE_TOKEN"), viper.GetString("recon.huggingface.token")), + KaggleUser: firstNonEmpty(os.Getenv("KAGGLE_USERNAME"), viper.GetString("recon.kaggle.username")), + KaggleKey: firstNonEmpty(os.Getenv("KAGGLE_KEY"), viper.GetString("recon.kaggle.key")), + } + sources.RegisterAll(e, cfg) + return e + } + + func firstNonEmpty(a, b string) string { if a != "" { return a }; return b } + ``` + - Preserve existing reconFullCmd / reconListCmd behavior. 
+ + + cd /home/salva/Documents/apikey && go test ./pkg/recon/sources/ -run TestIntegration -v -timeout 60s && go build ./... && go run . recon list | sort + + + Integration test passes with at least one Finding per SourceType across all 10 + sources. `keyhunter recon list` prints all 10 source names plus "example". + + + + + + +- `go build ./...` +- `go vet ./...` +- `go test ./pkg/recon/sources/... -v -timeout 60s` +- `go test ./pkg/recon/... -timeout 60s` (ensure no regression in Phase 9 recon tests) +- `go run . recon list` prints all 10 new source names + + + +All Phase 10 code hosting sources registered via sources.RegisterAll, wired into +cmd/recon.go, and exercised end-to-end by an integration test hitting httptest +fixtures for every source. Phase 10 requirements RECON-CODE-01..10 complete. + + + +After completion, create `.planning/phases/10-osint-code-hosting/10-09-SUMMARY.md`. +