diff --git a/.planning/phases/10-osint-code-hosting/deferred-items.md b/.planning/phases/10-osint-code-hosting/deferred-items.md new file mode 100644 index 0000000..ee093bd --- /dev/null +++ b/.planning/phases/10-osint-code-hosting/deferred-items.md @@ -0,0 +1,13 @@ +# Phase 10 — Deferred Items + +Out-of-scope findings discovered during plan execution. These are NOT fixed in +the current plan but are tracked here for future work. + +## 10-09 + +- **GitHubSource struct-literal panic risk.** `GitHubSource.Sweep` dereferences + `s.client` without a nil check (pkg/recon/sources/github.go:106). `NewGitHubSource` + initializes `client`, so `RegisterAll` is safe, but any future caller using a + struct literal (as sibling sources do) will panic. Fix: add + `if s.client == nil { s.client = NewClient() }` at the top of Sweep. Siblings + (GitLab, Bitbucket, Gist, Codeberg, HuggingFace, Kaggle) already lazy-init. diff --git a/pkg/recon/sources/integration_test.go b/pkg/recon/sources/integration_test.go new file mode 100644 index 0000000..7cd0285 --- /dev/null +++ b/pkg/recon/sources/integration_test.go @@ -0,0 +1,240 @@ +package sources + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +// TestIntegration_AllSources_SweepAll spins up a single multiplexed httptest +// server that serves canned fixtures for every Phase 10 code-hosting source, +// registers the sources (with BaseURL overrides pointing at the test server) +// onto a fresh recon.Engine, runs SweepAll, and asserts at least one Finding +// was emitted per SourceType across all ten sources. +// +// RegisterAll cannot be used directly because it wires production URLs; the +// test exercises the same code paths by constructing each source identically +// to RegisterAll but with BaseURL/Platforms overrides. +func TestIntegration_AllSources_SweepAll(t *testing.T) { + mux := http.NewServeMux() + + // ---- GitHub /search/code ---- + mux.HandleFunc("/search/code", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(ghSearchResponse{ + Items: []ghCodeItem{ + {HTMLURL: "https://github.com/alice/leak/blob/main/.env"}, + }, + }) + }) + + // ---- GitLab /api/v4/search ---- + mux.HandleFunc("/api/v4/search", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`[{"basename":"keys","data":"sk-proj-abc","path":"keys.env","project_id":42,"ref":"main","startline":1}]`)) + }) + + // ---- Bitbucket /2.0/workspaces//search/code ---- + mux.HandleFunc("/2.0/workspaces/kh-test/search/code", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"values":[{"content_match_count":1,"page_url":"https://bitbucket.org/kh-test/repo/src/main/keys.env","file":{"path":"keys.env","commit":{"hash":"deadbeef"}}}]}`)) + }) + + // ---- Gist /gists/public + raw content ---- + mux.HandleFunc("/gists/public", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + body := fmt.Sprintf(`[{"html_url":"https://gist.github.com/alice/gistleak","files":{"f.py":{"filename":"f.py","raw_url":"%s/raw/gist1"}}}]`, baseFromReq(r)) + _, _ = w.Write([]byte(body)) + }) + mux.HandleFunc("/raw/gist1", func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write([]byte("api_key = sk-proj-ABCDEF")) + }) + + // ---- Codeberg /api/v1/repos/search ---- + mux.HandleFunc("/api/v1/repos/search", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"ok":true,"data":[{"full_name":"bob/keys","html_url":"https://codeberg.org/bob/keys"}]}`)) + }) + + // ---- HuggingFace /api/spaces + /api/models ---- + hfHandler := func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`[{"id":"alice/leaky-space"}]`)) + } + mux.HandleFunc("/api/spaces", hfHandler) + mux.HandleFunc("/api/models", hfHandler) + + // ---- Replit /search?q=...&type=repls (HTML) ---- + // ---- CodeSandbox /search?query=...&type=sandboxes (HTML) ---- + // Both hit the same /search path; distinguish on query params. + mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + switch r.URL.Query().Get("type") { + case "repls": + _, _ = w.Write([]byte(` + hit + skip + `)) + case "sandboxes": + _, _ = w.Write([]byte(` + hit + skip + `)) + default: + w.WriteHeader(http.StatusNotFound) + } + }) + + // ---- SandboxesSource sub-platforms ---- + mux.HandleFunc("/codepen-search", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(`hit`)) + }) + mux.HandleFunc("/jsfiddle-search", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"results":[{"url":"https://jsfiddle.net/u/leaky/"}]}`)) + }) + + // ---- Kaggle /api/v1/kernels/list ---- + mux.HandleFunc("/api/v1/kernels/list", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`[{"ref":"alice/leaky-notebook"}]`)) + }) + + srv := httptest.NewServer(mux) + defer srv.Close() + + reg := providers.NewRegistryFromProviders([]providers.Provider{ + {Name: "openai", Keywords: []string{"sk-proj-"}}, + }) + lim := recon.NewLimiterRegistry() + + eng := recon.NewEngine() + + // GitHub — token + BaseURL override. Use the real constructor so `client` + // is initialized, then retarget BaseURL at the test server. + ghs := NewGitHubSource("ghp-test", reg, lim) + ghs.BaseURL = srv.URL + eng.Register(ghs) + // GitLab + eng.Register(&GitLabSource{ + Token: "glpat-test", + BaseURL: srv.URL, + Registry: reg, + Limiters: lim, + }) + // Bitbucket + eng.Register(&BitbucketSource{ + Token: "bb-test", + Workspace: "kh-test", + BaseURL: srv.URL, + Registry: reg, + Limiters: lim, + }) + // Gist — uses same BaseURL for /gists/public; raw URLs are absolute in fixture. + eng.Register(&GistSource{ + Token: "ghp-test", + BaseURL: srv.URL, + Registry: reg, + Limiters: lim, + }) + // Codeberg + eng.Register(&CodebergSource{ + BaseURL: srv.URL, + Registry: reg, + Limiters: lim, + }) + // HuggingFace + eng.Register(NewHuggingFaceSource(HuggingFaceConfig{ + BaseURL: srv.URL, + Registry: reg, + Limiters: lim, + })) + // Replit + eng.Register(&ReplitSource{ + BaseURL: srv.URL, + Registry: reg, + Limiters: lim, + }) + // CodeSandbox + eng.Register(&CodeSandboxSource{ + BaseURL: srv.URL, + Registry: reg, + Limiters: lim, + }) + // Sandboxes — inject test sub-platforms that hit srv.URL. + eng.Register(&SandboxesSource{ + Platforms: []subPlatform{ + {Name: "codepen", SearchPath: "/codepen-search?q=%s", ResultLinkRegex: `^/[^/]+/pen/[a-zA-Z0-9]+$`, IsJSON: false}, + {Name: "jsfiddle", SearchPath: "/jsfiddle-search?q=%s", IsJSON: true, JSONItemsKey: "results", JSONURLKey: "url"}, + }, + Registry: reg, + Limiters: lim, + Client: NewClient(), + BaseURL: srv.URL, + }) + // Kaggle + eng.Register(&KaggleSource{ + User: "kh-user", + Key: "kh-key", + BaseURL: srv.URL, + WebBaseURL: "https://www.kaggle.com", + Registry: reg, + Limiters: lim, + client: NewClient(), + }) + + // Sanity: all 10 sources registered. + if n := len(eng.List()); n != 10 { + t.Fatalf("expected 10 sources on engine, got %d: %v", n, eng.List()) + } + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + findings, err := eng.SweepAll(ctx, recon.Config{Query: "ignored"}) + if err != nil { + t.Fatalf("SweepAll returned error: %v", err) + } + + // Group findings by SourceType and assert every expected bucket is present. + byType := make(map[string]int) + for _, f := range findings { + byType[f.SourceType]++ + } + + wantTypes := []string{ + "recon:github", + "recon:gitlab", + "recon:bitbucket", + "recon:gist", + "recon:codeberg", + "recon:huggingface", + "recon:replit", + "recon:codesandbox", + "recon:sandboxes", + "recon:kaggle", + } + for _, st := range wantTypes { + if byType[st] == 0 { + t.Errorf("expected at least one finding with SourceType=%q, got none\nall findings: %+v", st, findings) + } + } +} + +// baseFromReq reconstructs the scheme+host of the inbound request so handlers +// can build absolute raw URLs pointing back at the same httptest server. +func baseFromReq(r *http.Request) string { + scheme := "http" + if r.TLS != nil { + scheme = "https" + } + return scheme + "://" + r.Host +}