package sources import ( "context" "encoding/json" "fmt" "net/http" "net/http/httptest" "testing" "time" "github.com/salvacybersec/keyhunter/pkg/providers" "github.com/salvacybersec/keyhunter/pkg/recon" ) // TestIntegration_AllSources_SweepAll spins up a single multiplexed httptest // server that serves canned fixtures for every Phase 10 code-hosting source // and Phase 11 search engine / paste site source, registers the sources (with // BaseURL overrides pointing at the test server) onto a fresh recon.Engine, // runs SweepAll, and asserts at least one Finding was emitted per SourceType // across all 18 sources. // // RegisterAll cannot be used directly because it wires production URLs; the // test exercises the same code paths by constructing each source identically // to RegisterAll but with BaseURL/Platforms overrides. func TestIntegration_AllSources_SweepAll(t *testing.T) { mux := http.NewServeMux() // ---- GitHub /search/code ---- mux.HandleFunc("/search/code", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") _ = json.NewEncoder(w).Encode(ghSearchResponse{ Items: []ghCodeItem{ {HTMLURL: "https://github.com/alice/leak/blob/main/.env"}, }, }) }) // ---- GitLab /api/v4/search ---- mux.HandleFunc("/api/v4/search", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") _, _ = w.Write([]byte(`[{"basename":"keys","data":"sk-proj-abc","path":"keys.env","project_id":42,"ref":"main","startline":1}]`)) }) // ---- Bitbucket /2.0/workspaces//search/code ---- mux.HandleFunc("/2.0/workspaces/kh-test/search/code", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") _, _ = w.Write([]byte(`{"values":[{"content_match_count":1,"page_url":"https://bitbucket.org/kh-test/repo/src/main/keys.env","file":{"path":"keys.env","commit":{"hash":"deadbeef"}}}]}`)) }) // ---- Gist /gists/public + raw content ---- mux.HandleFunc("/gists/public", func(w 
http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") body := fmt.Sprintf(`[{"html_url":"https://gist.github.com/alice/gistleak","files":{"f.py":{"filename":"f.py","raw_url":"%s/raw/gist1"}}}]`, baseFromReq(r)) _, _ = w.Write([]byte(body)) }) mux.HandleFunc("/raw/gist1", func(w http.ResponseWriter, r *http.Request) { _, _ = w.Write([]byte("api_key = sk-proj-ABCDEF")) }) // ---- Codeberg /api/v1/repos/search ---- mux.HandleFunc("/api/v1/repos/search", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") _, _ = w.Write([]byte(`{"ok":true,"data":[{"full_name":"bob/keys","html_url":"https://codeberg.org/bob/keys"}]}`)) }) // ---- HuggingFace /api/spaces + /api/models ---- hfHandler := func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") _, _ = w.Write([]byte(`[{"id":"alice/leaky-space"}]`)) } mux.HandleFunc("/api/spaces", hfHandler) mux.HandleFunc("/api/models", hfHandler) // ---- Replit /search?q=...&type=repls (HTML) ---- // ---- CodeSandbox /search?query=...&type=sandboxes (HTML) ---- // Both hit the same /search path; distinguish on query params. 
mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/html") switch r.URL.Query().Get("type") { case "repls": _, _ = w.Write([]byte(` hit skip `)) case "sandboxes": _, _ = w.Write([]byte(` hit skip `)) default: w.WriteHeader(http.StatusNotFound) } }) // ---- SandboxesSource sub-platforms ---- mux.HandleFunc("/codepen-search", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/html") _, _ = w.Write([]byte(`hit`)) }) mux.HandleFunc("/jsfiddle-search", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") _, _ = w.Write([]byte(`{"results":[{"url":"https://jsfiddle.net/u/leaky/"}]}`)) }) // ---- Kaggle /api/v1/kernels/list ---- mux.HandleFunc("/api/v1/kernels/list", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") _, _ = w.Write([]byte(`[{"ref":"alice/leaky-notebook"}]`)) }) // ---- Phase 11: Google Custom Search /customsearch/v1 ---- mux.HandleFunc("/customsearch/v1", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") _, _ = w.Write([]byte(`{"items":[{"link":"https://pastebin.com/abc123","title":"leak","snippet":"sk-proj-xxx"}]}`)) }) // ---- Phase 11: Bing /v7.0/search ---- mux.HandleFunc("/v7.0/search", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") _, _ = w.Write([]byte(`{"webPages":{"value":[{"url":"https://example.com/bing-leak","name":"leak"}]}}`)) }) // ---- Phase 11: DuckDuckGo /html/ ---- mux.HandleFunc("/html/", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/html") _, _ = w.Write([]byte(`result`)) }) // ---- Phase 11: Yandex /search/xml ---- mux.HandleFunc("/search/xml", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/xml") _, _ = w.Write([]byte(` https://example.com/yandex-leak`)) }) // ---- Phase 11: 
Brave /res/v1/web/search ---- mux.HandleFunc("/res/v1/web/search", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") _, _ = w.Write([]byte(`{"web":{"results":[{"url":"https://example.com/brave-leak","title":"leak"}]}}`)) }) // ---- Phase 11: Pastebin (routed under /pb/ prefix) ---- mux.HandleFunc("/pb/search", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/html") _, _ = w.Write([]byte(`paste1`)) }) mux.HandleFunc("/pb/raw/AbCdEf12", func(w http.ResponseWriter, r *http.Request) { _, _ = w.Write([]byte("leaked key: sk-proj-PASTEBIN123")) }) // ---- Phase 11: GistPaste (routed under /gp/ prefix) ---- mux.HandleFunc("/gp/search", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/html") _, _ = w.Write([]byte(`gist1`)) }) mux.HandleFunc("/gp/alice/deadbeef01/raw", func(w http.ResponseWriter, r *http.Request) { _, _ = w.Write([]byte("leaked: sk-proj-GISTPASTE456")) }) // ---- Phase 11: PasteSites sub-platforms ---- mux.HandleFunc("/paste-search", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/html") _, _ = w.Write([]byte(`paste`)) }) mux.HandleFunc("/paste-raw/aB3xZ9", func(w http.ResponseWriter, r *http.Request) { _, _ = w.Write([]byte("secret: sk-proj-PASTESITES789")) }) srv := httptest.NewServer(mux) defer srv.Close() reg := providers.NewRegistryFromProviders([]providers.Provider{ {Name: "openai", Keywords: []string{"sk-proj-"}}, }) lim := recon.NewLimiterRegistry() eng := recon.NewEngine() // --- Phase 10 sources --- // GitHub -- token + BaseURL override. Use the real constructor so `client` // is initialized, then retarget BaseURL at the test server. 
ghs := NewGitHubSource("ghp-test", reg, lim) ghs.BaseURL = srv.URL eng.Register(ghs) // GitLab eng.Register(&GitLabSource{ Token: "glpat-test", BaseURL: srv.URL, Registry: reg, Limiters: lim, }) // Bitbucket eng.Register(&BitbucketSource{ Token: "bb-test", Workspace: "kh-test", BaseURL: srv.URL, Registry: reg, Limiters: lim, }) // Gist -- uses same BaseURL for /gists/public; raw URLs are absolute in fixture. eng.Register(&GistSource{ Token: "ghp-test", BaseURL: srv.URL, Registry: reg, Limiters: lim, }) // Codeberg eng.Register(&CodebergSource{ BaseURL: srv.URL, Registry: reg, Limiters: lim, }) // HuggingFace eng.Register(NewHuggingFaceSource(HuggingFaceConfig{ BaseURL: srv.URL, Registry: reg, Limiters: lim, })) // Replit eng.Register(&ReplitSource{ BaseURL: srv.URL, Registry: reg, Limiters: lim, }) // CodeSandbox eng.Register(&CodeSandboxSource{ BaseURL: srv.URL, Registry: reg, Limiters: lim, }) // Sandboxes -- inject test sub-platforms that hit srv.URL. eng.Register(&SandboxesSource{ Platforms: []subPlatform{ {Name: "codepen", SearchPath: "/codepen-search?q=%s", ResultLinkRegex: `^/[^/]+/pen/[a-zA-Z0-9]+$`, IsJSON: false}, {Name: "jsfiddle", SearchPath: "/jsfiddle-search?q=%s", IsJSON: true, JSONItemsKey: "results", JSONURLKey: "url"}, }, Registry: reg, Limiters: lim, Client: NewClient(), BaseURL: srv.URL, }) // Kaggle eng.Register(&KaggleSource{ User: "kh-user", Key: "kh-key", BaseURL: srv.URL, WebBaseURL: "https://www.kaggle.com", Registry: reg, Limiters: lim, client: NewClient(), }) // --- Phase 11 sources --- // Google Custom Search gs := NewGoogleDorkSource("test-api-key", "test-cx", reg, lim) gs.BaseURL = srv.URL eng.Register(gs) // Bing bs := NewBingDorkSource("test-bing-key", reg, lim) bs.BaseURL = srv.URL eng.Register(bs) // DuckDuckGo ddg := NewDuckDuckGoSource(reg, lim) ddg.BaseURL = srv.URL eng.Register(ddg) // Yandex ys := NewYandexSource("test-user", "test-key", reg, lim) ys.BaseURL = srv.URL eng.Register(ys) // Brave brs := 
NewBraveSource("test-brave-key", reg, lim) brs.BaseURL = srv.URL eng.Register(brs) // Pastebin -- uses /pb/ prefix to avoid /search collision eng.Register(&PastebinSource{ BaseURL: srv.URL + "/pb", Registry: reg, Limiters: lim, Client: NewClient(), }) // GistPaste -- uses /gp/ prefix eng.Register(&GistPasteSource{ BaseURL: srv.URL + "/gp", Registry: reg, Limiters: lim, Client: NewClient(), }) // PasteSites -- inject test sub-platform eng.Register(&PasteSitesSource{ Platforms: []pastePlatform{ { Name: "testpaste", SearchPath: "/paste-search?q=%s", ResultLinkRegex: `^/[a-zA-Z0-9]+$`, RawPathTemplate: "/paste-raw%s", }, }, Registry: reg, Limiters: lim, Client: NewClient(), BaseURL: srv.URL, }) // Sanity: all 18 sources registered. if n := len(eng.List()); n != 18 { t.Fatalf("expected 18 sources on engine, got %d: %v", n, eng.List()) } ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) defer cancel() findings, err := eng.SweepAll(ctx, recon.Config{Query: "ignored"}) if err != nil { t.Fatalf("SweepAll returned error: %v", err) } // Group findings by SourceType and assert every expected bucket is present. byType := make(map[string]int) for _, f := range findings { byType[f.SourceType]++ } wantTypes := []string{ // Phase 10 "recon:github", "recon:gitlab", "recon:bitbucket", "recon:gist", "recon:codeberg", "recon:huggingface", "recon:replit", "recon:codesandbox", "recon:sandboxes", "recon:kaggle", // Phase 11 "recon:google", "recon:bing", "recon:duckduckgo", "recon:yandex", "recon:brave", "recon:pastebin", "recon:gistpaste", "recon:pastesites", } for _, st := range wantTypes { if byType[st] == 0 { t.Errorf("expected at least one finding with SourceType=%q, got none\nall findings: %+v", st, findings) } } } // baseFromReq reconstructs the scheme+host of the inbound request so handlers // can build absolute raw URLs pointing back at the same httptest server. 
func baseFromReq(r *http.Request) string {
	// A nil TLS field means the request did not arrive over TLS, so the
	// scheme is plain http; otherwise it is https. The host (including any
	// port) is taken verbatim from the request.
	if r.TLS == nil {
		return "http://" + r.Host
	}
	return "https://" + r.Host
}