Files
keyhunter/pkg/recon/sources/integration_test.go

241 lines
7.6 KiB
Go

package sources
import (
"context"
"encoding/json"
"fmt"
"net/http"
"net/http/httptest"
"testing"
"time"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
// TestIntegration_AllSources_SweepAll is an end-to-end check of the Phase 10
// code-hosting sources. One httptest server, backed by a single mux, answers
// every source with a canned fixture; each source is built with its BaseURL
// (or Platforms) pointed at that server and registered on a fresh
// recon.Engine. After SweepAll, the test requires at least one Finding for
// each of the ten expected SourceType values.
//
// RegisterAll is not used because it wires production URLs; the sources are
// instead constructed here the same way RegisterAll does, only with the
// BaseURL/Platforms overrides.
func TestIntegration_AllSources_SweepAll(t *testing.T) {
	const (
		jsonType = "application/json"
		htmlType = "text/html"
	)

	// fixture builds a handler that always answers with the given
	// Content-Type and body, regardless of the request.
	fixture := func(contentType, body string) func(http.ResponseWriter, *http.Request) {
		return func(w http.ResponseWriter, _ *http.Request) {
			w.Header().Set("Content-Type", contentType)
			_, _ = w.Write([]byte(body))
		}
	}

	router := http.NewServeMux()

	// GitHub: /search/code
	router.HandleFunc("/search/code", func(w http.ResponseWriter, _ *http.Request) {
		payload := ghSearchResponse{
			Items: []ghCodeItem{
				{HTMLURL: "https://github.com/alice/leak/blob/main/.env"},
			},
		}
		w.Header().Set("Content-Type", jsonType)
		_ = json.NewEncoder(w).Encode(payload)
	})

	// GitLab: /api/v4/search
	router.HandleFunc("/api/v4/search", fixture(jsonType,
		`[{"basename":"keys","data":"sk-proj-abc","path":"keys.env","project_id":42,"ref":"main","startline":1}]`))

	// Bitbucket: /2.0/workspaces/<ws>/search/code
	router.HandleFunc("/2.0/workspaces/kh-test/search/code", fixture(jsonType,
		`{"values":[{"content_match_count":1,"page_url":"https://bitbucket.org/kh-test/repo/src/main/keys.env","file":{"path":"keys.env","commit":{"hash":"deadbeef"}}}]}`))

	// Gist: /gists/public lists one gist whose raw_url points back at this
	// same server, where /raw/gist1 serves the file content.
	router.HandleFunc("/gists/public", func(w http.ResponseWriter, r *http.Request) {
		const listing = `[{"html_url":"https://gist.github.com/alice/gistleak","files":{"f.py":{"filename":"f.py","raw_url":"%s/raw/gist1"}}}]`
		w.Header().Set("Content-Type", jsonType)
		_, _ = fmt.Fprintf(w, listing, baseFromReq(r))
	})
	router.HandleFunc("/raw/gist1", func(w http.ResponseWriter, _ *http.Request) {
		// No explicit Content-Type here, matching the other raw fixture-free path.
		_, _ = w.Write([]byte("api_key = sk-proj-ABCDEF"))
	})

	// Codeberg: /api/v1/repos/search
	router.HandleFunc("/api/v1/repos/search", fixture(jsonType,
		`{"ok":true,"data":[{"full_name":"bob/keys","html_url":"https://codeberg.org/bob/keys"}]}`))

	// HuggingFace: /api/spaces and /api/models share one fixture.
	huggingFace := fixture(jsonType, `[{"id":"alice/leaky-space"}]`)
	router.HandleFunc("/api/spaces", huggingFace)
	router.HandleFunc("/api/models", huggingFace)

	// Replit (/search?q=...&type=repls) and CodeSandbox
	// (/search?query=...&type=sandboxes) both land on /search and return
	// HTML; the "type" query parameter selects which page is served.
	searchPages := map[string]string{
		"repls": `<html><body>
<a href="/@alice/leaky-repl">hit</a>
<a href="/other/path">skip</a>
</body></html>`,
		"sandboxes": `<html><body>
<a href="/s/leaky-sandbox">hit</a>
<a href="/other">skip</a>
</body></html>`,
	}
	router.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", htmlType)
		page, known := searchPages[r.URL.Query().Get("type")]
		if !known {
			w.WriteHeader(http.StatusNotFound)
			return
		}
		_, _ = w.Write([]byte(page))
	})

	// SandboxesSource sub-platforms.
	router.HandleFunc("/codepen-search", fixture(htmlType,
		`<html><body><a href="/alice/pen/AbCd1234">hit</a></body></html>`))
	router.HandleFunc("/jsfiddle-search", fixture(jsonType,
		`{"results":[{"url":"https://jsfiddle.net/u/leaky/"}]}`))

	// Kaggle: /api/v1/kernels/list
	router.HandleFunc("/api/v1/kernels/list", fixture(jsonType,
		`[{"ref":"alice/leaky-notebook"}]`))

	srv := httptest.NewServer(router)
	defer srv.Close()
	origin := srv.URL

	reg := providers.NewRegistryFromProviders([]providers.Provider{
		{Name: "openai", Keywords: []string{"sk-proj-"}},
	})
	lim := recon.NewLimiterRegistry()
	eng := recon.NewEngine()

	// GitHub goes through its real constructor so that `client` is
	// initialized; only afterwards is BaseURL retargeted at the test server.
	github := NewGitHubSource("ghp-test", reg, lim)
	github.BaseURL = origin
	eng.Register(github)

	// GitLab
	gitlab := &GitLabSource{
		Token:    "glpat-test",
		BaseURL:  origin,
		Registry: reg,
		Limiters: lim,
	}
	eng.Register(gitlab)

	// Bitbucket
	bitbucket := &BitbucketSource{
		Token:     "bb-test",
		Workspace: "kh-test",
		BaseURL:   origin,
		Registry:  reg,
		Limiters:  lim,
	}
	eng.Register(bitbucket)

	// Gist: BaseURL serves /gists/public; the raw URLs in the fixture are
	// already absolute.
	gist := &GistSource{
		Token:    "ghp-test",
		BaseURL:  origin,
		Registry: reg,
		Limiters: lim,
	}
	eng.Register(gist)

	// Codeberg
	codeberg := &CodebergSource{
		BaseURL:  origin,
		Registry: reg,
		Limiters: lim,
	}
	eng.Register(codeberg)

	// HuggingFace
	hfConfig := HuggingFaceConfig{
		BaseURL:  origin,
		Registry: reg,
		Limiters: lim,
	}
	eng.Register(NewHuggingFaceSource(hfConfig))

	// Replit
	replit := &ReplitSource{
		BaseURL:  origin,
		Registry: reg,
		Limiters: lim,
	}
	eng.Register(replit)

	// CodeSandbox
	codeSandbox := &CodeSandboxSource{
		BaseURL:  origin,
		Registry: reg,
		Limiters: lim,
	}
	eng.Register(codeSandbox)

	// Sandboxes: test-only sub-platforms whose search paths resolve against
	// the test server.
	testPlatforms := []subPlatform{
		{Name: "codepen", SearchPath: "/codepen-search?q=%s", ResultLinkRegex: `^/[^/]+/pen/[a-zA-Z0-9]+$`, IsJSON: false},
		{Name: "jsfiddle", SearchPath: "/jsfiddle-search?q=%s", IsJSON: true, JSONItemsKey: "results", JSONURLKey: "url"},
	}
	sandboxes := &SandboxesSource{
		Platforms: testPlatforms,
		Registry:  reg,
		Limiters:  lim,
		Client:    NewClient(),
		BaseURL:   origin,
	}
	eng.Register(sandboxes)

	// Kaggle
	kaggle := &KaggleSource{
		User:       "kh-user",
		Key:        "kh-key",
		BaseURL:    origin,
		WebBaseURL: "https://www.kaggle.com",
		Registry:   reg,
		Limiters:   lim,
		client:     NewClient(),
	}
	eng.Register(kaggle)

	// Sanity check: the engine must report exactly the ten sources above.
	if registered := eng.List(); len(registered) != 10 {
		t.Fatalf("expected 10 sources on engine, got %d: %v", len(registered), registered)
	}

	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	findings, err := eng.SweepAll(ctx, recon.Config{Query: "ignored"})
	if err != nil {
		t.Fatalf("SweepAll returned error: %v", err)
	}

	// Tally findings per SourceType, then report every expected type that
	// produced nothing.
	perType := make(map[string]int)
	for _, finding := range findings {
		perType[finding.SourceType]++
	}
	expected := []string{
		"recon:github",
		"recon:gitlab",
		"recon:bitbucket",
		"recon:gist",
		"recon:codeberg",
		"recon:huggingface",
		"recon:replit",
		"recon:codesandbox",
		"recon:sandboxes",
		"recon:kaggle",
	}
	for _, want := range expected {
		if perType[want] > 0 {
			continue
		}
		t.Errorf("expected at least one finding with SourceType=%q, got none\nall findings: %+v", want, findings)
	}
}
// baseFromReq returns the "scheme://host" origin of the inbound request, so a
// handler can emit absolute URLs that point back at the server that received
// it. The scheme is "https" when the request carries TLS state and "http"
// otherwise; the host is taken verbatim from r.Host.
func baseFromReq(r *http.Request) string {
	if r.TLS == nil {
		return "http://" + r.Host
	}
	return "https://" + r.Host
}