- Extend httptest mux with fixtures for Google, Bing, DuckDuckGo, Yandex, Brave - Add Pastebin (routed /pb/), GistPaste (/gp/), PasteSites (injected platform) - Assert all 18 SourceTypes emit at least one finding via SweepAll
364 lines
12 KiB
Go
364 lines
12 KiB
Go
package sources
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"net/http"
|
|
"net/http/httptest"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/salvacybersec/keyhunter/pkg/providers"
|
|
"github.com/salvacybersec/keyhunter/pkg/recon"
|
|
)
|
|
|
|
// TestIntegration_AllSources_SweepAll spins up a single multiplexed httptest
|
|
// server that serves canned fixtures for every Phase 10 code-hosting source
|
|
// and Phase 11 search engine / paste site source, registers the sources (with
|
|
// BaseURL overrides pointing at the test server) onto a fresh recon.Engine,
|
|
// runs SweepAll, and asserts at least one Finding was emitted per SourceType
|
|
// across all 18 sources.
|
|
//
|
|
// RegisterAll cannot be used directly because it wires production URLs; the
|
|
// test exercises the same code paths by constructing each source identically
|
|
// to RegisterAll but with BaseURL/Platforms overrides.
|
|
func TestIntegration_AllSources_SweepAll(t *testing.T) {
|
|
mux := http.NewServeMux()
|
|
|
|
// ---- GitHub /search/code ----
|
|
mux.HandleFunc("/search/code", func(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Content-Type", "application/json")
|
|
_ = json.NewEncoder(w).Encode(ghSearchResponse{
|
|
Items: []ghCodeItem{
|
|
{HTMLURL: "https://github.com/alice/leak/blob/main/.env"},
|
|
},
|
|
})
|
|
})
|
|
|
|
// ---- GitLab /api/v4/search ----
|
|
mux.HandleFunc("/api/v4/search", func(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Content-Type", "application/json")
|
|
_, _ = w.Write([]byte(`[{"basename":"keys","data":"sk-proj-abc","path":"keys.env","project_id":42,"ref":"main","startline":1}]`))
|
|
})
|
|
|
|
// ---- Bitbucket /2.0/workspaces/<ws>/search/code ----
|
|
mux.HandleFunc("/2.0/workspaces/kh-test/search/code", func(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Content-Type", "application/json")
|
|
_, _ = w.Write([]byte(`{"values":[{"content_match_count":1,"page_url":"https://bitbucket.org/kh-test/repo/src/main/keys.env","file":{"path":"keys.env","commit":{"hash":"deadbeef"}}}]}`))
|
|
})
|
|
|
|
// ---- Gist /gists/public + raw content ----
|
|
mux.HandleFunc("/gists/public", func(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Content-Type", "application/json")
|
|
body := fmt.Sprintf(`[{"html_url":"https://gist.github.com/alice/gistleak","files":{"f.py":{"filename":"f.py","raw_url":"%s/raw/gist1"}}}]`, baseFromReq(r))
|
|
_, _ = w.Write([]byte(body))
|
|
})
|
|
mux.HandleFunc("/raw/gist1", func(w http.ResponseWriter, r *http.Request) {
|
|
_, _ = w.Write([]byte("api_key = sk-proj-ABCDEF"))
|
|
})
|
|
|
|
// ---- Codeberg /api/v1/repos/search ----
|
|
mux.HandleFunc("/api/v1/repos/search", func(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Content-Type", "application/json")
|
|
_, _ = w.Write([]byte(`{"ok":true,"data":[{"full_name":"bob/keys","html_url":"https://codeberg.org/bob/keys"}]}`))
|
|
})
|
|
|
|
// ---- HuggingFace /api/spaces + /api/models ----
|
|
hfHandler := func(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Content-Type", "application/json")
|
|
_, _ = w.Write([]byte(`[{"id":"alice/leaky-space"}]`))
|
|
}
|
|
mux.HandleFunc("/api/spaces", hfHandler)
|
|
mux.HandleFunc("/api/models", hfHandler)
|
|
|
|
// ---- Replit /search?q=...&type=repls (HTML) ----
|
|
// ---- CodeSandbox /search?query=...&type=sandboxes (HTML) ----
|
|
// Both hit the same /search path; distinguish on query params.
|
|
mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Content-Type", "text/html")
|
|
switch r.URL.Query().Get("type") {
|
|
case "repls":
|
|
_, _ = w.Write([]byte(`<html><body>
|
|
<a href="/@alice/leaky-repl">hit</a>
|
|
<a href="/other/path">skip</a>
|
|
</body></html>`))
|
|
case "sandboxes":
|
|
_, _ = w.Write([]byte(`<html><body>
|
|
<a href="/s/leaky-sandbox">hit</a>
|
|
<a href="/other">skip</a>
|
|
</body></html>`))
|
|
default:
|
|
w.WriteHeader(http.StatusNotFound)
|
|
}
|
|
})
|
|
|
|
// ---- SandboxesSource sub-platforms ----
|
|
mux.HandleFunc("/codepen-search", func(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Content-Type", "text/html")
|
|
_, _ = w.Write([]byte(`<html><body><a href="/alice/pen/AbCd1234">hit</a></body></html>`))
|
|
})
|
|
mux.HandleFunc("/jsfiddle-search", func(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Content-Type", "application/json")
|
|
_, _ = w.Write([]byte(`{"results":[{"url":"https://jsfiddle.net/u/leaky/"}]}`))
|
|
})
|
|
|
|
// ---- Kaggle /api/v1/kernels/list ----
|
|
mux.HandleFunc("/api/v1/kernels/list", func(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Content-Type", "application/json")
|
|
_, _ = w.Write([]byte(`[{"ref":"alice/leaky-notebook"}]`))
|
|
})
|
|
|
|
// ---- Phase 11: Google Custom Search /customsearch/v1 ----
|
|
mux.HandleFunc("/customsearch/v1", func(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Content-Type", "application/json")
|
|
_, _ = w.Write([]byte(`{"items":[{"link":"https://pastebin.com/abc123","title":"leak","snippet":"sk-proj-xxx"}]}`))
|
|
})
|
|
|
|
// ---- Phase 11: Bing /v7.0/search ----
|
|
mux.HandleFunc("/v7.0/search", func(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Content-Type", "application/json")
|
|
_, _ = w.Write([]byte(`{"webPages":{"value":[{"url":"https://example.com/bing-leak","name":"leak"}]}}`))
|
|
})
|
|
|
|
// ---- Phase 11: DuckDuckGo /html/ ----
|
|
mux.HandleFunc("/html/", func(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Content-Type", "text/html")
|
|
_, _ = w.Write([]byte(`<html><body><a class="result__a" href="https://example.com/ddg-leak">result</a></body></html>`))
|
|
})
|
|
|
|
// ---- Phase 11: Yandex /search/xml ----
|
|
mux.HandleFunc("/search/xml", func(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Content-Type", "application/xml")
|
|
_, _ = w.Write([]byte(`<?xml version="1.0" encoding="utf-8"?>
|
|
<yandexsearch><response><results><grouping><group><doc><url>https://example.com/yandex-leak</url></doc></group></grouping></results></response></yandexsearch>`))
|
|
})
|
|
|
|
// ---- Phase 11: Brave /res/v1/web/search ----
|
|
mux.HandleFunc("/res/v1/web/search", func(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Content-Type", "application/json")
|
|
_, _ = w.Write([]byte(`{"web":{"results":[{"url":"https://example.com/brave-leak","title":"leak"}]}}`))
|
|
})
|
|
|
|
// ---- Phase 11: Pastebin (routed under /pb/ prefix) ----
|
|
mux.HandleFunc("/pb/search", func(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Content-Type", "text/html")
|
|
_, _ = w.Write([]byte(`<html><body><a href="/AbCdEf12">paste1</a></body></html>`))
|
|
})
|
|
mux.HandleFunc("/pb/raw/AbCdEf12", func(w http.ResponseWriter, r *http.Request) {
|
|
_, _ = w.Write([]byte("leaked key: sk-proj-PASTEBIN123"))
|
|
})
|
|
|
|
// ---- Phase 11: GistPaste (routed under /gp/ prefix) ----
|
|
mux.HandleFunc("/gp/search", func(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Content-Type", "text/html")
|
|
_, _ = w.Write([]byte(`<html><body><a href="/alice/deadbeef01">gist1</a></body></html>`))
|
|
})
|
|
mux.HandleFunc("/gp/alice/deadbeef01/raw", func(w http.ResponseWriter, r *http.Request) {
|
|
_, _ = w.Write([]byte("leaked: sk-proj-GISTPASTE456"))
|
|
})
|
|
|
|
// ---- Phase 11: PasteSites sub-platforms ----
|
|
mux.HandleFunc("/paste-search", func(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Content-Type", "text/html")
|
|
_, _ = w.Write([]byte(`<html><body><a href="/aB3xZ9">paste</a></body></html>`))
|
|
})
|
|
mux.HandleFunc("/paste-raw/aB3xZ9", func(w http.ResponseWriter, r *http.Request) {
|
|
_, _ = w.Write([]byte("secret: sk-proj-PASTESITES789"))
|
|
})
|
|
|
|
srv := httptest.NewServer(mux)
|
|
defer srv.Close()
|
|
|
|
reg := providers.NewRegistryFromProviders([]providers.Provider{
|
|
{Name: "openai", Keywords: []string{"sk-proj-"}},
|
|
})
|
|
lim := recon.NewLimiterRegistry()
|
|
|
|
eng := recon.NewEngine()
|
|
|
|
// --- Phase 10 sources ---
|
|
|
|
// GitHub -- token + BaseURL override. Use the real constructor so `client`
|
|
// is initialized, then retarget BaseURL at the test server.
|
|
ghs := NewGitHubSource("ghp-test", reg, lim)
|
|
ghs.BaseURL = srv.URL
|
|
eng.Register(ghs)
|
|
// GitLab
|
|
eng.Register(&GitLabSource{
|
|
Token: "glpat-test",
|
|
BaseURL: srv.URL,
|
|
Registry: reg,
|
|
Limiters: lim,
|
|
})
|
|
// Bitbucket
|
|
eng.Register(&BitbucketSource{
|
|
Token: "bb-test",
|
|
Workspace: "kh-test",
|
|
BaseURL: srv.URL,
|
|
Registry: reg,
|
|
Limiters: lim,
|
|
})
|
|
// Gist -- uses same BaseURL for /gists/public; raw URLs are absolute in fixture.
|
|
eng.Register(&GistSource{
|
|
Token: "ghp-test",
|
|
BaseURL: srv.URL,
|
|
Registry: reg,
|
|
Limiters: lim,
|
|
})
|
|
// Codeberg
|
|
eng.Register(&CodebergSource{
|
|
BaseURL: srv.URL,
|
|
Registry: reg,
|
|
Limiters: lim,
|
|
})
|
|
// HuggingFace
|
|
eng.Register(NewHuggingFaceSource(HuggingFaceConfig{
|
|
BaseURL: srv.URL,
|
|
Registry: reg,
|
|
Limiters: lim,
|
|
}))
|
|
// Replit
|
|
eng.Register(&ReplitSource{
|
|
BaseURL: srv.URL,
|
|
Registry: reg,
|
|
Limiters: lim,
|
|
})
|
|
// CodeSandbox
|
|
eng.Register(&CodeSandboxSource{
|
|
BaseURL: srv.URL,
|
|
Registry: reg,
|
|
Limiters: lim,
|
|
})
|
|
// Sandboxes -- inject test sub-platforms that hit srv.URL.
|
|
eng.Register(&SandboxesSource{
|
|
Platforms: []subPlatform{
|
|
{Name: "codepen", SearchPath: "/codepen-search?q=%s", ResultLinkRegex: `^/[^/]+/pen/[a-zA-Z0-9]+$`, IsJSON: false},
|
|
{Name: "jsfiddle", SearchPath: "/jsfiddle-search?q=%s", IsJSON: true, JSONItemsKey: "results", JSONURLKey: "url"},
|
|
},
|
|
Registry: reg,
|
|
Limiters: lim,
|
|
Client: NewClient(),
|
|
BaseURL: srv.URL,
|
|
})
|
|
// Kaggle
|
|
eng.Register(&KaggleSource{
|
|
User: "kh-user",
|
|
Key: "kh-key",
|
|
BaseURL: srv.URL,
|
|
WebBaseURL: "https://www.kaggle.com",
|
|
Registry: reg,
|
|
Limiters: lim,
|
|
client: NewClient(),
|
|
})
|
|
|
|
// --- Phase 11 sources ---
|
|
|
|
// Google Custom Search
|
|
gs := NewGoogleDorkSource("test-api-key", "test-cx", reg, lim)
|
|
gs.BaseURL = srv.URL
|
|
eng.Register(gs)
|
|
// Bing
|
|
bs := NewBingDorkSource("test-bing-key", reg, lim)
|
|
bs.BaseURL = srv.URL
|
|
eng.Register(bs)
|
|
// DuckDuckGo
|
|
ddg := NewDuckDuckGoSource(reg, lim)
|
|
ddg.BaseURL = srv.URL
|
|
eng.Register(ddg)
|
|
// Yandex
|
|
ys := NewYandexSource("test-user", "test-key", reg, lim)
|
|
ys.BaseURL = srv.URL
|
|
eng.Register(ys)
|
|
// Brave
|
|
brs := NewBraveSource("test-brave-key", reg, lim)
|
|
brs.BaseURL = srv.URL
|
|
eng.Register(brs)
|
|
// Pastebin -- uses /pb/ prefix to avoid /search collision
|
|
eng.Register(&PastebinSource{
|
|
BaseURL: srv.URL + "/pb",
|
|
Registry: reg,
|
|
Limiters: lim,
|
|
Client: NewClient(),
|
|
})
|
|
// GistPaste -- uses /gp/ prefix
|
|
eng.Register(&GistPasteSource{
|
|
BaseURL: srv.URL + "/gp",
|
|
Registry: reg,
|
|
Limiters: lim,
|
|
Client: NewClient(),
|
|
})
|
|
// PasteSites -- inject test sub-platform
|
|
eng.Register(&PasteSitesSource{
|
|
Platforms: []pastePlatform{
|
|
{
|
|
Name: "testpaste",
|
|
SearchPath: "/paste-search?q=%s",
|
|
ResultLinkRegex: `^/[a-zA-Z0-9]+$`,
|
|
RawPathTemplate: "/paste-raw%s",
|
|
},
|
|
},
|
|
Registry: reg,
|
|
Limiters: lim,
|
|
Client: NewClient(),
|
|
BaseURL: srv.URL,
|
|
})
|
|
|
|
// Sanity: all 18 sources registered.
|
|
if n := len(eng.List()); n != 18 {
|
|
t.Fatalf("expected 18 sources on engine, got %d: %v", n, eng.List())
|
|
}
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
|
|
defer cancel()
|
|
|
|
findings, err := eng.SweepAll(ctx, recon.Config{Query: "ignored"})
|
|
if err != nil {
|
|
t.Fatalf("SweepAll returned error: %v", err)
|
|
}
|
|
|
|
// Group findings by SourceType and assert every expected bucket is present.
|
|
byType := make(map[string]int)
|
|
for _, f := range findings {
|
|
byType[f.SourceType]++
|
|
}
|
|
|
|
wantTypes := []string{
|
|
// Phase 10
|
|
"recon:github",
|
|
"recon:gitlab",
|
|
"recon:bitbucket",
|
|
"recon:gist",
|
|
"recon:codeberg",
|
|
"recon:huggingface",
|
|
"recon:replit",
|
|
"recon:codesandbox",
|
|
"recon:sandboxes",
|
|
"recon:kaggle",
|
|
// Phase 11
|
|
"recon:google",
|
|
"recon:bing",
|
|
"recon:duckduckgo",
|
|
"recon:yandex",
|
|
"recon:brave",
|
|
"recon:pastebin",
|
|
"recon:gistpaste",
|
|
"recon:pastesites",
|
|
}
|
|
for _, st := range wantTypes {
|
|
if byType[st] == 0 {
|
|
t.Errorf("expected at least one finding with SourceType=%q, got none\nall findings: %+v", st, findings)
|
|
}
|
|
}
|
|
}
|
|
|
|
// baseFromReq returns "<scheme>://<host>" for the inbound request, where the
// scheme is "https" when the request carries TLS state and "http" otherwise,
// and the host is taken verbatim from r.Host. Handlers use it to emit
// absolute URLs that point back at the server that received the request.
func baseFromReq(r *http.Request) string {
	if r.TLS == nil {
		return "http://" + r.Host
	}
	return "https://" + r.Host
}
|