Files
keyhunter/pkg/recon/sources/integration_test.go
salvacybersec c16f5feaee feat(13-04): wire all 12 Phase 13 sources into RegisterAll (40 total)
- Add 8 package registry sources (npm, pypi, crates, rubygems, maven, nuget, goproxy, packagist)
- Update register_test to assert 40 sources in sorted list
- Update Phase 12 integration test count from 32 to 40
2026-04-06 12:59:11 +03:00

608 lines
19 KiB
Go

package sources
import (
"context"
"encoding/json"
"fmt"
"net/http"
"net/http/httptest"
"testing"
"time"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
// TestIntegration_AllSources_SweepAll spins up a single multiplexed httptest
// server that serves canned fixtures for every Phase 10 code-hosting source,
// Phase 11 search engine / paste site source, Phase 12 IoT scanner, and
// Phase 12 cloud storage source, registers the sources (with BaseURL overrides
// pointing at the test server) onto a fresh recon.Engine, runs SweepAll, and
// asserts at least one Finding was emitted per SourceType across all 28 sources.
//
// RegisterAll cannot be used directly because it wires production URLs; the
// test exercises the same code paths by constructing each source identically
// to RegisterAll but with BaseURL/Platforms overrides.
//
// The test aborts (t.Fatalf) if the engine does not list exactly 28 sources
// or if SweepAll returns an error. A SourceType with no findings is reported
// with t.Errorf, so every missing bucket is listed in a single run.
func TestIntegration_AllSources_SweepAll(t *testing.T) {
// One mux carries the fixture routes for all sources; colliding paths are
// kept apart with per-source prefixes or query-parameter dispatch below.
mux := http.NewServeMux()
// ---- GitHub /search/code ----
mux.HandleFunc("/search/code", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
_ = json.NewEncoder(w).Encode(ghSearchResponse{
Items: []ghCodeItem{
{HTMLURL: "https://github.com/alice/leak/blob/main/.env"},
},
})
})
// ---- GitLab /api/v4/search ----
mux.HandleFunc("/api/v4/search", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`[{"basename":"keys","data":"sk-proj-abc","path":"keys.env","project_id":42,"ref":"main","startline":1}]`))
})
// ---- Bitbucket /2.0/workspaces/<ws>/search/code ----
// The workspace segment ("kh-test") must match the Workspace configured on
// the BitbucketSource registered further down.
mux.HandleFunc("/2.0/workspaces/kh-test/search/code", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`{"values":[{"content_match_count":1,"page_url":"https://bitbucket.org/kh-test/repo/src/main/keys.env","file":{"path":"keys.env","commit":{"hash":"deadbeef"}}}]}`))
})
// ---- Gist /gists/public + raw content ----
mux.HandleFunc("/gists/public", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
// raw_url is absolute, so it is built from the inbound request's own
// scheme+host to point back at this test server's /raw/gist1 route.
body := fmt.Sprintf(`[{"html_url":"https://gist.github.com/alice/gistleak","files":{"f.py":{"filename":"f.py","raw_url":"%s/raw/gist1"}}}]`, baseFromReq(r))
_, _ = w.Write([]byte(body))
})
mux.HandleFunc("/raw/gist1", func(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write([]byte("api_key = sk-proj-ABCDEF"))
})
// ---- Codeberg /api/v1/repos/search ----
mux.HandleFunc("/api/v1/repos/search", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`{"ok":true,"data":[{"full_name":"bob/keys","html_url":"https://codeberg.org/bob/keys"}]}`))
})
// ---- HuggingFace /api/spaces + /api/models ----
// Both endpoints share one handler and therefore return the same fixture.
hfHandler := func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`[{"id":"alice/leaky-space"}]`))
}
mux.HandleFunc("/api/spaces", hfHandler)
mux.HandleFunc("/api/models", hfHandler)
// ---- Replit /search?q=...&type=repls (HTML) ----
// ---- CodeSandbox /search?query=...&type=sandboxes (HTML) ----
// Both hit the same /search path; distinguish on query params.
mux.HandleFunc("/search", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
switch r.URL.Query().Get("type") {
case "repls":
_, _ = w.Write([]byte(`<html><body>
<a href="/@alice/leaky-repl">hit</a>
<a href="/other/path">skip</a>
</body></html>`))
case "sandboxes":
_, _ = w.Write([]byte(`<html><body>
<a href="/s/leaky-sandbox">hit</a>
<a href="/other">skip</a>
</body></html>`))
default:
// Missing or unrecognised "type": there is no fixture to serve.
w.WriteHeader(http.StatusNotFound)
}
})
// ---- SandboxesSource sub-platforms ----
mux.HandleFunc("/codepen-search", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write([]byte(`<html><body><a href="/alice/pen/AbCd1234">hit</a></body></html>`))
})
mux.HandleFunc("/jsfiddle-search", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`{"results":[{"url":"https://jsfiddle.net/u/leaky/"}]}`))
})
// ---- Kaggle /api/v1/kernels/list ----
mux.HandleFunc("/api/v1/kernels/list", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`[{"ref":"alice/leaky-notebook"}]`))
})
// ---- Phase 11: Google Custom Search /customsearch/v1 ----
mux.HandleFunc("/customsearch/v1", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`{"items":[{"link":"https://pastebin.com/abc123","title":"leak","snippet":"sk-proj-xxx"}]}`))
})
// ---- Phase 11: Bing /v7.0/search ----
mux.HandleFunc("/v7.0/search", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`{"webPages":{"value":[{"url":"https://example.com/bing-leak","name":"leak"}]}}`))
})
// ---- Phase 11: DuckDuckGo /html/ ----
mux.HandleFunc("/html/", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write([]byte(`<html><body><a class="result__a" href="https://example.com/ddg-leak">result</a></body></html>`))
})
// ---- Phase 11: Yandex /search/xml ----
mux.HandleFunc("/search/xml", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/xml")
_, _ = w.Write([]byte(`<?xml version="1.0" encoding="utf-8"?>
<yandexsearch><response><results><grouping><group><doc><url>https://example.com/yandex-leak</url></doc></group></grouping></results></response></yandexsearch>`))
})
// ---- Phase 11: Brave /res/v1/web/search ----
mux.HandleFunc("/res/v1/web/search", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`{"web":{"results":[{"url":"https://example.com/brave-leak","title":"leak"}]}}`))
})
// ---- Phase 11: Pastebin (routed under /pb/ prefix) ----
mux.HandleFunc("/pb/search", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write([]byte(`<html><body><a href="/AbCdEf12">paste1</a></body></html>`))
})
mux.HandleFunc("/pb/raw/AbCdEf12", func(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write([]byte("leaked key: sk-proj-PASTEBIN123"))
})
// ---- Phase 11: GistPaste (routed under /gp/ prefix) ----
mux.HandleFunc("/gp/search", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write([]byte(`<html><body><a href="/alice/deadbeef01">gist1</a></body></html>`))
})
mux.HandleFunc("/gp/alice/deadbeef01/raw", func(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write([]byte("leaked: sk-proj-GISTPASTE456"))
})
// ---- Phase 11: PasteSites sub-platforms ----
mux.HandleFunc("/paste-search", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
_, _ = w.Write([]byte(`<html><body><a href="/aB3xZ9">paste</a></body></html>`))
})
mux.HandleFunc("/paste-raw/aB3xZ9", func(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write([]byte("secret: sk-proj-PASTESITES789"))
})
// ---- Phase 12: Shodan /shodan/host/search ----
mux.HandleFunc("/shodan/host/search", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`{"matches":[{"ip_str":"1.2.3.4","port":8080,"data":"vllm endpoint"}]}`))
})
// ---- Phase 12: Censys /v2/hosts/search ----
mux.HandleFunc("/v2/hosts/search", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`{"result":{"hits":[{"ip":"10.0.0.1","services":[{"port":443,"service_name":"HTTP"}]}]}}`))
})
// ---- Phase 12: ZoomEye /host/search ----
mux.HandleFunc("/host/search", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`{"matches":[{"ip":"172.16.0.1","portinfo":{"port":8443,"service":"https"}}]}`))
})
// ---- Phase 12: FOFA /api/v1/search/all ----
mux.HandleFunc("/api/v1/search/all", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`{"results":[["example.com","192.168.1.1","443"]],"size":1}`))
})
// ---- Phase 12: Netlas /api/responses/ ----
mux.HandleFunc("/api/responses/", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`{"items":[{"data":{"ip":"10.10.10.1","port":80}}]}`))
})
// ---- Phase 12: BinaryEdge /v2/query/search ----
mux.HandleFunc("/v2/query/search", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`{"events":[{"target":{"ip":"192.0.2.1","port":8080}}]}`))
})
// ---- Phase 12: Cloud storage — S3 + DOSpaces (S3 XML format) ----
mux.HandleFunc("/cloud-s3/", func(w http.ResponseWriter, r *http.Request) {
// HEAD gets a bare 200 with no body; any other method gets the listing.
// NOTE(review): presumably HEAD is the scanner's bucket-existence probe —
// confirm against S3Scanner / DOSpacesScanner.
if r.Method == http.MethodHead {
w.WriteHeader(http.StatusOK)
return
}
w.Header().Set("Content-Type", "application/xml")
_, _ = w.Write([]byte(`<?xml version="1.0" encoding="UTF-8"?>
<ListBucketResult>
<Contents><Key>.env</Key></Contents>
<Contents><Key>config.yaml</Key></Contents>
</ListBucketResult>`))
})
// ---- Phase 12: Cloud storage — GCS (JSON format) ----
mux.HandleFunc("/cloud-gcs/", func(w http.ResponseWriter, r *http.Request) {
// Same HEAD short-circuit as the S3 handler above.
if r.Method == http.MethodHead {
w.WriteHeader(http.StatusOK)
return
}
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write([]byte(`{"items":[{"name":".env"},{"name":"config.yaml"}]}`))
})
// ---- Phase 12: Cloud storage — Azure Blob (EnumerationResults XML) ----
// Unlike the S3 and GCS handlers, this one has no HEAD special case: every
// method receives the XML listing.
mux.HandleFunc("/cloud-azure/", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/xml")
_, _ = w.Write([]byte(`<?xml version="1.0" encoding="UTF-8"?>
<EnumerationResults>
<Blobs>
<Blob><Name>.env</Name></Blob>
<Blob><Name>config.yaml</Name></Blob>
</Blobs>
</EnumerationResults>`))
})
srv := httptest.NewServer(mux)
defer srv.Close()
// Registry with a single provider whose only keyword is "sk-proj-"; the
// limiter registry and engine are fresh and shared by every source below.
reg := providers.NewRegistryFromProviders([]providers.Provider{
{Name: "openai", Keywords: []string{"sk-proj-"}},
})
lim := recon.NewLimiterRegistry()
eng := recon.NewEngine()
// --- Phase 10 sources ---
// GitHub -- token + BaseURL override. Use the real constructor so `client`
// is initialized, then retarget BaseURL at the test server.
ghs := NewGitHubSource("ghp-test", reg, lim)
ghs.BaseURL = srv.URL
eng.Register(ghs)
// GitLab
eng.Register(&GitLabSource{
Token: "glpat-test",
BaseURL: srv.URL,
Registry: reg,
Limiters: lim,
})
// Bitbucket
eng.Register(&BitbucketSource{
Token: "bb-test",
Workspace: "kh-test",
BaseURL: srv.URL,
Registry: reg,
Limiters: lim,
})
// Gist -- uses same BaseURL for /gists/public; raw URLs are absolute in fixture.
eng.Register(&GistSource{
Token: "ghp-test",
BaseURL: srv.URL,
Registry: reg,
Limiters: lim,
})
// Codeberg
eng.Register(&CodebergSource{
BaseURL: srv.URL,
Registry: reg,
Limiters: lim,
})
// HuggingFace
eng.Register(NewHuggingFaceSource(HuggingFaceConfig{
BaseURL: srv.URL,
Registry: reg,
Limiters: lim,
}))
// Replit
eng.Register(&ReplitSource{
BaseURL: srv.URL,
Registry: reg,
Limiters: lim,
})
// CodeSandbox
eng.Register(&CodeSandboxSource{
BaseURL: srv.URL,
Registry: reg,
Limiters: lim,
})
// Sandboxes -- inject test sub-platforms that hit srv.URL.
eng.Register(&SandboxesSource{
Platforms: []subPlatform{
{Name: "codepen", SearchPath: "/codepen-search?q=%s", ResultLinkRegex: `^/[^/]+/pen/[a-zA-Z0-9]+$`, IsJSON: false},
{Name: "jsfiddle", SearchPath: "/jsfiddle-search?q=%s", IsJSON: true, JSONItemsKey: "results", JSONURLKey: "url"},
},
Registry: reg,
Limiters: lim,
Client: NewClient(),
BaseURL: srv.URL,
})
// Kaggle -- only the API BaseURL is redirected to the test server;
// WebBaseURL keeps the public kaggle.com address.
eng.Register(&KaggleSource{
User: "kh-user",
Key: "kh-key",
BaseURL: srv.URL,
WebBaseURL: "https://www.kaggle.com",
Registry: reg,
Limiters: lim,
client: NewClient(),
})
// --- Phase 11 sources ---
// Google Custom Search
gs := NewGoogleDorkSource("test-api-key", "test-cx", reg, lim)
gs.BaseURL = srv.URL
eng.Register(gs)
// Bing
bs := NewBingDorkSource("test-bing-key", reg, lim)
bs.BaseURL = srv.URL
eng.Register(bs)
// DuckDuckGo
ddg := NewDuckDuckGoSource(reg, lim)
ddg.BaseURL = srv.URL
eng.Register(ddg)
// Yandex
ys := NewYandexSource("test-user", "test-key", reg, lim)
ys.BaseURL = srv.URL
eng.Register(ys)
// Brave
brs := NewBraveSource("test-brave-key", reg, lim)
brs.BaseURL = srv.URL
eng.Register(brs)
// Pastebin -- uses /pb/ prefix to avoid /search collision
eng.Register(&PastebinSource{
BaseURL: srv.URL + "/pb",
Registry: reg,
Limiters: lim,
Client: NewClient(),
})
// GistPaste -- uses /gp/ prefix
eng.Register(&GistPasteSource{
BaseURL: srv.URL + "/gp",
Registry: reg,
Limiters: lim,
Client: NewClient(),
})
// PasteSites -- inject test sub-platform
eng.Register(&PasteSitesSource{
Platforms: []pastePlatform{
{
Name: "testpaste",
SearchPath: "/paste-search?q=%s",
ResultLinkRegex: `^/[a-zA-Z0-9]+$`,
RawPathTemplate: "/paste-raw%s",
},
},
Registry: reg,
Limiters: lim,
Client: NewClient(),
BaseURL: srv.URL,
})
// --- Phase 12: IoT scanner sources ---
// Shodan
shodanSrc := NewShodanSource("test-shodan-key", reg, lim)
shodanSrc.BaseURL = srv.URL
eng.Register(shodanSrc)
// Censys
censysSrc := NewCensysSource("test-id", "test-secret", reg, lim)
censysSrc.BaseURL = srv.URL
eng.Register(censysSrc)
// ZoomEye
zoomeyeSrc := NewZoomEyeSource("test-zoomeye-key", reg, lim)
zoomeyeSrc.BaseURL = srv.URL
eng.Register(zoomeyeSrc)
// FOFA
eng.Register(&FOFASource{
Email: "test@example.com",
APIKey: "test-fofa-key",
BaseURL: srv.URL,
Registry: reg,
Limiters: lim,
client: NewClient(),
})
// Netlas
eng.Register(&NetlasSource{
APIKey: "test-netlas-key",
BaseURL: srv.URL,
Registry: reg,
Limiters: lim,
client: NewClient(),
})
// BinaryEdge
eng.Register(&BinaryEdgeSource{
APIKey: "test-binaryedge-key",
BaseURL: srv.URL,
Registry: reg,
Limiters: lim,
client: NewClient(),
})
// --- Phase 12: Cloud storage sources ---
// S3 -- BaseURL pattern with %s for bucket name
eng.Register(&S3Scanner{
BaseURL: srv.URL + "/cloud-s3/%s",
Registry: reg,
Limiters: lim,
client: NewClient(),
})
// GCS -- JSON format handler
eng.Register(&GCSScanner{
BaseURL: srv.URL + "/cloud-gcs/%s",
Registry: reg,
Limiters: lim,
client: NewClient(),
})
// AzureBlob -- EnumerationResults XML; needs two %s: account + container
eng.Register(&AzureBlobScanner{
BaseURL: srv.URL + "/cloud-azure/%s-%s",
Registry: reg,
Limiters: lim,
client: NewClient(),
})
// DOSpaces -- S3-compatible XML; needs two %s: bucket + region
// Shares the /cloud-s3/ fixture route with the S3 scanner above.
eng.Register(&DOSpacesScanner{
BaseURL: srv.URL + "/cloud-s3/%s-%s",
Registry: reg,
Limiters: lim,
client: NewClient(),
})
// Sanity: all 28 sources registered.
if n := len(eng.List()); n != 28 {
t.Fatalf("expected 28 sources on engine, got %d: %v", n, eng.List())
}
// Bound the whole sweep to 60 seconds via the context deadline.
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
defer cancel()
findings, err := eng.SweepAll(ctx, recon.Config{Query: "ignored"})
if err != nil {
t.Fatalf("SweepAll returned error: %v", err)
}
// Group findings by SourceType and assert every expected bucket is present.
byType := make(map[string]int)
for _, f := range findings {
byType[f.SourceType]++
}
// One expected SourceType per registered source (28 entries in total).
wantTypes := []string{
// Phase 10
"recon:github",
"recon:gitlab",
"recon:bitbucket",
"recon:gist",
"recon:codeberg",
"recon:huggingface",
"recon:replit",
"recon:codesandbox",
"recon:sandboxes",
"recon:kaggle",
// Phase 11
"recon:google",
"recon:bing",
"recon:duckduckgo",
"recon:yandex",
"recon:brave",
"recon:pastebin",
"recon:gistpaste",
"recon:pastesites",
// Phase 12: IoT scanners
"recon:shodan",
"recon:censys",
"recon:zoomeye",
"recon:fofa",
"recon:netlas",
"recon:binaryedge",
// Phase 12: Cloud storage
"recon:s3",
"recon:gcs",
"recon:azureblob",
"recon:spaces",
}
for _, st := range wantTypes {
// Errorf (not Fatalf) so all missing SourceTypes are reported together.
if byType[st] == 0 {
t.Errorf("expected at least one finding with SourceType=%q, got none\nall findings: %+v", st, findings)
}
}
}
// TestRegisterAll_Phase12 verifies that RegisterAll registers the full source
// set (40 sources in total, which must include the 10 Phase 12 sources) and
// that, with every credential field left empty, the credential-gated IoT
// sources report Enabled()==false while the credentialless cloud storage
// sources report Enabled()==true.
func TestRegisterAll_Phase12(t *testing.T) {
	// Total number of sources RegisterAll is expected to wire up. Keep this
	// in one place so the assertion and its message cannot drift apart.
	const wantTotal = 40

	reg := providers.NewRegistryFromProviders([]providers.Provider{
		{Name: "testprov", Keywords: []string{"test-key"}},
	})
	lim := recon.NewLimiterRegistry()
	eng := recon.NewEngine()
	RegisterAll(eng, SourcesConfig{
		Registry: reg,
		Limiters: lim,
		// All credential fields left empty.
	})

	names := eng.List()
	if got := len(names); got != wantTotal {
		t.Fatalf("expected %d sources from RegisterAll, got %d: %v", wantTotal, got, names)
	}

	// Set of registered names for O(1) membership checks.
	registered := make(map[string]struct{}, len(names))
	for _, name := range names {
		registered[name] = struct{}{}
	}

	// The credential-gated IoT scanners and the credentialless cloud storage
	// scanners together make up the 10 Phase 12 sources.
	iotSources := []string{"shodan", "censys", "zoomeye", "fofa", "netlas", "binaryedge"}
	cloudSources := []string{"s3", "gcs", "azureblob", "spaces"}

	// All 10 Phase 12 sources must be present.
	wantPhase12 := make([]string, 0, len(iotSources)+len(cloudSources))
	wantPhase12 = append(wantPhase12, iotSources...)
	wantPhase12 = append(wantPhase12, cloudSources...)
	for _, name := range wantPhase12 {
		if _, ok := registered[name]; !ok {
			t.Errorf("Phase 12 source %q not found in engine; registered: %v", name, names)
		}
	}

	cfg := recon.Config{}

	// IoT sources with empty credentials must be disabled.
	for _, name := range iotSources {
		src, ok := eng.Get(name)
		if !ok {
			t.Errorf("source %q not found via Get", name)
			continue
		}
		if src.Enabled(cfg) {
			t.Errorf("IoT source %q should be Enabled()==false with empty credentials", name)
		}
	}

	// Cloud storage sources (credentialless) must be enabled.
	for _, name := range cloudSources {
		src, ok := eng.Get(name)
		if !ok {
			t.Errorf("source %q not found via Get", name)
			continue
		}
		if !src.Enabled(cfg) {
			t.Errorf("Cloud source %q should be Enabled()==true (credentialless)", name)
		}
	}
}
// TestRegisterAll_Phase12_SweepAllNoPanic verifies that SweepAll with a very
// short context timeout completes without panic when every source wired up
// by RegisterAll is registered with empty credentials.
func TestRegisterAll_Phase12_SweepAllNoPanic(t *testing.T) {
	registry := providers.NewRegistryFromProviders([]providers.Provider{
		{Name: "testprov", Keywords: []string{"test-key"}},
	})
	limiters := recon.NewLimiterRegistry()
	engine := recon.NewEngine()
	RegisterAll(engine, SourcesConfig{
		Registry: registry,
		Limiters: limiters,
		// All credential fields left empty.
	})

	// 50ms deadline for the whole sweep.
	ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
	defer cancel()

	// Findings and error are discarded on purpose: the only property under
	// test is that SweepAll returns without panicking, regardless of timeout
	// or missing credentials.
	_, _ = engine.SweepAll(ctx, recon.Config{})
}
// baseFromReq returns the "scheme://host" origin of the inbound request, so a
// handler can emit absolute URLs that point back at the httptest server that
// received it. The scheme is "https" when the request carries TLS state and
// "http" otherwise; the host is taken verbatim from r.Host.
func baseFromReq(r *http.Request) string {
	if r.TLS == nil {
		return "http://" + r.Host
	}
	return "https://" + r.Host
}