diff --git a/pkg/recon/sources/goproxy.go b/pkg/recon/sources/goproxy.go
new file mode 100644
index 0000000..76eac1a
--- /dev/null
+++ b/pkg/recon/sources/goproxy.go
@@ -0,0 +1,110 @@
+package sources
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+	"net/url"
+	"regexp"
+	"time"
+
+	"golang.org/x/time/rate"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+// GoProxySource searches pkg.go.dev for Go modules matching provider keywords.
+// pkg.go.dev returns HTML search results, so we parse anchor hrefs for module
+// paths. No authentication required.
+type GoProxySource struct {
+	BaseURL  string
+	Registry *providers.Registry
+	Limiters *recon.LimiterRegistry
+	Client   *Client
+}
+
+// goProxyLinkRE matches Go module paths which always contain a domain with a
+// dot (e.g. /github.com/user/repo). This filters out simple paths like /about.
+var goProxyLinkRE = regexp.MustCompile(`^/[a-z][a-z0-9_-]*\.[a-z0-9./_-]+$`)
+
+// Compile-time assertion that GoProxySource satisfies recon.ReconSource.
+var _ recon.ReconSource = (*GoProxySource)(nil)
+
+func (s *GoProxySource) Name() string          { return "goproxy" }
+func (s *GoProxySource) RateLimit() rate.Limit { return rate.Every(2 * time.Second) }
+func (s *GoProxySource) Burst() int            { return 2 }
+func (s *GoProxySource) RespectsRobots() bool  { return false }
+
+// Enabled always returns true: pkg.go.dev requires no credentials.
+func (s *GoProxySource) Enabled(_ recon.Config) bool { return true }
+
+// Sweep queries pkg.go.dev search for each provider keyword, parses the HTML
+// response for module path links, and emits a Finding per result. Transport
+// errors and non-200 responses skip that query instead of aborting the sweep.
+func (s *GoProxySource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
+	base := s.BaseURL
+	if base == "" {
+		base = "https://pkg.go.dev"
+	}
+	client := s.Client
+	if client == nil {
+		client = NewClient()
+	}
+
+	queries := BuildQueries(s.Registry, "goproxy")
+	if len(queries) == 0 {
+		return nil
+	}
+
+	for _, q := range queries {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+		if s.Limiters != nil {
+			if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
+				return err
+			}
+		}
+
+		endpoint := fmt.Sprintf("%s/search?q=%s&m=package", base, url.QueryEscape(q))
+		req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
+		if err != nil {
+			return fmt.Errorf("goproxy: build request: %w", err)
+		}
+
+		resp, err := client.Do(ctx, req)
+		if err != nil {
+			continue
+		}
+		// Don't scrape error or rate-limit pages for module links.
+		if resp.StatusCode != http.StatusOK {
+			_ = resp.Body.Close()
+			continue
+		}
+
+		links, parseErr := extractAnchorHrefs(resp.Body, goProxyLinkRE)
+		_ = resp.Body.Close()
+		if parseErr != nil {
+			continue
+		}
+
+		for _, href := range links {
+			if err := ctx.Err(); err != nil {
+				return err
+			}
+			absURL := base + href
+			select {
+			case out <- recon.Finding{
+				Source:     absURL,
+				SourceType: "recon:goproxy",
+				Confidence: "low",
+				DetectedAt: time.Now(),
+			}:
+			case <-ctx.Done():
+				return ctx.Err()
+			}
+		}
+	}
+	return nil
+}
diff --git a/pkg/recon/sources/goproxy_test.go b/pkg/recon/sources/goproxy_test.go
new file mode 100644
index 0000000..b36bc79
--- /dev/null
+++ b/pkg/recon/sources/goproxy_test.go
@@ -0,0 +1,124 @@
+package sources
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+func goProxyTestRegistry() *providers.Registry {
+	return providers.NewRegistryFromProviders([]providers.Provider{
+		{Name: "openai", Keywords: []string{"sk-proj-"}},
+	})
+}
+
+const goProxyFixtureHTML = `<html>
+<body>
+<a href="/github.com/example/openai-go">openai-go</a>
+<a href="/github.com/test/llm-client">llm-client</a>
+<a href="/about">about page</a>
+<a href="https://external.example.com/pkg">external</a>
+<a href="/search?q=next&m=package">pagination</a>
+</body></html>`
+
+func newGoProxyTestSource(srvURL string) *GoProxySource {
+	return &GoProxySource{
+		BaseURL:  srvURL,
+		Registry: goProxyTestRegistry(),
+		Limiters: recon.NewLimiterRegistry(),
+		Client:   NewClient(),
+	}
+}
+
+func TestGoProxy_Sweep_ExtractsFindings(t *testing.T) {
+	var hits int
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path != "/search" {
+			t.Errorf("unexpected path: %s", r.URL.Path)
+		}
+		if r.URL.Query().Get("q") == "" {
+			t.Errorf("missing q param")
+		}
+		hits++
+		w.Header().Set("Content-Type", "text/html")
+		_, _ = w.Write([]byte(goProxyFixtureHTML))
+	}))
+	defer srv.Close()
+
+	src := newGoProxyTestSource(srv.URL)
+	out := make(chan recon.Finding, 16)
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	if err := src.Sweep(ctx, "", out); err != nil {
+		t.Fatalf("Sweep err: %v", err)
+	}
+	close(out)
+
+	var findings []recon.Finding
+	for f := range out {
+		findings = append(findings, f)
+	}
+	// Should match the two Go module paths, not /about, /search, or external links
+	if len(findings) != 2 {
+		t.Fatalf("expected 2 findings, got %d", len(findings))
+	}
+
+	want1 := srv.URL + "/github.com/example/openai-go"
+	want2 := srv.URL + "/github.com/test/llm-client"
+	got := map[string]bool{}
+	for _, f := range findings {
+		got[f.Source] = true
+		if f.SourceType != "recon:goproxy" {
+			t.Errorf("unexpected SourceType: %s", f.SourceType)
+		}
+	}
+	if !got[want1] || !got[want2] {
+		t.Fatalf("missing expected sources; got=%v", got)
+	}
+	if hits == 0 {
+		t.Fatal("server was never hit")
+	}
+}
+
+func TestGoProxy_NameAndRate(t *testing.T) {
+	s := &GoProxySource{}
+	if s.Name() != "goproxy" {
+		t.Errorf("unexpected name: %s", s.Name())
+	}
+	if s.Burst() != 2 {
+		t.Errorf("burst: %d", s.Burst())
+	}
+	if s.RespectsRobots() {
+		t.Error("expected RespectsRobots=false")
+	}
+}
+
+func TestGoProxy_EnabledAlwaysTrue(t *testing.T) {
+	s := &GoProxySource{}
+	if !s.Enabled(recon.Config{}) {
+		t.Fatal("expected Enabled=true")
+	}
+}
+
+func TestGoProxy_Sweep_CtxCancelled(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		time.Sleep(500 * time.Millisecond)
+		_, _ = w.Write([]byte(goProxyFixtureHTML))
+	}))
+	defer srv.Close()
+
+	src := newGoProxyTestSource(srv.URL)
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+
+	out := make(chan recon.Finding, 4)
+	if err := src.Sweep(ctx, "", out); err == nil {
+		t.Fatal("expected ctx error")
+	}
+}
diff --git a/pkg/recon/sources/packagist.go b/pkg/recon/sources/packagist.go
new file mode 100644
index 0000000..01c5d98
--- /dev/null
+++ b/pkg/recon/sources/packagist.go
@@ -0,0 +1,117 @@
+package sources
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"net/url"
+	"time"
+
+	"golang.org/x/time/rate"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+// PackagistSource searches Packagist (the PHP package registry) for packages
+// matching provider keywords. The Packagist search API is public JSON and
+// requires no authentication.
+type PackagistSource struct {
+	BaseURL  string
+	Registry *providers.Registry
+	Limiters *recon.LimiterRegistry
+	Client   *Client
+}
+
+// Compile-time assertion that PackagistSource satisfies recon.ReconSource.
+var _ recon.ReconSource = (*PackagistSource)(nil)
+
+func (s *PackagistSource) Name() string          { return "packagist" }
+func (s *PackagistSource) RateLimit() rate.Limit { return rate.Every(2 * time.Second) }
+func (s *PackagistSource) Burst() int            { return 2 }
+func (s *PackagistSource) RespectsRobots() bool  { return false }
+
+// Enabled always returns true: Packagist search requires no credentials.
+func (s *PackagistSource) Enabled(_ recon.Config) bool { return true }
+
+// Sweep queries Packagist's search API for each provider keyword and emits a
+// Finding per matching package. Transport errors and non-200 responses skip
+// that query instead of aborting the sweep.
+func (s *PackagistSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
+	base := s.BaseURL
+	if base == "" {
+		base = "https://packagist.org"
+	}
+	client := s.Client
+	if client == nil {
+		client = NewClient()
+	}
+
+	queries := BuildQueries(s.Registry, "packagist")
+	if len(queries) == 0 {
+		return nil
+	}
+
+	for _, q := range queries {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+		if s.Limiters != nil {
+			if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
+				return err
+			}
+		}
+
+		endpoint := fmt.Sprintf("%s/search.json?q=%s&per_page=20",
+			base, url.QueryEscape(q))
+		req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
+		if err != nil {
+			return fmt.Errorf("packagist: build request: %w", err)
+		}
+		req.Header.Set("Accept", "application/json")
+
+		resp, err := client.Do(ctx, req)
+		if err != nil {
+			continue
+		}
+		// Don't feed error or rate-limit responses to the JSON decoder.
+		if resp.StatusCode != http.StatusOK {
+			_ = resp.Body.Close()
+			continue
+		}
+
+		var parsed packagistSearchResponse
+		decErr := json.NewDecoder(resp.Body).Decode(&parsed)
+		_ = resp.Body.Close()
+		if decErr != nil {
+			continue
+		}
+
+		for _, pkg := range parsed.Results {
+			if err := ctx.Err(); err != nil {
+				return err
+			}
+			select {
+			case out <- recon.Finding{
+				Source:     pkg.URL,
+				SourceType: "recon:packagist",
+				Confidence: "low",
+				DetectedAt: time.Now(),
+			}:
+			case <-ctx.Done():
+				return ctx.Err()
+			}
+		}
+	}
+	return nil
+}
+
+type packagistSearchResponse struct {
+	Results []packagistPackage `json:"results"`
+}
+
+type packagistPackage struct {
+	Name string `json:"name"`
+	URL  string `json:"url"`
+}
diff --git a/pkg/recon/sources/packagist_test.go b/pkg/recon/sources/packagist_test.go
new file mode 100644
index 0000000..671c7b4
--- /dev/null
+++ b/pkg/recon/sources/packagist_test.go
@@ -0,0 +1,121 @@
+package sources
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+func packagistTestRegistry() *providers.Registry {
+	return providers.NewRegistryFromProviders([]providers.Provider{
+		{Name: "openai", Keywords: []string{"sk-proj-"}},
+	})
+}
+
+const packagistFixtureJSON = `{
+  "results": [
+    {"name": "vendor/openai-php", "url": "https://packagist.org/packages/vendor/openai-php"},
+    {"name": "other/llm-sdk", "url": "https://packagist.org/packages/other/llm-sdk"}
+  ]
+}`
+
+func newPackagistTestSource(srvURL string) *PackagistSource {
+	return &PackagistSource{
+		BaseURL:  srvURL,
+		Registry: packagistTestRegistry(),
+		Limiters: recon.NewLimiterRegistry(),
+		Client:   NewClient(),
+	}
+}
+
+func TestPackagist_Sweep_ExtractsFindings(t *testing.T) {
+	var hits int
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path != "/search.json" {
+			t.Errorf("unexpected path: %s", r.URL.Path)
+		}
+		if r.URL.Query().Get("q") == "" {
+			t.Errorf("missing q param")
+		}
+		hits++
+		w.Header().Set("Content-Type", "application/json")
+		_, _ = w.Write([]byte(packagistFixtureJSON))
+	}))
+	defer srv.Close()
+
+	src := newPackagistTestSource(srv.URL)
+	out := make(chan recon.Finding, 16)
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	if err := src.Sweep(ctx, "", out); err != nil {
+		t.Fatalf("Sweep err: %v", err)
+	}
+	close(out)
+
+	var findings []recon.Finding
+	for f := range out {
+		findings = append(findings, f)
+	}
+	if len(findings) != 2 {
+		t.Fatalf("expected 2 findings, got %d", len(findings))
+	}
+
+	want1 := "https://packagist.org/packages/vendor/openai-php"
+	want2 := "https://packagist.org/packages/other/llm-sdk"
+	got := map[string]bool{}
+	for _, f := range findings {
+		got[f.Source] = true
+		if f.SourceType != "recon:packagist" {
+			t.Errorf("unexpected SourceType: %s", f.SourceType)
+		}
+	}
+	if !got[want1] || !got[want2] {
+		t.Fatalf("missing expected sources; got=%v", got)
+	}
+	if hits == 0 {
+		t.Fatal("server was never hit")
+	}
+}
+
+func TestPackagist_NameAndRate(t *testing.T) {
+	s := &PackagistSource{}
+	if s.Name() != "packagist" {
+		t.Errorf("unexpected name: %s", s.Name())
+	}
+	if s.Burst() != 2 {
+		t.Errorf("burst: %d", s.Burst())
+	}
+	if s.RespectsRobots() {
+		t.Error("expected RespectsRobots=false")
+	}
+}
+
+func TestPackagist_EnabledAlwaysTrue(t *testing.T) {
+	s := &PackagistSource{}
+	if !s.Enabled(recon.Config{}) {
+		t.Fatal("expected Enabled=true")
+	}
+}
+
+func TestPackagist_Sweep_CtxCancelled(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		time.Sleep(500 * time.Millisecond)
+		_, _ = w.Write([]byte(packagistFixtureJSON))
+	}))
+	defer srv.Close()
+
+	src := newPackagistTestSource(srv.URL)
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+
+	out := make(chan recon.Finding, 4)
+	if err := src.Sweep(ctx, "", out); err == nil {
+		t.Fatal("expected ctx error")
+	}
+}