diff --git a/pkg/recon/sources/npm.go b/pkg/recon/sources/npm.go new file mode 100644 index 0000000..44d2542 --- /dev/null +++ b/pkg/recon/sources/npm.go @@ -0,0 +1,114 @@ +package sources + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "net/url" + "time" + + "golang.org/x/time/rate" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +// NpmSource searches the npm registry for packages matching provider keywords. +// No credentials required. Emits findings tagged SourceType=recon:npm. +type NpmSource struct { + BaseURL string + Registry *providers.Registry + Limiters *recon.LimiterRegistry + Client *Client +} + +var _ recon.ReconSource = (*NpmSource)(nil) + +func (s *NpmSource) Name() string { return "npm" } +func (s *NpmSource) RateLimit() rate.Limit { return rate.Every(2 * time.Second) } +func (s *NpmSource) Burst() int { return 2 } +func (s *NpmSource) RespectsRobots() bool { return false } +func (s *NpmSource) Enabled(_ recon.Config) bool { return true } + +// npm search JSON response structs. +type npmSearchResponse struct { + Objects []npmObject `json:"objects"` +} + +type npmObject struct { + Package npmPackage `json:"package"` +} + +type npmPackage struct { + Name string `json:"name"` + Links npmLinks `json:"links"` +} + +type npmLinks struct { + Npm string `json:"npm"` +} + +func (s *NpmSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { + base := s.BaseURL + if base == "" { + base = "https://registry.npmjs.org" + } + client := s.Client + if client == nil { + client = NewClient() + } + + queries := BuildQueries(s.Registry, "npm") + if len(queries) == 0 { + return nil + } + + for _, q := range queries { + if err := ctx.Err(); err != nil { + return err + } + + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + searchURL := fmt.Sprintf("%s/-/v1/search?text=%s&size=20", base, url.QueryEscape(q)) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil) + if err != nil { + return fmt.Errorf("npm: build req: %w", err) + } + + resp, err := client.Do(ctx, req) + if err != nil { + return fmt.Errorf("npm: fetch: %w", err) + } + + var result npmSearchResponse + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + _ = resp.Body.Close() + return fmt.Errorf("npm: decode json: %w", err) + } + _ = resp.Body.Close() + + for _, obj := range result.Objects { + if err := ctx.Err(); err != nil { + return err + } + source := obj.Package.Links.Npm + if source == "" { + source = fmt.Sprintf("https://www.npmjs.com/package/%s", obj.Package.Name) + } + out <- recon.Finding{ + ProviderName: "", + Source: source, + SourceType: "recon:npm", + Confidence: "low", + DetectedAt: time.Now(), + } + } + } + return nil +} diff --git a/pkg/recon/sources/npm_test.go b/pkg/recon/sources/npm_test.go new file mode 100644 index 0000000..e8e9db4 --- /dev/null +++ b/pkg/recon/sources/npm_test.go @@ -0,0 +1,141 @@ +package sources + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +func npmTestRegistry() *providers.Registry { + return providers.NewRegistryFromProviders([]providers.Provider{ + {Name: "openai", Keywords: []string{"sk-proj-"}}, + }) +} + +const npmFixtureJSON = `{ + "objects": [ + { + "package": { + "name": "openai-key-checker", + "links": {"npm": "https://www.npmjs.com/package/openai-key-checker"} + } + }, + { + "package": { + "name": "sk-proj-util", + "links": {"npm": ""} + } + } + ] +}` + +func newNpmTestSource(srvURL string) *NpmSource { + return &NpmSource{ + BaseURL: srvURL, + Registry: npmTestRegistry(), + Limiters: recon.NewLimiterRegistry(), + Client: NewClient(), + } +} + +func TestNpm_Sweep_ExtractsFindings(t *testing.T) { + var hits int + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/-/v1/search" { + t.Errorf("unexpected path: %s", r.URL.Path) + } + if r.URL.Query().Get("text") == "" { + t.Errorf("missing text param") + } + hits++ + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(npmFixtureJSON)) + })) + defer srv.Close() + + src := newNpmTestSource(srv.URL) + out := make(chan recon.Finding, 16) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + if err := src.Sweep(ctx, "", out); err != nil { + t.Fatalf("Sweep err: %v", err) + } + close(out) + + var findings []recon.Finding + for f := range out { + findings = append(findings, f) + } + if len(findings) != 2 { + t.Fatalf("expected 2 findings, got %d", len(findings)) + } + + got := map[string]bool{} + for _, f := range findings { + got[f.Source] = true + if f.SourceType != "recon:npm" { + t.Errorf("unexpected SourceType: %s", f.SourceType) + } + if f.Confidence != "low" { + t.Errorf("unexpected Confidence: %s", f.Confidence) + } + } + if !got["https://www.npmjs.com/package/openai-key-checker"] { + t.Error("missing finding with npm link") + } + // Second package has empty links.npm — should get constructed URL. + if !got["https://www.npmjs.com/package/sk-proj-util"] { + t.Error("missing finding with constructed URL") + } + if hits == 0 { + t.Fatal("server was never hit") + } +} + +func TestNpm_EnabledAlwaysTrue(t *testing.T) { + s := &NpmSource{} + if !s.Enabled(recon.Config{}) { + t.Fatal("expected Enabled=true") + } +} + +func TestNpm_Sweep_CtxCancelled(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + time.Sleep(500 * time.Millisecond) + _, _ = w.Write([]byte(npmFixtureJSON)) + })) + defer srv.Close() + + src := newNpmTestSource(srv.URL) + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + out := make(chan recon.Finding, 4) + if err := src.Sweep(ctx, "", out); err == nil { + t.Fatal("expected ctx error") + } +} + +func TestNpm_NameAndRate(t *testing.T) { + s := &NpmSource{} + if s.Name() != "npm" { + t.Errorf("unexpected name: %s", s.Name()) + } + if s.Burst() != 2 { + t.Errorf("burst: %d", s.Burst()) + } + if s.RespectsRobots() { + t.Error("expected RespectsRobots=false") + } + want := float64(1) / 2 + got := float64(s.RateLimit()) + if got < want-0.01 || got > want+0.01 { + t.Errorf("rate limit=%v want~%v", got, want) + } +} diff --git a/pkg/recon/sources/pypi.go b/pkg/recon/sources/pypi.go new file mode 100644 index 0000000..1451e6b --- /dev/null +++ b/pkg/recon/sources/pypi.go @@ -0,0 +1,102 @@ +package sources + +import ( + "context" + "fmt" + "io" + "net/http" + "net/url" + "regexp" + "time" + + "golang.org/x/time/rate" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +// PyPISource searches pypi.org for packages matching provider keywords. +// Scrapes the HTML search page since PyPI has no public search JSON API. +// No credentials required. Emits findings tagged SourceType=recon:pypi. +type PyPISource struct { + BaseURL string + Registry *providers.Registry + Limiters *recon.LimiterRegistry + Client *Client +} + +var _ recon.ReconSource = (*PyPISource)(nil) + +// pypiProjectRE matches /project/{name}/ hrefs in search results. +var pypiProjectRE = regexp.MustCompile(`^/project/[^/]+/?$`) + +func (s *PyPISource) Name() string { return "pypi" } +func (s *PyPISource) RateLimit() rate.Limit { return rate.Every(2 * time.Second) } +func (s *PyPISource) Burst() int { return 2 } +func (s *PyPISource) RespectsRobots() bool { return false } +func (s *PyPISource) Enabled(_ recon.Config) bool { return true } + +func (s *PyPISource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { + base := s.BaseURL + if base == "" { + base = "https://pypi.org" + } + client := s.Client + if client == nil { + client = NewClient() + } + + queries := BuildQueries(s.Registry, "pypi") + if len(queries) == 0 { + return nil + } + + for _, q := range queries { + if err := ctx.Err(); err != nil { + return err + } + + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + searchURL := fmt.Sprintf("%s/search/?q=%s", base, url.QueryEscape(q)) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil) + if err != nil { + return fmt.Errorf("pypi: build req: %w", err) + } + + resp, err := client.Do(ctx, req) + if err != nil { + return fmt.Errorf("pypi: fetch: %w", err) + } + + hrefs, err := extractPyPIProjectLinks(resp.Body) + _ = resp.Body.Close() + if err != nil { + return fmt.Errorf("pypi: parse html: %w", err) + } + + for _, href := range hrefs { + if err := ctx.Err(); err != nil { + return err + } + absURL := base + href + out <- recon.Finding{ + ProviderName: "", + Source: absURL, + SourceType: "recon:pypi", + Confidence: "low", + DetectedAt: time.Now(), + } + } + } + return nil +} + +// extractPyPIProjectLinks extracts unique /project/{name}/ hrefs from HTML. +func extractPyPIProjectLinks(body io.Reader) ([]string, error) { + return extractAnchorHrefs(body, pypiProjectRE) +} diff --git a/pkg/recon/sources/pypi_test.go b/pkg/recon/sources/pypi_test.go new file mode 100644 index 0000000..f38c162 --- /dev/null +++ b/pkg/recon/sources/pypi_test.go @@ -0,0 +1,133 @@ +package sources + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +func pypiTestRegistry() *providers.Registry { + return providers.NewRegistryFromProviders([]providers.Provider{ + {Name: "openai", Keywords: []string{"sk-proj-"}}, + }) +} + +const pypiFixtureHTML = ` +
+ openai-leaked + sk helper + nope + external + duplicate +` + +func newPyPITestSource(srvURL string) *PyPISource { + return &PyPISource{ + BaseURL: srvURL, + Registry: pypiTestRegistry(), + Limiters: recon.NewLimiterRegistry(), + Client: NewClient(), + } +} + +func TestPyPI_Sweep_ExtractsFindings(t *testing.T) { + var hits int + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/search/" { + t.Errorf("unexpected path: %s", r.URL.Path) + } + if r.URL.Query().Get("q") == "" { + t.Errorf("missing q param") + } + hits++ + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(pypiFixtureHTML)) + })) + defer srv.Close() + + src := newPyPITestSource(srv.URL) + out := make(chan recon.Finding, 16) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + if err := src.Sweep(ctx, "", out); err != nil { + t.Fatalf("Sweep err: %v", err) + } + close(out) + + var findings []recon.Finding + for f := range out { + findings = append(findings, f) + } + // 2 unique /project/ links (duplicate is deduped by extractAnchorHrefs) + if len(findings) != 2 { + t.Fatalf("expected 2 findings, got %d", len(findings)) + } + + got := map[string]bool{} + for _, f := range findings { + got[f.Source] = true + if f.SourceType != "recon:pypi" { + t.Errorf("unexpected SourceType: %s", f.SourceType) + } + if f.Confidence != "low" { + t.Errorf("unexpected Confidence: %s", f.Confidence) + } + } + if !got[srv.URL+"/project/openai-leaked/"] { + t.Error("missing openai-leaked finding") + } + if !got[srv.URL+"/project/sk-proj-helper/"] { + t.Error("missing sk-proj-helper finding") + } + if hits == 0 { + t.Fatal("server was never hit") + } +} + +func TestPyPI_EnabledAlwaysTrue(t *testing.T) { + s := &PyPISource{} + if !s.Enabled(recon.Config{}) { + t.Fatal("expected Enabled=true") + } +} + +func TestPyPI_Sweep_CtxCancelled(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + time.Sleep(500 * time.Millisecond) + _, _ = w.Write([]byte(pypiFixtureHTML)) + })) + defer srv.Close() + + src := newPyPITestSource(srv.URL) + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + out := make(chan recon.Finding, 4) + if err := src.Sweep(ctx, "", out); err == nil { + t.Fatal("expected ctx error") + } +} + +func TestPyPI_NameAndRate(t *testing.T) { + s := &PyPISource{} + if s.Name() != "pypi" { + t.Errorf("unexpected name: %s", s.Name()) + } + if s.Burst() != 2 { + t.Errorf("burst: %d", s.Burst()) + } + if s.RespectsRobots() { + t.Error("expected RespectsRobots=false") + } + want := float64(1) / 2 + got := float64(s.RateLimit()) + if got < want-0.01 || got > want+0.01 { + t.Errorf("rate limit=%v want~%v", got, want) + } +}