From 9907e2497a74b1368d2e881ff80ec3c7daa1f77d Mon Sep 17 00:00:00 2001
From: salvacybersec
Date: Mon, 6 Apr 2026 12:53:41 +0300
Subject: [PATCH] feat(13-01): implement CratesIOSource and RubyGemsSource with httptest tests

- CratesIOSource searches crates.io JSON API with custom User-Agent header
- RubyGemsSource searches rubygems.org search.json API for gem matches
- Both credentialless; CratesIO 1 req/s burst 1, RubyGems 1 req/2s burst 2
- Tests verify User-Agent header, Sweep findings, ctx cancellation, metadata
---
Review note: cratesio.go and rubygems.go were amended after this patch was
generated (ctx-aware sends on the findings channel, nameless entries skipped).
Hunk sizes and the diffstat are updated; the "index" blob ids of those two
files are stale.

 pkg/recon/sources/cratesio.go      | 126 ++++++++++++++++++++++++++
 pkg/recon/sources/cratesio_test.go | 137 +++++++++++++++++++++++++++++
 pkg/recon/sources/rubygems.go      | 121 +++++++++++++++++++++++++
 pkg/recon/sources/rubygems_test.go | 129 +++++++++++++++++++++++++++
 4 files changed, 513 insertions(+)
 create mode 100644 pkg/recon/sources/cratesio.go
 create mode 100644 pkg/recon/sources/cratesio_test.go
 create mode 100644 pkg/recon/sources/rubygems.go
 create mode 100644 pkg/recon/sources/rubygems_test.go

diff --git a/pkg/recon/sources/cratesio.go b/pkg/recon/sources/cratesio.go
new file mode 100644
index 0000000..926c62c
--- /dev/null
+++ b/pkg/recon/sources/cratesio.go
@@ -0,0 +1,126 @@
+package sources
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"net/url"
+	"time"
+
+	"golang.org/x/time/rate"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+// CratesIOSource searches crates.io for crates matching provider keywords.
+// No credentials required. Emits findings tagged SourceType=recon:crates.
+//
+// crates.io requires a custom User-Agent header on all requests.
+//
+// An empty BaseURL selects https://crates.io and a nil Client falls back to
+// NewClient(). A nil Limiters skips rate limiting inside Sweep.
+type CratesIOSource struct {
+	BaseURL  string
+	Registry *providers.Registry
+	Limiters *recon.LimiterRegistry
+	Client   *Client
+}
+
+var _ recon.ReconSource = (*CratesIOSource)(nil)
+
+// cratesSearchResponse is the part of the crates.io search response that
+// Sweep decodes.
+type cratesSearchResponse struct {
+	Crates []crateEntry `json:"crates"`
+}
+
+// crateEntry is one element of the "crates" array in a search response.
+type crateEntry struct {
+	ID         string `json:"id"`
+	Name       string `json:"name"`
+	Repository string `json:"repository"`
+}
+
+// Name and the methods below report static metadata for this source: one
+// request per second with a burst of one, always enabled, RespectsRobots false.
+func (s *CratesIOSource) Name() string                { return "crates" }
+func (s *CratesIOSource) RateLimit() rate.Limit       { return rate.Every(1 * time.Second) }
+func (s *CratesIOSource) Burst() int                  { return 1 }
+func (s *CratesIOSource) RespectsRobots() bool        { return false }
+func (s *CratesIOSource) Enabled(_ recon.Config) bool { return true }
+
+// Sweep runs one crates.io search per query returned by BuildQueries and
+// sends a low-confidence finding for every named crate in the results. The
+// string argument is unused. It returns nil when there are no queries, and
+// otherwise stops at the first context, rate-limiter, request or decode
+// error. Sweep never closes out.
+func (s *CratesIOSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
+	base := s.BaseURL
+	if base == "" {
+		base = "https://crates.io"
+	}
+	client := s.Client
+	if client == nil {
+		client = NewClient()
+	}
+
+	queries := BuildQueries(s.Registry, "crates")
+	if len(queries) == 0 {
+		return nil
+	}
+
+	for _, q := range queries {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+
+		if s.Limiters != nil {
+			if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
+				return err
+			}
+		}
+
+		searchURL := fmt.Sprintf("%s/api/v1/crates?q=%s&per_page=20", base, url.QueryEscape(q))
+		req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
+		if err != nil {
+			return fmt.Errorf("crates: build req: %w", err)
+		}
+		// crates.io requires a descriptive User-Agent header.
+		req.Header.Set("User-Agent", "keyhunter-recon/1.0 (https://github.com/salvacybersec/keyhunter)")
+
+		resp, err := client.Do(ctx, req)
+		if err != nil {
+			return fmt.Errorf("crates: fetch: %w", err)
+		}
+
+		var result cratesSearchResponse
+		err = json.NewDecoder(resp.Body).Decode(&result)
+		_ = resp.Body.Close() // body was only read; a close error is not actionable
+		if err != nil {
+			return fmt.Errorf("crates: decode json: %w", err)
+		}
+
+		for _, c := range result.Crates {
+			if c.Name == "" {
+				continue // no name, so there is no crate page to point at
+			}
+			f := recon.Finding{
+				ProviderName: "",
+				Source:       fmt.Sprintf("https://crates.io/crates/%s", c.Name),
+				SourceType:   "recon:crates",
+				Confidence:   "low",
+				DetectedAt:   time.Now(),
+			}
+			// Also watch ctx while sending: a plain send would block forever
+			// if the consumer stopped reading after cancellation.
+			select {
+			case out <- f:
+			case <-ctx.Done():
+				return ctx.Err()
+			}
+		}
+	}
+	return nil
+}
diff --git a/pkg/recon/sources/cratesio_test.go b/pkg/recon/sources/cratesio_test.go
new file mode 100644
index 0000000..50a097f
--- /dev/null
+++ b/pkg/recon/sources/cratesio_test.go
@@ -0,0 +1,137 @@
+package sources
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+func cratesTestRegistry() *providers.Registry {
+	return providers.NewRegistryFromProviders([]providers.Provider{
+		{Name: "openai", Keywords: []string{"sk-proj-"}},
+	})
+}
+
+const cratesFixtureJSON = `{
+  "crates": [
+    {"id": "openai-rs", "name": "openai-rs", "repository": "https://github.com/example/openai-rs"},
+    {"id": "sk-proj-crate", "name": "sk-proj-crate", "repository": ""}
+  ]
+}`
+
+func newCratesIOTestSource(srvURL string) *CratesIOSource {
+	return &CratesIOSource{
+		BaseURL:  srvURL,
+		Registry: cratesTestRegistry(),
+		Limiters: recon.NewLimiterRegistry(),
+		Client:   NewClient(),
+	}
+}
+
+func TestCratesIO_Sweep_ExtractsFindings(t *testing.T) {
+	var hits int
+	var gotUserAgent string
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path != "/api/v1/crates" {
+			t.Errorf("unexpected path: %s", r.URL.Path)
+		}
+		if r.URL.Query().Get("q") == "" {
+			t.Errorf("missing q param")
+		}
+		gotUserAgent = r.Header.Get("User-Agent")
+		hits++
+		w.Header().Set("Content-Type", "application/json")
+		_, _ = w.Write([]byte(cratesFixtureJSON))
+	}))
+	defer srv.Close()
+
+	src := newCratesIOTestSource(srv.URL)
+	out := make(chan recon.Finding, 16)
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	if err := src.Sweep(ctx, "", out); err != nil {
+		t.Fatalf("Sweep err: %v", err)
+	}
+	close(out)
+
+	var findings []recon.Finding
+	for f := range out {
+		findings = append(findings, f)
+	}
+	if len(findings) != 2 {
+		t.Fatalf("expected 2 findings, got %d", len(findings))
+	}
+
+	got := map[string]bool{}
+	for _, f := range findings {
+		got[f.Source] = true
+		if f.SourceType != "recon:crates" {
+			t.Errorf("unexpected SourceType: %s", f.SourceType)
+		}
+		if f.Confidence != "low" {
+			t.Errorf("unexpected Confidence: %s", f.Confidence)
+		}
+	}
+	if !got["https://crates.io/crates/openai-rs"] {
+		t.Error("missing openai-rs finding")
+	}
+	if !got["https://crates.io/crates/sk-proj-crate"] {
+		t.Error("missing sk-proj-crate finding")
+	}
+	if hits == 0 {
+		t.Fatal("server was never hit")
+	}
+
+	// Verify custom User-Agent header.
+	if gotUserAgent != "keyhunter-recon/1.0 (https://github.com/salvacybersec/keyhunter)" {
+		t.Errorf("unexpected User-Agent: %s", gotUserAgent)
+	}
+}
+
+func TestCratesIO_EnabledAlwaysTrue(t *testing.T) {
+	s := &CratesIOSource{}
+	if !s.Enabled(recon.Config{}) {
+		t.Fatal("expected Enabled=true")
+	}
+}
+
+func TestCratesIO_Sweep_CtxCancelled(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		time.Sleep(500 * time.Millisecond)
+		_, _ = w.Write([]byte(cratesFixtureJSON))
+	}))
+	defer srv.Close()
+
+	src := newCratesIOTestSource(srv.URL)
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+
+	out := make(chan recon.Finding, 4)
+	if err := src.Sweep(ctx, "", out); err == nil {
+		t.Fatal("expected ctx error")
+	}
+}
+
+func TestCratesIO_NameAndRate(t *testing.T) {
+	s := &CratesIOSource{}
+	if s.Name() != "crates" {
+		t.Errorf("unexpected name: %s", s.Name())
+	}
+	if s.Burst() != 1 {
+		t.Errorf("burst: %d", s.Burst())
+	}
+	if s.RespectsRobots() {
+		t.Error("expected RespectsRobots=false")
+	}
+	want := float64(1) / 1
+	got := float64(s.RateLimit())
+	if got < want-0.01 || got > want+0.01 {
+		t.Errorf("rate limit=%v want~%v", got, want)
+	}
+}
diff --git a/pkg/recon/sources/rubygems.go b/pkg/recon/sources/rubygems.go
new file mode 100644
index 0000000..3df5736
--- /dev/null
+++ b/pkg/recon/sources/rubygems.go
@@ -0,0 +1,121 @@
+package sources
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"net/url"
+	"time"
+
+	"golang.org/x/time/rate"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+// RubyGemsSource searches rubygems.org for gems matching provider keywords.
+// No credentials required. Emits findings tagged SourceType=recon:rubygems.
+//
+// An empty BaseURL selects https://rubygems.org and a nil Client falls back
+// to NewClient(). A nil Limiters skips rate limiting inside Sweep.
+type RubyGemsSource struct {
+	BaseURL  string
+	Registry *providers.Registry
+	Limiters *recon.LimiterRegistry
+	Client   *Client
+}
+
+var _ recon.ReconSource = (*RubyGemsSource)(nil)
+
+// rubyGemEntry represents one entry in the RubyGems search JSON array.
+type rubyGemEntry struct {
+	Name       string `json:"name"`
+	ProjectURI string `json:"project_uri"`
+}
+
+// Name and the methods below report static metadata for this source: one
+// request every two seconds with a burst of two, always enabled,
+// RespectsRobots false.
+func (s *RubyGemsSource) Name() string                { return "rubygems" }
+func (s *RubyGemsSource) RateLimit() rate.Limit       { return rate.Every(2 * time.Second) }
+func (s *RubyGemsSource) Burst() int                  { return 2 }
+func (s *RubyGemsSource) RespectsRobots() bool        { return false }
+func (s *RubyGemsSource) Enabled(_ recon.Config) bool { return true }
+
+// Sweep runs one rubygems.org search per query returned by BuildQueries and
+// sends a low-confidence finding for every gem in the results, using the
+// gem's project_uri or, when that is empty, a URL built from its name. The
+// string argument is unused. It returns nil when there are no queries, and
+// otherwise stops at the first context, rate-limiter, request or decode
+// error. Sweep never closes out.
+func (s *RubyGemsSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
+	base := s.BaseURL
+	if base == "" {
+		base = "https://rubygems.org"
+	}
+	client := s.Client
+	if client == nil {
+		client = NewClient()
+	}
+
+	queries := BuildQueries(s.Registry, "rubygems")
+	if len(queries) == 0 {
+		return nil
+	}
+
+	for _, q := range queries {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+
+		if s.Limiters != nil {
+			if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
+				return err
+			}
+		}
+
+		searchURL := fmt.Sprintf("%s/api/v1/search.json?query=%s&page=1", base, url.QueryEscape(q))
+		req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
+		if err != nil {
+			return fmt.Errorf("rubygems: build req: %w", err)
+		}
+
+		resp, err := client.Do(ctx, req)
+		if err != nil {
+			return fmt.Errorf("rubygems: fetch: %w", err)
+		}
+
+		var gems []rubyGemEntry
+		err = json.NewDecoder(resp.Body).Decode(&gems)
+		_ = resp.Body.Close() // body was only read; a close error is not actionable
+		if err != nil {
+			return fmt.Errorf("rubygems: decode json: %w", err)
+		}
+
+		for _, g := range gems {
+			source := g.ProjectURI
+			if source == "" {
+				if g.Name == "" {
+					continue // neither a project URI nor a name to build one from
+				}
+				source = fmt.Sprintf("https://rubygems.org/gems/%s", g.Name)
+			}
+			f := recon.Finding{
+				ProviderName: "",
+				Source:       source,
+				SourceType:   "recon:rubygems",
+				Confidence:   "low",
+				DetectedAt:   time.Now(),
+			}
+			// Also watch ctx while sending: a plain send would block forever
+			// if the consumer stopped reading after cancellation.
+			select {
+			case out <- f:
+			case <-ctx.Done():
+				return ctx.Err()
+			}
+		}
+	}
+	return nil
+}
diff --git a/pkg/recon/sources/rubygems_test.go b/pkg/recon/sources/rubygems_test.go
new file mode 100644
index 0000000..c930cfe
--- /dev/null
+++ b/pkg/recon/sources/rubygems_test.go
@@ -0,0 +1,129 @@
+package sources
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+func rubygemsTestRegistry() *providers.Registry {
+	return providers.NewRegistryFromProviders([]providers.Provider{
+		{Name: "openai", Keywords: []string{"sk-proj-"}},
+	})
+}
+
+const rubygemsFixtureJSON = `[
+  {"name": "openai-ruby", "project_uri": "https://rubygems.org/gems/openai-ruby"},
+  {"name": "sk-proj-gem", "project_uri": ""}
+]`
+
+func newRubyGemsTestSource(srvURL string) *RubyGemsSource {
+	return &RubyGemsSource{
+		BaseURL:  srvURL,
+		Registry: rubygemsTestRegistry(),
+		Limiters: recon.NewLimiterRegistry(),
+		Client:   NewClient(),
+	}
+}
+
+func TestRubyGems_Sweep_ExtractsFindings(t *testing.T) {
+	var hits int
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path != "/api/v1/search.json" {
+			t.Errorf("unexpected path: %s", r.URL.Path)
+		}
+		if r.URL.Query().Get("query") == "" {
+			t.Errorf("missing query param")
+		}
+		hits++
+		w.Header().Set("Content-Type", "application/json")
+		_, _ = w.Write([]byte(rubygemsFixtureJSON))
+	}))
+	defer srv.Close()
+
+	src := newRubyGemsTestSource(srv.URL)
+	out := make(chan recon.Finding, 16)
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	if err := src.Sweep(ctx, "", out); err != nil {
+		t.Fatalf("Sweep err: %v", err)
+	}
+	close(out)
+
+	var findings []recon.Finding
+	for f := range out {
+		findings = append(findings, f)
+	}
+	if len(findings) != 2 {
+		t.Fatalf("expected 2 findings, got %d", len(findings))
+	}
+
+	got := map[string]bool{}
+	for _, f := range findings {
+		got[f.Source] = true
+		if f.SourceType != "recon:rubygems" {
+			t.Errorf("unexpected SourceType: %s", f.SourceType)
+		}
+		if f.Confidence != "low" {
+			t.Errorf("unexpected Confidence: %s", f.Confidence)
+		}
+	}
+	if !got["https://rubygems.org/gems/openai-ruby"] {
+		t.Error("missing openai-ruby finding")
+	}
+	// Second gem has empty project_uri — should get constructed URL.
+	if !got["https://rubygems.org/gems/sk-proj-gem"] {
+		t.Error("missing sk-proj-gem finding")
+	}
+	if hits == 0 {
+		t.Fatal("server was never hit")
+	}
+}
+
+func TestRubyGems_EnabledAlwaysTrue(t *testing.T) {
+	s := &RubyGemsSource{}
+	if !s.Enabled(recon.Config{}) {
+		t.Fatal("expected Enabled=true")
+	}
+}
+
+func TestRubyGems_Sweep_CtxCancelled(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		time.Sleep(500 * time.Millisecond)
+		_, _ = w.Write([]byte(rubygemsFixtureJSON))
+	}))
+	defer srv.Close()
+
+	src := newRubyGemsTestSource(srv.URL)
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+
+	out := make(chan recon.Finding, 4)
+	if err := src.Sweep(ctx, "", out); err == nil {
+		t.Fatal("expected ctx error")
+	}
+}
+
+func TestRubyGems_NameAndRate(t *testing.T) {
+	s := &RubyGemsSource{}
+	if s.Name() != "rubygems" {
+		t.Errorf("unexpected name: %s", s.Name())
+	}
+	if s.Burst() != 2 {
+		t.Errorf("burst: %d", s.Burst())
+	}
+	if s.RespectsRobots() {
+		t.Error("expected RespectsRobots=false")
+	}
+	want := float64(1) / 2
+	got := float64(s.RateLimit())
+	if got < want-0.01 || got > want+0.01 {
+		t.Errorf("rate limit=%v want~%v", got, want)
+	}
+}