diff --git a/.planning/phases/13-osint_package_registries_container_iac/13-01-SUMMARY.md b/.planning/phases/13-osint_package_registries_container_iac/13-01-SUMMARY.md new file mode 100644 index 0000000..2814a7c --- /dev/null +++ b/.planning/phases/13-osint_package_registries_container_iac/13-01-SUMMARY.md @@ -0,0 +1,106 @@ +--- +phase: 13-osint_package_registries_container_iac +plan: 01 +subsystem: recon +tags: [npm, pypi, crates.io, rubygems, package-registry, osint] + +requires: + - phase: 10-osint-code-hosting + provides: ReconSource interface, Client, BuildQueries, LimiterRegistry patterns +provides: + - NpmSource searching npm registry JSON API + - PyPISource scraping pypi.org search HTML + - CratesIOSource searching crates.io JSON API with custom User-Agent + - RubyGemsSource searching rubygems.org search.json API +affects: [13-osint_package_registries_container_iac, register.go] + +tech-stack: + added: [] + patterns: [JSON API source pattern, HTML scraping source pattern with extractAnchorHrefs reuse] + +key-files: + created: + - pkg/recon/sources/npm.go + - pkg/recon/sources/npm_test.go + - pkg/recon/sources/pypi.go + - pkg/recon/sources/pypi_test.go + - pkg/recon/sources/cratesio.go + - pkg/recon/sources/cratesio_test.go + - pkg/recon/sources/rubygems.go + - pkg/recon/sources/rubygems_test.go + modified: [] + +key-decisions: + - "PyPI uses HTML scraping with extractAnchorHrefs (reusing Replit pattern) since PyPI has no public search JSON API" + - "CratesIO sets custom User-Agent per crates.io API requirements" + +patterns-established: + - "Package registry source pattern: credentialless, JSON API search, bare keyword queries via BuildQueries" + +requirements-completed: [RECON-PKG-01, RECON-PKG-02] + +duration: 3min +completed: 2026-04-06 +--- + +# Phase 13 Plan 01: Package Registry Sources Summary + +**Four package registry ReconSources (npm, PyPI, crates.io, RubyGems) searching JS/Python/Rust/Ruby ecosystems for provider keyword matches** + +## 
Performance + +- **Duration:** 3 min +- **Started:** 2026-04-06T09:51:16Z +- **Completed:** 2026-04-06T09:54:00Z +- **Tasks:** 2 +- **Files modified:** 8 + +## Accomplishments +- NpmSource searches the npm registry JSON API, requesting up to 20 results per keyword (`size=20`, first page only) +- PyPISource scrapes the pypi.org search HTML, reusing the `extractAnchorHrefs` helper from the Replit source pattern +- CratesIOSource queries the crates.io JSON API and sends the custom User-Agent header that crates.io requires +- RubyGemsSource queries the rubygems.org `search.json` API and falls back to a constructed `https://rubygems.org/gems/{name}` URL when `project_uri` is empty +- All four sources are credentialless, rate-limited, and context-aware, and each has httptest test coverage + +## Task Commits + +Each task was committed atomically: + +1. **Task 1: Implement NpmSource and PyPISource** - `4b268d1` (feat) +2. **Task 2: Implement CratesIOSource and RubyGemsSource** - `9907e24` (feat) + +## Files Created/Modified +- `pkg/recon/sources/npm.go` - NpmSource searching npm registry JSON API +- `pkg/recon/sources/npm_test.go` - httptest tests for NpmSource (4 tests) +- `pkg/recon/sources/pypi.go` - PyPISource scraping pypi.org search HTML +- `pkg/recon/sources/pypi_test.go` - httptest tests for PyPISource (4 tests) +- `pkg/recon/sources/cratesio.go` - CratesIOSource with custom User-Agent +- `pkg/recon/sources/cratesio_test.go` - httptest tests verifying User-Agent header (4 tests) +- `pkg/recon/sources/rubygems.go` - RubyGemsSource searching rubygems.org JSON API +- `pkg/recon/sources/rubygems_test.go` - httptest tests for RubyGemsSource (4 tests) + +## Decisions Made +- PyPI uses HTML scraping with extractAnchorHrefs (reusing Replit pattern) since PyPI has no public search JSON API +- CratesIO sets custom User-Agent header per crates.io API policy requirements +- All sources use bare keyword queries via BuildQueries default path + +## Deviations from Plan + +None - plan executed exactly as written. + +## Issues Encountered +None + +## User Setup Required +None - no external service configuration required. 
+ +## Known Stubs +None - every source calls its real API endpoint and has a functional Sweep implementation; only registration via RegisterAll is still pending. + +## Next Phase Readiness +- Four package registry sources ready for RegisterAll wiring +- Pattern established for remaining registry sources (Maven, NuGet, GoProxy) + +--- +*Phase: 13-osint_package_registries_container_iac* +*Completed: 2026-04-06* diff --git a/pkg/recon/sources/cratesio.go b/pkg/recon/sources/cratesio.go new file mode 100644 index 0000000..926c62c --- /dev/null +++ b/pkg/recon/sources/cratesio.go @@ -0,0 +1,108 @@ +package sources + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "net/url" + "time" + + "golang.org/x/time/rate" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +// CratesIOSource searches crates.io for crates matching provider keywords. +// No credentials required. Emits findings tagged SourceType=recon:crates. +// +// crates.io requires a custom User-Agent header on all requests. +type CratesIOSource struct { + BaseURL string + Registry *providers.Registry + Limiters *recon.LimiterRegistry + Client *Client +} + +var _ recon.ReconSource = (*CratesIOSource)(nil) + +// crates.io search JSON response structs. 
+type cratesSearchResponse struct { + Crates []crateEntry `json:"crates"` +} + +type crateEntry struct { + ID string `json:"id"` + Name string `json:"name"` + Repository string `json:"repository"` +} + +func (s *CratesIOSource) Name() string { return "crates" } +func (s *CratesIOSource) RateLimit() rate.Limit { return rate.Every(1 * time.Second) } +func (s *CratesIOSource) Burst() int { return 1 } +func (s *CratesIOSource) RespectsRobots() bool { return false } +func (s *CratesIOSource) Enabled(_ recon.Config) bool { return true } + +func (s *CratesIOSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { + base := s.BaseURL + if base == "" { + base = "https://crates.io" + } + client := s.Client + if client == nil { + client = NewClient() + } + + queries := BuildQueries(s.Registry, "crates") + if len(queries) == 0 { + return nil + } + + for _, q := range queries { + if err := ctx.Err(); err != nil { + return err + } + + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + searchURL := fmt.Sprintf("%s/api/v1/crates?q=%s&per_page=20", base, url.QueryEscape(q)) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil) + if err != nil { + return fmt.Errorf("crates: build req: %w", err) + } + // crates.io requires a descriptive User-Agent header. 
+ req.Header.Set("User-Agent", "keyhunter-recon/1.0 (https://github.com/salvacybersec/keyhunter)") + + resp, err := client.Do(ctx, req) + if err != nil { + return fmt.Errorf("crates: fetch: %w", err) + } + + var result cratesSearchResponse + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + _ = resp.Body.Close() + return fmt.Errorf("crates: decode json: %w", err) + } + _ = resp.Body.Close() + + for _, c := range result.Crates { + if err := ctx.Err(); err != nil { + return err + } + source := fmt.Sprintf("https://crates.io/crates/%s", c.Name) + out <- recon.Finding{ + ProviderName: "", + Source: source, + SourceType: "recon:crates", + Confidence: "low", + DetectedAt: time.Now(), + } + } + } + return nil +} diff --git a/pkg/recon/sources/cratesio_test.go b/pkg/recon/sources/cratesio_test.go new file mode 100644 index 0000000..50a097f --- /dev/null +++ b/pkg/recon/sources/cratesio_test.go @@ -0,0 +1,137 @@ +package sources + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +func cratesTestRegistry() *providers.Registry { + return providers.NewRegistryFromProviders([]providers.Provider{ + {Name: "openai", Keywords: []string{"sk-proj-"}}, + }) +} + +const cratesFixtureJSON = `{ + "crates": [ + {"id": "openai-rs", "name": "openai-rs", "repository": "https://github.com/example/openai-rs"}, + {"id": "sk-proj-crate", "name": "sk-proj-crate", "repository": ""} + ] +}` + +func newCratesIOTestSource(srvURL string) *CratesIOSource { + return &CratesIOSource{ + BaseURL: srvURL, + Registry: cratesTestRegistry(), + Limiters: recon.NewLimiterRegistry(), + Client: NewClient(), + } +} + +func TestCratesIO_Sweep_ExtractsFindings(t *testing.T) { + var hits int + var gotUserAgent string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/api/v1/crates" { + 
t.Errorf("unexpected path: %s", r.URL.Path) + } + if r.URL.Query().Get("q") == "" { + t.Errorf("missing q param") + } + gotUserAgent = r.Header.Get("User-Agent") + hits++ + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(cratesFixtureJSON)) + })) + defer srv.Close() + + src := newCratesIOTestSource(srv.URL) + out := make(chan recon.Finding, 16) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + if err := src.Sweep(ctx, "", out); err != nil { + t.Fatalf("Sweep err: %v", err) + } + close(out) + + var findings []recon.Finding + for f := range out { + findings = append(findings, f) + } + if len(findings) != 2 { + t.Fatalf("expected 2 findings, got %d", len(findings)) + } + + got := map[string]bool{} + for _, f := range findings { + got[f.Source] = true + if f.SourceType != "recon:crates" { + t.Errorf("unexpected SourceType: %s", f.SourceType) + } + if f.Confidence != "low" { + t.Errorf("unexpected Confidence: %s", f.Confidence) + } + } + if !got["https://crates.io/crates/openai-rs"] { + t.Error("missing openai-rs finding") + } + if !got["https://crates.io/crates/sk-proj-crate"] { + t.Error("missing sk-proj-crate finding") + } + if hits == 0 { + t.Fatal("server was never hit") + } + + // Verify custom User-Agent header. 
+ if gotUserAgent != "keyhunter-recon/1.0 (https://github.com/salvacybersec/keyhunter)" { + t.Errorf("unexpected User-Agent: %s", gotUserAgent) + } +} + +func TestCratesIO_EnabledAlwaysTrue(t *testing.T) { + s := &CratesIOSource{} + if !s.Enabled(recon.Config{}) { + t.Fatal("expected Enabled=true") + } +} + +func TestCratesIO_Sweep_CtxCancelled(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + time.Sleep(500 * time.Millisecond) + _, _ = w.Write([]byte(cratesFixtureJSON)) + })) + defer srv.Close() + + src := newCratesIOTestSource(srv.URL) + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + out := make(chan recon.Finding, 4) + if err := src.Sweep(ctx, "", out); err == nil { + t.Fatal("expected ctx error") + } +} + +func TestCratesIO_NameAndRate(t *testing.T) { + s := &CratesIOSource{} + if s.Name() != "crates" { + t.Errorf("unexpected name: %s", s.Name()) + } + if s.Burst() != 1 { + t.Errorf("burst: %d", s.Burst()) + } + if s.RespectsRobots() { + t.Error("expected RespectsRobots=false") + } + want := float64(1) / 1 + got := float64(s.RateLimit()) + if got < want-0.01 || got > want+0.01 { + t.Errorf("rate limit=%v want~%v", got, want) + } +} diff --git a/pkg/recon/sources/npm.go b/pkg/recon/sources/npm.go new file mode 100644 index 0000000..44d2542 --- /dev/null +++ b/pkg/recon/sources/npm.go @@ -0,0 +1,114 @@ +package sources + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "net/url" + "time" + + "golang.org/x/time/rate" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +// NpmSource searches the npm registry for packages matching provider keywords. +// No credentials required. Emits findings tagged SourceType=recon:npm. 
+type NpmSource struct { + BaseURL string + Registry *providers.Registry + Limiters *recon.LimiterRegistry + Client *Client +} + +var _ recon.ReconSource = (*NpmSource)(nil) + +func (s *NpmSource) Name() string { return "npm" } +func (s *NpmSource) RateLimit() rate.Limit { return rate.Every(2 * time.Second) } +func (s *NpmSource) Burst() int { return 2 } +func (s *NpmSource) RespectsRobots() bool { return false } +func (s *NpmSource) Enabled(_ recon.Config) bool { return true } + +// npm search JSON response structs. +type npmSearchResponse struct { + Objects []npmObject `json:"objects"` +} + +type npmObject struct { + Package npmPackage `json:"package"` +} + +type npmPackage struct { + Name string `json:"name"` + Links npmLinks `json:"links"` +} + +type npmLinks struct { + Npm string `json:"npm"` +} + +func (s *NpmSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { + base := s.BaseURL + if base == "" { + base = "https://registry.npmjs.org" + } + client := s.Client + if client == nil { + client = NewClient() + } + + queries := BuildQueries(s.Registry, "npm") + if len(queries) == 0 { + return nil + } + + for _, q := range queries { + if err := ctx.Err(); err != nil { + return err + } + + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + searchURL := fmt.Sprintf("%s/-/v1/search?text=%s&size=20", base, url.QueryEscape(q)) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil) + if err != nil { + return fmt.Errorf("npm: build req: %w", err) + } + + resp, err := client.Do(ctx, req) + if err != nil { + return fmt.Errorf("npm: fetch: %w", err) + } + + var result npmSearchResponse + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + _ = resp.Body.Close() + return fmt.Errorf("npm: decode json: %w", err) + } + _ = resp.Body.Close() + + for _, obj := range result.Objects { + if err := ctx.Err(); err != nil { + return err + 
} + source := obj.Package.Links.Npm + if source == "" { + source = fmt.Sprintf("https://www.npmjs.com/package/%s", obj.Package.Name) + } + out <- recon.Finding{ + ProviderName: "", + Source: source, + SourceType: "recon:npm", + Confidence: "low", + DetectedAt: time.Now(), + } + } + } + return nil +} diff --git a/pkg/recon/sources/npm_test.go b/pkg/recon/sources/npm_test.go new file mode 100644 index 0000000..e8e9db4 --- /dev/null +++ b/pkg/recon/sources/npm_test.go @@ -0,0 +1,141 @@ +package sources + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +func npmTestRegistry() *providers.Registry { + return providers.NewRegistryFromProviders([]providers.Provider{ + {Name: "openai", Keywords: []string{"sk-proj-"}}, + }) +} + +const npmFixtureJSON = `{ + "objects": [ + { + "package": { + "name": "openai-key-checker", + "links": {"npm": "https://www.npmjs.com/package/openai-key-checker"} + } + }, + { + "package": { + "name": "sk-proj-util", + "links": {"npm": ""} + } + } + ] +}` + +func newNpmTestSource(srvURL string) *NpmSource { + return &NpmSource{ + BaseURL: srvURL, + Registry: npmTestRegistry(), + Limiters: recon.NewLimiterRegistry(), + Client: NewClient(), + } +} + +func TestNpm_Sweep_ExtractsFindings(t *testing.T) { + var hits int + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/-/v1/search" { + t.Errorf("unexpected path: %s", r.URL.Path) + } + if r.URL.Query().Get("text") == "" { + t.Errorf("missing text param") + } + hits++ + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(npmFixtureJSON)) + })) + defer srv.Close() + + src := newNpmTestSource(srv.URL) + out := make(chan recon.Finding, 16) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + if err := src.Sweep(ctx, "", out); err != nil { + 
t.Fatalf("Sweep err: %v", err) + } + close(out) + + var findings []recon.Finding + for f := range out { + findings = append(findings, f) + } + if len(findings) != 2 { + t.Fatalf("expected 2 findings, got %d", len(findings)) + } + + got := map[string]bool{} + for _, f := range findings { + got[f.Source] = true + if f.SourceType != "recon:npm" { + t.Errorf("unexpected SourceType: %s", f.SourceType) + } + if f.Confidence != "low" { + t.Errorf("unexpected Confidence: %s", f.Confidence) + } + } + if !got["https://www.npmjs.com/package/openai-key-checker"] { + t.Error("missing finding with npm link") + } + // Second package has empty links.npm — should get constructed URL. + if !got["https://www.npmjs.com/package/sk-proj-util"] { + t.Error("missing finding with constructed URL") + } + if hits == 0 { + t.Fatal("server was never hit") + } +} + +func TestNpm_EnabledAlwaysTrue(t *testing.T) { + s := &NpmSource{} + if !s.Enabled(recon.Config{}) { + t.Fatal("expected Enabled=true") + } +} + +func TestNpm_Sweep_CtxCancelled(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + time.Sleep(500 * time.Millisecond) + _, _ = w.Write([]byte(npmFixtureJSON)) + })) + defer srv.Close() + + src := newNpmTestSource(srv.URL) + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + out := make(chan recon.Finding, 4) + if err := src.Sweep(ctx, "", out); err == nil { + t.Fatal("expected ctx error") + } +} + +func TestNpm_NameAndRate(t *testing.T) { + s := &NpmSource{} + if s.Name() != "npm" { + t.Errorf("unexpected name: %s", s.Name()) + } + if s.Burst() != 2 { + t.Errorf("burst: %d", s.Burst()) + } + if s.RespectsRobots() { + t.Error("expected RespectsRobots=false") + } + want := float64(1) / 2 + got := float64(s.RateLimit()) + if got < want-0.01 || got > want+0.01 { + t.Errorf("rate limit=%v want~%v", got, want) + } +} diff --git a/pkg/recon/sources/pypi.go b/pkg/recon/sources/pypi.go new file mode 100644 index 
0000000..1451e6b --- /dev/null +++ b/pkg/recon/sources/pypi.go @@ -0,0 +1,102 @@ +package sources + +import ( + "context" + "fmt" + "io" + "net/http" + "net/url" + "regexp" + "time" + + "golang.org/x/time/rate" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +// PyPISource searches pypi.org for packages matching provider keywords. +// Scrapes the HTML search page since PyPI has no public search JSON API. +// No credentials required. Emits findings tagged SourceType=recon:pypi. +type PyPISource struct { + BaseURL string + Registry *providers.Registry + Limiters *recon.LimiterRegistry + Client *Client +} + +var _ recon.ReconSource = (*PyPISource)(nil) + +// pypiProjectRE matches /project/{name}/ hrefs in search results. +var pypiProjectRE = regexp.MustCompile(`^/project/[^/]+/?$`) + +func (s *PyPISource) Name() string { return "pypi" } +func (s *PyPISource) RateLimit() rate.Limit { return rate.Every(2 * time.Second) } +func (s *PyPISource) Burst() int { return 2 } +func (s *PyPISource) RespectsRobots() bool { return false } +func (s *PyPISource) Enabled(_ recon.Config) bool { return true } + +func (s *PyPISource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { + base := s.BaseURL + if base == "" { + base = "https://pypi.org" + } + client := s.Client + if client == nil { + client = NewClient() + } + + queries := BuildQueries(s.Registry, "pypi") + if len(queries) == 0 { + return nil + } + + for _, q := range queries { + if err := ctx.Err(); err != nil { + return err + } + + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + searchURL := fmt.Sprintf("%s/search/?q=%s", base, url.QueryEscape(q)) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil) + if err != nil { + return fmt.Errorf("pypi: build req: %w", err) + } + + resp, err := client.Do(ctx, req) + if err != nil { + return 
fmt.Errorf("pypi: fetch: %w", err) + } + + hrefs, err := extractPyPIProjectLinks(resp.Body) + _ = resp.Body.Close() + if err != nil { + return fmt.Errorf("pypi: parse html: %w", err) + } + + for _, href := range hrefs { + if err := ctx.Err(); err != nil { + return err + } + absURL := base + href + out <- recon.Finding{ + ProviderName: "", + Source: absURL, + SourceType: "recon:pypi", + Confidence: "low", + DetectedAt: time.Now(), + } + } + } + return nil +} + +// extractPyPIProjectLinks extracts unique /project/{name}/ hrefs from HTML. +func extractPyPIProjectLinks(body io.Reader) ([]string, error) { + return extractAnchorHrefs(body, pypiProjectRE) +} diff --git a/pkg/recon/sources/pypi_test.go b/pkg/recon/sources/pypi_test.go new file mode 100644 index 0000000..f38c162 --- /dev/null +++ b/pkg/recon/sources/pypi_test.go @@ -0,0 +1,133 @@ +package sources + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +func pypiTestRegistry() *providers.Registry { + return providers.NewRegistryFromProviders([]providers.Provider{ + {Name: "openai", Keywords: []string{"sk-proj-"}}, + }) +} + +const pypiFixtureHTML = ` +
+ openai-leaked + sk helper + nope + external + duplicate +` + +func newPyPITestSource(srvURL string) *PyPISource { + return &PyPISource{ + BaseURL: srvURL, + Registry: pypiTestRegistry(), + Limiters: recon.NewLimiterRegistry(), + Client: NewClient(), + } +} + +func TestPyPI_Sweep_ExtractsFindings(t *testing.T) { + var hits int + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/search/" { + t.Errorf("unexpected path: %s", r.URL.Path) + } + if r.URL.Query().Get("q") == "" { + t.Errorf("missing q param") + } + hits++ + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(pypiFixtureHTML)) + })) + defer srv.Close() + + src := newPyPITestSource(srv.URL) + out := make(chan recon.Finding, 16) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + if err := src.Sweep(ctx, "", out); err != nil { + t.Fatalf("Sweep err: %v", err) + } + close(out) + + var findings []recon.Finding + for f := range out { + findings = append(findings, f) + } + // 2 unique /project/ links (duplicate is deduped by extractAnchorHrefs) + if len(findings) != 2 { + t.Fatalf("expected 2 findings, got %d", len(findings)) + } + + got := map[string]bool{} + for _, f := range findings { + got[f.Source] = true + if f.SourceType != "recon:pypi" { + t.Errorf("unexpected SourceType: %s", f.SourceType) + } + if f.Confidence != "low" { + t.Errorf("unexpected Confidence: %s", f.Confidence) + } + } + if !got[srv.URL+"/project/openai-leaked/"] { + t.Error("missing openai-leaked finding") + } + if !got[srv.URL+"/project/sk-proj-helper/"] { + t.Error("missing sk-proj-helper finding") + } + if hits == 0 { + t.Fatal("server was never hit") + } +} + +func TestPyPI_EnabledAlwaysTrue(t *testing.T) { + s := &PyPISource{} + if !s.Enabled(recon.Config{}) { + t.Fatal("expected Enabled=true") + } +} + +func TestPyPI_Sweep_CtxCancelled(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w 
http.ResponseWriter, r *http.Request) { + time.Sleep(500 * time.Millisecond) + _, _ = w.Write([]byte(pypiFixtureHTML)) + })) + defer srv.Close() + + src := newPyPITestSource(srv.URL) + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + out := make(chan recon.Finding, 4) + if err := src.Sweep(ctx, "", out); err == nil { + t.Fatal("expected ctx error") + } +} + +func TestPyPI_NameAndRate(t *testing.T) { + s := &PyPISource{} + if s.Name() != "pypi" { + t.Errorf("unexpected name: %s", s.Name()) + } + if s.Burst() != 2 { + t.Errorf("burst: %d", s.Burst()) + } + if s.RespectsRobots() { + t.Error("expected RespectsRobots=false") + } + want := float64(1) / 2 + got := float64(s.RateLimit()) + if got < want-0.01 || got > want+0.01 { + t.Errorf("rate limit=%v want~%v", got, want) + } +} diff --git a/pkg/recon/sources/rubygems.go b/pkg/recon/sources/rubygems.go new file mode 100644 index 0000000..3df5736 --- /dev/null +++ b/pkg/recon/sources/rubygems.go @@ -0,0 +1,102 @@ +package sources + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "net/url" + "time" + + "golang.org/x/time/rate" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +// RubyGemsSource searches rubygems.org for gems matching provider keywords. +// No credentials required. Emits findings tagged SourceType=recon:rubygems. +type RubyGemsSource struct { + BaseURL string + Registry *providers.Registry + Limiters *recon.LimiterRegistry + Client *Client +} + +var _ recon.ReconSource = (*RubyGemsSource)(nil) + +// rubyGemEntry represents one entry in the RubyGems search JSON array. 
+type rubyGemEntry struct { + Name string `json:"name"` + ProjectURI string `json:"project_uri"` +} + +func (s *RubyGemsSource) Name() string { return "rubygems" } +func (s *RubyGemsSource) RateLimit() rate.Limit { return rate.Every(2 * time.Second) } +func (s *RubyGemsSource) Burst() int { return 2 } +func (s *RubyGemsSource) RespectsRobots() bool { return false } +func (s *RubyGemsSource) Enabled(_ recon.Config) bool { return true } + +func (s *RubyGemsSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { + base := s.BaseURL + if base == "" { + base = "https://rubygems.org" + } + client := s.Client + if client == nil { + client = NewClient() + } + + queries := BuildQueries(s.Registry, "rubygems") + if len(queries) == 0 { + return nil + } + + for _, q := range queries { + if err := ctx.Err(); err != nil { + return err + } + + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + searchURL := fmt.Sprintf("%s/api/v1/search.json?query=%s&page=1", base, url.QueryEscape(q)) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil) + if err != nil { + return fmt.Errorf("rubygems: build req: %w", err) + } + + resp, err := client.Do(ctx, req) + if err != nil { + return fmt.Errorf("rubygems: fetch: %w", err) + } + + var gems []rubyGemEntry + if err := json.NewDecoder(resp.Body).Decode(&gems); err != nil { + _ = resp.Body.Close() + return fmt.Errorf("rubygems: decode json: %w", err) + } + _ = resp.Body.Close() + + for _, g := range gems { + if err := ctx.Err(); err != nil { + return err + } + source := g.ProjectURI + if source == "" { + source = fmt.Sprintf("https://rubygems.org/gems/%s", g.Name) + } + out <- recon.Finding{ + ProviderName: "", + Source: source, + SourceType: "recon:rubygems", + Confidence: "low", + DetectedAt: time.Now(), + } + } + } + return nil +} diff --git a/pkg/recon/sources/rubygems_test.go 
b/pkg/recon/sources/rubygems_test.go new file mode 100644 index 0000000..c930cfe --- /dev/null +++ b/pkg/recon/sources/rubygems_test.go @@ -0,0 +1,129 @@ +package sources + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +func rubygemsTestRegistry() *providers.Registry { + return providers.NewRegistryFromProviders([]providers.Provider{ + {Name: "openai", Keywords: []string{"sk-proj-"}}, + }) +} + +const rubygemsFixtureJSON = `[ + {"name": "openai-ruby", "project_uri": "https://rubygems.org/gems/openai-ruby"}, + {"name": "sk-proj-gem", "project_uri": ""} +]` + +func newRubyGemsTestSource(srvURL string) *RubyGemsSource { + return &RubyGemsSource{ + BaseURL: srvURL, + Registry: rubygemsTestRegistry(), + Limiters: recon.NewLimiterRegistry(), + Client: NewClient(), + } +} + +func TestRubyGems_Sweep_ExtractsFindings(t *testing.T) { + var hits int + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/api/v1/search.json" { + t.Errorf("unexpected path: %s", r.URL.Path) + } + if r.URL.Query().Get("query") == "" { + t.Errorf("missing query param") + } + hits++ + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(rubygemsFixtureJSON)) + })) + defer srv.Close() + + src := newRubyGemsTestSource(srv.URL) + out := make(chan recon.Finding, 16) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + if err := src.Sweep(ctx, "", out); err != nil { + t.Fatalf("Sweep err: %v", err) + } + close(out) + + var findings []recon.Finding + for f := range out { + findings = append(findings, f) + } + if len(findings) != 2 { + t.Fatalf("expected 2 findings, got %d", len(findings)) + } + + got := map[string]bool{} + for _, f := range findings { + got[f.Source] = true + if f.SourceType != "recon:rubygems" { + t.Errorf("unexpected 
SourceType: %s", f.SourceType) + } + if f.Confidence != "low" { + t.Errorf("unexpected Confidence: %s", f.Confidence) + } + } + if !got["https://rubygems.org/gems/openai-ruby"] { + t.Error("missing openai-ruby finding") + } + // Second gem has empty project_uri — should get constructed URL. + if !got["https://rubygems.org/gems/sk-proj-gem"] { + t.Error("missing sk-proj-gem finding") + } + if hits == 0 { + t.Fatal("server was never hit") + } +} + +func TestRubyGems_EnabledAlwaysTrue(t *testing.T) { + s := &RubyGemsSource{} + if !s.Enabled(recon.Config{}) { + t.Fatal("expected Enabled=true") + } +} + +func TestRubyGems_Sweep_CtxCancelled(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + time.Sleep(500 * time.Millisecond) + _, _ = w.Write([]byte(rubygemsFixtureJSON)) + })) + defer srv.Close() + + src := newRubyGemsTestSource(srv.URL) + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + out := make(chan recon.Finding, 4) + if err := src.Sweep(ctx, "", out); err == nil { + t.Fatal("expected ctx error") + } +} + +func TestRubyGems_NameAndRate(t *testing.T) { + s := &RubyGemsSource{} + if s.Name() != "rubygems" { + t.Errorf("unexpected name: %s", s.Name()) + } + if s.Burst() != 2 { + t.Errorf("burst: %d", s.Burst()) + } + if s.RespectsRobots() { + t.Error("expected RespectsRobots=false") + } + want := float64(1) / 2 + got := float64(s.RateLimit()) + if got < want-0.01 || got > want+0.01 { + t.Errorf("rate limit=%v want~%v", got, want) + } +}