From 62a347f4767bc61fac906fb58944e416589d8117 Mon Sep 17 00:00:00 2001 From: salvacybersec Date: Mon, 6 Apr 2026 01:16:39 +0300 Subject: [PATCH] feat(10-07): add Replit and CodeSandbox scraping sources - ReplitSource scrapes /search HTML extracting /@user/repl anchors - CodeSandboxSource scrapes /search HTML extracting /s/slug anchors - Both use golang.org/x/net/html parser, 10 req/min rate, RespectsRobots=true - 10 httptest-backed tests covering extraction, ctx cancel, rate/name assertions --- pkg/recon/sources/codesandbox.go | 95 +++++++++++++++++ pkg/recon/sources/codesandbox_test.go | 109 ++++++++++++++++++++ pkg/recon/sources/replit.go | 141 ++++++++++++++++++++++++++ pkg/recon/sources/replit_test.go | 131 ++++++++++++++++++++++++ 4 files changed, 476 insertions(+) create mode 100644 pkg/recon/sources/codesandbox.go create mode 100644 pkg/recon/sources/codesandbox_test.go create mode 100644 pkg/recon/sources/replit.go create mode 100644 pkg/recon/sources/replit_test.go diff --git a/pkg/recon/sources/codesandbox.go b/pkg/recon/sources/codesandbox.go new file mode 100644 index 0000000..bb617ac --- /dev/null +++ b/pkg/recon/sources/codesandbox.go @@ -0,0 +1,95 @@ +package sources + +import ( + "context" + "fmt" + "net/http" + "net/url" + "regexp" + "time" + + "golang.org/x/time/rate" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +// CodeSandboxSource scrapes codesandbox.io search results for references to +// provider keywords. Mirrors ReplitSource: robots-respecting, 10 req/min, +// no credentials required. +type CodeSandboxSource struct { + BaseURL string + Registry *providers.Registry + Limiters *recon.LimiterRegistry + Client *Client +} + +// codeSandboxLinkRE matches /s/ sandbox result links. +var codeSandboxLinkRE = regexp.MustCompile(`^/s/[a-zA-Z0-9-]+$`) + +// Compile-time assertion that CodeSandboxSource satisfies recon.ReconSource. 
+var _ recon.ReconSource = (*CodeSandboxSource)(nil) + +func (s *CodeSandboxSource) Name() string { return "codesandbox" } +func (s *CodeSandboxSource) RateLimit() rate.Limit { return rate.Every(6 * time.Second) } +func (s *CodeSandboxSource) Burst() int { return 1 } +func (s *CodeSandboxSource) RespectsRobots() bool { return true } +func (s *CodeSandboxSource) Enabled(_ recon.Config) bool { return true } + +// Sweep runs a CodeSandbox search per provider keyword and emits one Finding +// per matched result anchor. +func (s *CodeSandboxSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { + base := s.BaseURL + if base == "" { + base = "https://codesandbox.io" + } + client := s.Client + if client == nil { + client = NewClient() + } + + queries := BuildQueries(s.Registry, "codesandbox") + if len(queries) == 0 { + return nil + } + + for _, q := range queries { + if err := ctx.Err(); err != nil { + return err + } + + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + + searchURL := fmt.Sprintf("%s/search?query=%s&type=sandboxes", base, url.QueryEscape(q)) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil) + if err != nil { + return fmt.Errorf("codesandbox: build req: %w", err) + } + resp, err := client.Do(ctx, req) + if err != nil { + return fmt.Errorf("codesandbox: fetch: %w", err) + } + links, err := extractAnchorHrefs(resp.Body, codeSandboxLinkRE) + _ = resp.Body.Close() + if err != nil { + return fmt.Errorf("codesandbox: parse html: %w", err) + } + + for _, href := range links { + if err := ctx.Err(); err != nil { + return err + } + out <- recon.Finding{ + Source: base + href, + SourceType: "recon:codesandbox", + Confidence: "low", + DetectedAt: time.Now(), + } + } + } + return nil +} diff --git a/pkg/recon/sources/codesandbox_test.go b/pkg/recon/sources/codesandbox_test.go new file mode 100644 index 0000000..b80e680 --- /dev/null 
+++ b/pkg/recon/sources/codesandbox_test.go @@ -0,0 +1,109 @@ +package sources + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +func codesandboxTestRegistry() *providers.Registry { + return providers.NewRegistryFromProviders([]providers.Provider{ + {Name: "anthropic", Keywords: []string{"sk-ant-"}}, + }) +} + +const codesandboxFixtureHTML = ` + + one + two + skip + skip deeper +` + +func newCodeSandboxTestSource(srvURL string) *CodeSandboxSource { + return &CodeSandboxSource{ + BaseURL: srvURL, + Registry: codesandboxTestRegistry(), + Limiters: recon.NewLimiterRegistry(), + Client: NewClient(), + } +} + +func TestCodeSandbox_Sweep_ExtractsFindings(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/search" { + t.Errorf("unexpected path: %s", r.URL.Path) + } + if r.URL.Query().Get("query") == "" { + t.Errorf("missing query param") + } + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(codesandboxFixtureHTML)) + })) + defer srv.Close() + + src := newCodeSandboxTestSource(srv.URL) + out := make(chan recon.Finding, 16) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + if err := src.Sweep(ctx, "", out); err != nil { + t.Fatalf("Sweep err: %v", err) + } + close(out) + + var findings []recon.Finding + for f := range out { + findings = append(findings, f) + } + if len(findings) != 2 { + t.Fatalf("expected 2 findings, got %d", len(findings)) + } + for _, f := range findings { + if f.SourceType != "recon:codesandbox" { + t.Errorf("unexpected SourceType: %s", f.SourceType) + } + } +} + +func TestCodeSandbox_RespectsRobots(t *testing.T) { + s := &CodeSandboxSource{} + if !s.RespectsRobots() { + t.Fatal("expected RespectsRobots=true") + } +} + +func TestCodeSandbox_EnabledAlwaysTrue(t *testing.T) { + 
s := &CodeSandboxSource{} + if !s.Enabled(recon.Config{}) { + t.Fatal("expected Enabled=true") + } +} + +func TestCodeSandbox_Sweep_CtxCancelled(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + time.Sleep(500 * time.Millisecond) + })) + defer srv.Close() + + src := newCodeSandboxTestSource(srv.URL) + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + out := make(chan recon.Finding, 4) + if err := src.Sweep(ctx, "", out); err == nil { + t.Fatal("expected ctx error") + } +} + +func TestCodeSandbox_NameAndRate(t *testing.T) { + s := &CodeSandboxSource{} + if s.Name() != "codesandbox" { + t.Fatalf("unexpected name: %s", s.Name()) + } +} diff --git a/pkg/recon/sources/replit.go b/pkg/recon/sources/replit.go new file mode 100644 index 0000000..deab8d2 --- /dev/null +++ b/pkg/recon/sources/replit.go @@ -0,0 +1,141 @@ +package sources + +import ( + "context" + "fmt" + "io" + "net/http" + "net/url" + "regexp" + "time" + + "golang.org/x/net/html" + "golang.org/x/time/rate" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +// ReplitSource scrapes replit.com search HTML for references to provider +// keywords. Replit has no public search API, so we honor robots.txt and run +// at a very conservative 10 req/min rate. +// +// Emits one Finding per extracted result link, tagged SourceType=recon:replit. +type ReplitSource struct { + // BaseURL defaults to https://replit.com. Tests override with httptest URL. + BaseURL string + // Registry drives the keyword query list via BuildQueries. + Registry *providers.Registry + // Limiters is the shared recon.LimiterRegistry used to coordinate rate. + // Callers SweepAll wires this; tests may pass a fresh registry. + Limiters *recon.LimiterRegistry + // Client is the shared retry HTTP wrapper. If nil, a default is used. 
+	Client *Client
+}
+
+// replitLinkRE matches /@user/repl result links (no further slashes).
+var replitLinkRE = regexp.MustCompile(`^/@[^/]+/[^/]+$`)
+
+// Compile-time assertion that ReplitSource satisfies recon.ReconSource.
+var _ recon.ReconSource = (*ReplitSource)(nil)
+
+func (s *ReplitSource) Name() string { return "replit" }
+func (s *ReplitSource) RateLimit() rate.Limit { return rate.Every(6 * time.Second) }
+func (s *ReplitSource) Burst() int { return 1 }
+func (s *ReplitSource) RespectsRobots() bool { return true }
+
+// Enabled always returns true: Replit scraping requires no credentials.
+func (s *ReplitSource) Enabled(_ recon.Config) bool { return true }
+
+// Sweep iterates each provider keyword, performs a Replit search query, and
+// extracts result anchors from the returned HTML. The engine coordinates
+// robots.txt at a higher level; per-request rate limiting is delegated to
+// Limiters when available so individual sweeps play nicely with SweepAll.
+func (s *ReplitSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
+	base := s.BaseURL
+	if base == "" {
+		base = "https://replit.com"
+	}
+	client := s.Client
+	if client == nil {
+		client = NewClient()
+	}
+
+	queries := BuildQueries(s.Registry, "replit")
+	if len(queries) == 0 {
+		// No registry → emit nothing, not an error.
+		return nil
+	}
+
+	for _, q := range queries {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+
+		if s.Limiters != nil {
+			if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
+				return err
+			}
+		}
+
+		searchURL := fmt.Sprintf("%s/search?q=%s&type=repls", base, url.QueryEscape(q))
+		req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
+		if err != nil {
+			return fmt.Errorf("replit: build req: %w", err)
+		}
+		resp, err := client.Do(ctx, req)
+		if err != nil {
+			return fmt.Errorf("replit: fetch: %w", err)
+		}
+		links, err := extractAnchorHrefs(resp.Body, replitLinkRE)
+		_ = resp.Body.Close()
+		if err != nil {
+			return fmt.Errorf("replit: parse html: %w", err)
+		}
+
+		for _, href := range links {
+			if err := ctx.Err(); err != nil {
+				return err
+			}
+			absURL := base + href
+			out <- recon.Finding{
+				ProviderName: "",
+				Source: absURL,
+				SourceType: "recon:replit",
+				Confidence: "low",
+				DetectedAt: time.Now(),
+			}
+		}
+	}
+	return nil
+}
+
+// extractAnchorHrefs walks parsed HTML and returns each anchor href whose
+// value matches the given regexp, deduplicated in first-seen order.
+func extractAnchorHrefs(body io.Reader, re *regexp.Regexp) ([]string, error) { + doc, err := html.Parse(body) + if err != nil { + return nil, err + } + var out []string + seen := make(map[string]struct{}) + var walk func(*html.Node) + walk = func(n *html.Node) { + if n.Type == html.ElementNode && n.Data == "a" { + for _, a := range n.Attr { + if a.Key == "href" && re.MatchString(a.Val) { + if _, ok := seen[a.Val]; !ok { + seen[a.Val] = struct{}{} + out = append(out, a.Val) + } + break + } + } + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + walk(c) + } + } + walk(doc) + return out, nil +} diff --git a/pkg/recon/sources/replit_test.go b/pkg/recon/sources/replit_test.go new file mode 100644 index 0000000..7eab809 --- /dev/null +++ b/pkg/recon/sources/replit_test.go @@ -0,0 +1,131 @@ +package sources + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +func replitTestRegistry() *providers.Registry { + return providers.NewRegistryFromProviders([]providers.Provider{ + {Name: "openai", Keywords: []string{"sk-proj-"}}, + }) +} + +const replitFixtureHTML = ` + + hit one + hit two + nope + external + too deep nope +` + +func newReplitTestSource(srvURL string) *ReplitSource { + return &ReplitSource{ + BaseURL: srvURL, + Registry: replitTestRegistry(), + Limiters: recon.NewLimiterRegistry(), + Client: NewClient(), + } +} + +func TestReplit_Sweep_ExtractsFindings(t *testing.T) { + var hits int + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/search" { + t.Errorf("unexpected path: %s", r.URL.Path) + } + if r.URL.Query().Get("q") == "" { + t.Errorf("missing q param") + } + hits++ + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(replitFixtureHTML)) + })) + defer srv.Close() + + src := newReplitTestSource(srv.URL) + out := make(chan recon.Finding, 
16) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + if err := src.Sweep(ctx, "", out); err != nil { + t.Fatalf("Sweep err: %v", err) + } + close(out) + + var findings []recon.Finding + for f := range out { + findings = append(findings, f) + } + if len(findings) != 2 { + t.Fatalf("expected 2 findings, got %d", len(findings)) + } + wantA := srv.URL + "/@alice/super-bot" + wantB := srv.URL + "/@bob/weather-api" + got := map[string]bool{} + for _, f := range findings { + got[f.Source] = true + if f.SourceType != "recon:replit" { + t.Errorf("unexpected SourceType: %s", f.SourceType) + } + } + if !got[wantA] || !got[wantB] { + t.Fatalf("missing expected sources; got=%v", got) + } + if hits == 0 { + t.Fatalf("server was never hit") + } +} + +func TestReplit_RespectsRobots(t *testing.T) { + s := &ReplitSource{} + if !s.RespectsRobots() { + t.Fatal("expected RespectsRobots=true") + } +} + +func TestReplit_EnabledAlwaysTrue(t *testing.T) { + s := &ReplitSource{} + if !s.Enabled(recon.Config{}) { + t.Fatal("expected Enabled=true") + } +} + +func TestReplit_Sweep_CtxCancelled(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + time.Sleep(500 * time.Millisecond) + _, _ = w.Write([]byte(replitFixtureHTML)) + })) + defer srv.Close() + + src := newReplitTestSource(srv.URL) + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + out := make(chan recon.Finding, 4) + if err := src.Sweep(ctx, "", out); err == nil { + t.Fatal("expected ctx error") + } +} + +func TestReplit_NameAndRate(t *testing.T) { + s := &ReplitSource{} + if s.Name() != "replit" { + t.Errorf("unexpected name: %s", s.Name()) + } + if s.Burst() != 1 { + t.Errorf("burst: %d", s.Burst()) + } + want := float64(1) / 6 + got := float64(s.RateLimit()) + if got < want-0.01 || got > want+0.01 { + t.Errorf("rate limit=%v want~%v", got, want) + } +}