From ecebffd27d95e69fb6d6f75cce38dcc2fa9c42ad Mon Sep 17 00:00:00 2001 From: salvacybersec Date: Mon, 6 Apr 2026 01:18:15 +0300 Subject: [PATCH] feat(10-07): add SandboxesSource aggregator (codepen/jsfiddle/stackblitz/glitch/observable) - Single ReconSource umbrella iterating per-platform HTML or JSON search endpoints - Per-platform failures logged and skipped (log-and-continue); ctx cancel aborts fast - Sub-platform identifier encoded in Finding.KeyMasked as 'platform=' (pragmatic slot) - Gitpod intentionally omitted (no public search) - 5 httptest-backed tests covering HTML+JSON extraction, platform-failure tolerance, ctx cancel --- pkg/recon/sources/sandboxes.go | 248 ++++++++++++++++++++++++++++ pkg/recon/sources/sandboxes_test.go | 180 ++++++++++++++++++++ 2 files changed, 428 insertions(+) create mode 100644 pkg/recon/sources/sandboxes.go create mode 100644 pkg/recon/sources/sandboxes_test.go diff --git a/pkg/recon/sources/sandboxes.go b/pkg/recon/sources/sandboxes.go new file mode 100644 index 0000000..b59af03 --- /dev/null +++ b/pkg/recon/sources/sandboxes.go @@ -0,0 +1,248 @@ +package sources + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "io" + "log" + "net/http" + "net/url" + "regexp" + "strings" + "time" + + "golang.org/x/time/rate" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +// subPlatform describes one sandbox/IDE sub-source aggregated under the +// "sandboxes" umbrella. Each sub-platform is either HTML (ResultLinkRegex +// anchors) or JSON (JSONItemsKey → array of objects with JSONURLKey strings). +// +// SearchPath is a printf format string containing exactly one %s placeholder +// for the URL-escaped query keyword. It may be either: +// - an absolute URL (e.g. "https://codepen.io/search/pens?q=%s") used in +// production; or +// - a relative path (e.g. "/codepen-search?q=%s") used in tests that inject +// BaseURL pointing at an httptest server. +type subPlatform struct { + Name string + SearchPath string + ResultLinkRegex string + IsJSON bool + JSONItemsKey string + JSONURLKey string +} + +// defaultPlatforms is the production sub-platform list. +// +// Gitpod is intentionally omitted: gitpod.io exposes no public search index +// at time of writing (verified 2026-04). When a search endpoint appears, add +// it here — no other code changes required. +var defaultPlatforms = []subPlatform{ + { + Name: "codepen", + SearchPath: "https://codepen.io/search/pens?q=%s", + ResultLinkRegex: `^/[^/]+/pen/[a-zA-Z0-9]+$`, + IsJSON: false, + }, + { + Name: "jsfiddle", + SearchPath: "https://jsfiddle.net/api/search/?q=%s", + IsJSON: true, + JSONItemsKey: "results", + JSONURLKey: "url", + }, + { + Name: "stackblitz", + SearchPath: "https://stackblitz.com/search?q=%s", + ResultLinkRegex: `^/edit/[a-zA-Z0-9-]+$`, + IsJSON: false, + }, + { + Name: "glitch", + SearchPath: "https://glitch.com/api/search/projects?q=%s", + IsJSON: true, + JSONItemsKey: "results", + JSONURLKey: "url", + }, + { + Name: "observable", + SearchPath: "https://observablehq.com/search?query=%s", + ResultLinkRegex: `^/@[^/]+/[^/]+$`, + IsJSON: false, + }, +} + +// SandboxesSource aggregates several sandbox/IDE platforms into a single +// ReconSource. Each sub-platform is scraped independently; failures in one +// are logged and skipped without aborting the others. +// +// Every emitted Finding carries SourceType="recon:sandboxes" and encodes the +// originating sub-platform in KeyMasked as "platform=" (pragmatic slot +// until engine.Finding exposes a structured Metadata field). +type SandboxesSource struct { + // Platforms is the list to iterate. When nil, defaultPlatforms is used. + Platforms []subPlatform + Registry *providers.Registry + Limiters *recon.LimiterRegistry + Client *Client + // BaseURL, when non-empty, is prefixed to any relative SearchPath (tests). + BaseURL string +} + +// Compile-time assertion that SandboxesSource satisfies recon.ReconSource. +var _ recon.ReconSource = (*SandboxesSource)(nil) + +func (s *SandboxesSource) Name() string { return "sandboxes" } +func (s *SandboxesSource) RateLimit() rate.Limit { return rate.Every(6 * time.Second) } +func (s *SandboxesSource) Burst() int { return 1 } +func (s *SandboxesSource) RespectsRobots() bool { return true } +func (s *SandboxesSource) Enabled(_ recon.Config) bool { return true } + +// Sweep iterates each sub-platform across each provider keyword. Per-platform +// errors are logged and swallowed so one broken sub-source does not fail the +// overall sweep. Ctx cancellation is honored between every request. +func (s *SandboxesSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { + plats := s.Platforms + if plats == nil { + plats = defaultPlatforms + } + client := s.Client + if client == nil { + client = NewClient() + } + + queries := BuildQueries(s.Registry, "sandboxes") + if len(queries) == 0 { + return nil + } + + for _, p := range plats { + if err := ctx.Err(); err != nil { + return err + } + for _, q := range queries { + if err := ctx.Err(); err != nil { + return err + } + if s.Limiters != nil { + if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { + return err + } + } + if err := s.sweepPlatform(ctx, client, p, q, out); err != nil { + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + return err + } + log.Printf("sandboxes: platform %q failed (skipping): %v", p.Name, err) + // Move to next platform — no point retrying more queries on a dead endpoint. + break + } + } + } + return nil +} + +// sweepPlatform performs one search request for one sub-platform and emits +// matching Findings to out. +func (s *SandboxesSource) sweepPlatform( + ctx context.Context, + client *Client, + p subPlatform, + query string, + out chan<- recon.Finding, +) error { + rawURL := fmt.Sprintf(p.SearchPath, url.QueryEscape(query)) + if s.BaseURL != "" && strings.HasPrefix(rawURL, "/") { + rawURL = s.BaseURL + rawURL + } + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil) + if err != nil { + return fmt.Errorf("build req: %w", err) + } + resp, err := client.Do(ctx, req) + if err != nil { + return fmt.Errorf("fetch: %w", err) + } + defer resp.Body.Close() + + var sources []string + if p.IsJSON { + sources, err = extractJSONURLs(resp.Body, p.JSONItemsKey, p.JSONURLKey) + if err != nil { + return fmt.Errorf("parse json: %w", err) + } + } else { + re, err := regexp.Compile(p.ResultLinkRegex) + if err != nil { + return fmt.Errorf("bad regex: %w", err) + } + hrefs, err := extractAnchorHrefs(resp.Body, re) + if err != nil { + return fmt.Errorf("parse html: %w", err) + } + // Absolute-ize hrefs using request URL's scheme+host. + scheme := req.URL.Scheme + host := req.URL.Host + for _, h := range hrefs { + sources = append(sources, fmt.Sprintf("%s://%s%s", scheme, host, h)) + } + } + + for _, src := range sources { + if err := ctx.Err(); err != nil { + return err + } + out <- recon.Finding{ + Source: src, + SourceType: "recon:sandboxes", + KeyMasked: "platform=" + p.Name, + Confidence: "low", + DetectedAt: time.Now(), + } + } + return nil +} + +// extractJSONURLs decodes a response body of the shape +// `{ "": [ { "": "https://..." }, ... ] }` and returns the +// list of URL strings. Missing keys return an empty slice, not an error. +func extractJSONURLs(body io.Reader, itemsKey, urlKey string) ([]string, error) { + raw, err := io.ReadAll(io.LimitReader(body, 1<<20)) // 1 MiB cap + if err != nil { + return nil, err + } + var envelope map[string]json.RawMessage + if err := json.Unmarshal(raw, &envelope); err != nil { + return nil, err + } + items, ok := envelope[itemsKey] + if !ok { + return nil, nil + } + var arr []map[string]json.RawMessage + if err := json.Unmarshal(items, &arr); err != nil { + return nil, err + } + out := make([]string, 0, len(arr)) + for _, obj := range arr { + v, ok := obj[urlKey] + if !ok { + continue + } + var s string + if err := json.Unmarshal(v, &s); err != nil { + continue + } + if s != "" { + out = append(out, s) + } + } + return out, nil +} diff --git a/pkg/recon/sources/sandboxes_test.go b/pkg/recon/sources/sandboxes_test.go new file mode 100644 index 0000000..121a87f --- /dev/null +++ b/pkg/recon/sources/sandboxes_test.go @@ -0,0 +1,180 @@ +package sources + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +func sandboxesTestRegistry() *providers.Registry { + return providers.NewRegistryFromProviders([]providers.Provider{ + {Name: "openai", Keywords: []string{"sk-proj-"}}, + }) +} + +// sandboxesTestServer serves: +// - /codepen-search : HTML with pen anchors +// - /jsfiddle-search : JSON with results +// - /fail-search : 500 to exercise per-platform failure tolerance +func sandboxesTestServer(t *testing.T) *httptest.Server { + t.Helper() + mux := http.NewServeMux() + mux.HandleFunc("/codepen-search", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html") + _, _ = w.Write([]byte(` + one + two + skip + `)) + }) + mux.HandleFunc("/jsfiddle-search", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"results":[ + {"url":"https://jsfiddle.net/u/abcd1234/"}, + {"url":"https://jsfiddle.net/u/wxyz5678/"} + ]}`)) + }) + mux.HandleFunc("/fail-search", func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + _, _ = w.Write([]byte("boom")) + }) + return httptest.NewServer(mux) +} + +func newSandboxesTestSource(srvURL string, plats []subPlatform) *SandboxesSource { + return &SandboxesSource{ + Platforms: plats, + Registry: sandboxesTestRegistry(), + Limiters: recon.NewLimiterRegistry(), + Client: NewClient(), + BaseURL: srvURL, + } +} + +func TestSandboxes_Sweep_HTMLAndJSON(t *testing.T) { + srv := sandboxesTestServer(t) + defer srv.Close() + + plats := []subPlatform{ + {Name: "codepen", SearchPath: "/codepen-search?q=%s", ResultLinkRegex: `^/[^/]+/pen/[a-zA-Z0-9]+$`, IsJSON: false}, + {Name: "jsfiddle", SearchPath: "/jsfiddle-search?q=%s", IsJSON: true, JSONItemsKey: "results", JSONURLKey: "url"}, + } + src := newSandboxesTestSource(srv.URL, plats) + + out := make(chan recon.Finding, 32) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + if err := src.Sweep(ctx, "", out); err != nil { + t.Fatalf("Sweep err: %v", err) + } + close(out) + + var findings []recon.Finding + for f := range out { + findings = append(findings, f) + } + // codepen: 2 hits, jsfiddle: 2 hits + if len(findings) != 4 { + t.Fatalf("expected 4 findings, got %d: %+v", len(findings), findings) + } + + platforms := map[string]int{} + for _, f := range findings { + if f.SourceType != "recon:sandboxes" { + t.Errorf("unexpected SourceType: %s", f.SourceType) + } + // sub-platform identifier is encoded into KeyMasked as "platform=" + platforms[f.KeyMasked]++ + } + if platforms["platform=codepen"] != 2 { + t.Errorf("expected 2 codepen findings, got %d", platforms["platform=codepen"]) + } + if platforms["platform=jsfiddle"] != 2 { + t.Errorf("expected 2 jsfiddle findings, got %d", platforms["platform=jsfiddle"]) + } +} + +func TestSandboxes_Sweep_FailingPlatformDoesNotAbortOthers(t *testing.T) { + srv := sandboxesTestServer(t) + defer srv.Close() + + plats := []subPlatform{ + {Name: "broken", SearchPath: "/fail-search?q=%s", ResultLinkRegex: `^/x$`, IsJSON: false}, + {Name: "codepen", SearchPath: "/codepen-search?q=%s", ResultLinkRegex: `^/[^/]+/pen/[a-zA-Z0-9]+$`, IsJSON: false}, + } + src := newSandboxesTestSource(srv.URL, plats) + + out := make(chan recon.Finding, 32) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + if err := src.Sweep(ctx, "", out); err != nil { + t.Fatalf("Sweep err (should be nil, log-and-continue): %v", err) + } + close(out) + + var n int + for f := range out { + if f.KeyMasked != "platform=codepen" { + t.Errorf("unexpected platform: %s", f.KeyMasked) + } + n++ + } + if n != 2 { + t.Fatalf("expected 2 codepen findings after broken platform skipped, got %d", n) + } +} + +func TestSandboxes_RespectsRobotsAndName(t *testing.T) { + s := &SandboxesSource{} + if !s.RespectsRobots() { + t.Fatal("expected RespectsRobots=true") + } + if s.Name() != "sandboxes" { + t.Fatalf("unexpected name: %s", s.Name()) + } + if !s.Enabled(recon.Config{}) { + t.Fatal("expected Enabled=true") + } + if s.Burst() != 1 { + t.Fatal("expected Burst=1") + } +} + +func TestSandboxes_Sweep_CtxCancelled(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + time.Sleep(500 * time.Millisecond) + })) + defer srv.Close() + + plats := []subPlatform{ + {Name: "codepen", SearchPath: "/s?q=%s", ResultLinkRegex: `^/x$`, IsJSON: false}, + } + src := newSandboxesTestSource(srv.URL, plats) + + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + out := make(chan recon.Finding, 4) + if err := src.Sweep(ctx, "", out); err == nil { + t.Fatal("expected ctx error") + } +} + +func TestSandboxes_DefaultPlatformsListed(t *testing.T) { + // Sanity check: defaultPlatforms should contain the five documented sub-platforms. + want := map[string]bool{"codepen": true, "jsfiddle": true, "stackblitz": true, "glitch": true, "observable": true} + got := map[string]bool{} + for _, p := range defaultPlatforms { + got[p.Name] = true + } + for k := range want { + if !got[k] { + t.Errorf("missing default platform: %s", k) + } + } +}