package sources

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"time"

	"golang.org/x/time/rate"

	"github.com/salvacybersec/keyhunter/pkg/providers"
	"github.com/salvacybersec/keyhunter/pkg/recon"
)

// subPlatform describes one sandbox/IDE sub-source aggregated under the
// "sandboxes" umbrella. Each sub-platform is either HTML (ResultLinkRegex
// anchors) or JSON (JSONItemsKey → array of objects with JSONURLKey strings).
//
// SearchPath is a printf format string containing exactly one %s placeholder
// for the URL-escaped query keyword. It may be either:
//   - an absolute URL (e.g. "https://codepen.io/search/pens?q=%s") used in
//     production; or
//   - a relative path (e.g. "/codepen-search?q=%s") used in tests that inject
//     BaseURL pointing at an httptest server.
type subPlatform struct {
	// Name identifies the sub-platform; it appears in skip-log messages and
	// is appended to "platform=" in each emitted Finding's KeyMasked.
	Name string
	// SearchPath is the printf-style search URL (or relative path) template;
	// see the type comment for the two accepted forms.
	SearchPath string
	// ResultLinkRegex is compiled with regexp.Compile and handed to
	// extractAnchorHrefs for HTML platforms. Unused when IsJSON is true.
	// NOTE(review): presumably matched against anchor href values — confirm
	// against extractAnchorHrefs, which is defined elsewhere.
	ResultLinkRegex string
	// IsJSON selects the JSON parsing path (extractJSONURLs) instead of the
	// HTML anchor-scraping path.
	IsJSON bool
	// JSONItemsKey is the top-level object key holding the result array
	// (JSON platforms only).
	JSONItemsKey string
	// JSONURLKey is the key inside each result object whose string value is
	// the result URL (JSON platforms only).
	JSONURLKey string
}

// defaultPlatforms is the production sub-platform list.
//
// Gitpod is intentionally omitted: gitpod.io exposes no public search index
// at time of writing (verified 2026-04). When a search endpoint appears, add
// it here — no other code changes required.
var defaultPlatforms = []subPlatform{
	{
		Name:            "codepen",
		SearchPath:      "https://codepen.io/search/pens?q=%s",
		ResultLinkRegex: `^/[^/]+/pen/[a-zA-Z0-9]+$`,
		IsJSON:          false,
	},
	{
		Name:         "jsfiddle",
		SearchPath:   "https://jsfiddle.net/api/search/?q=%s",
		IsJSON:       true,
		JSONItemsKey: "results",
		JSONURLKey:   "url",
	},
	{
		Name:            "stackblitz",
		SearchPath:      "https://stackblitz.com/search?q=%s",
		ResultLinkRegex: `^/edit/[a-zA-Z0-9-]+$`,
		IsJSON:          false,
	},
	{
		Name:         "glitch",
		SearchPath:   "https://glitch.com/api/search/projects?q=%s",
		IsJSON:       true,
		JSONItemsKey: "results",
		JSONURLKey:   "url",
	},
	{
		Name:            "observable",
		SearchPath:      "https://observablehq.com/search?query=%s",
		ResultLinkRegex: `^/@[^/]+/[^/]+$`,
		IsJSON:          false,
	},
}

// SandboxesSource aggregates several sandbox/IDE platforms into a single
// ReconSource. Each sub-platform is scraped independently; failures in one
// are logged and skipped without aborting the others.
//
// Every emitted Finding carries SourceType="recon:sandboxes" and encodes the
// originating sub-platform in KeyMasked as "platform=<name>" (pragmatic slot
// until engine.Finding exposes a structured Metadata field).
type SandboxesSource struct {
	// Platforms is the list to iterate. When nil, defaultPlatforms is used.
	Platforms []subPlatform
	// Registry is passed to BuildQueries to derive the search keywords.
	Registry *providers.Registry
	// Limiters, when non-nil, is waited on before every search request.
	Limiters *recon.LimiterRegistry
	// Client performs the HTTP requests. When nil, Sweep falls back to
	// NewClient().
	Client *Client
	// BaseURL, when non-empty, is prefixed to any relative SearchPath (tests).
	BaseURL string
}

// Compile-time assertion that SandboxesSource satisfies recon.ReconSource.
var _ recon.ReconSource = (*SandboxesSource)(nil)

// Name returns the source identifier "sandboxes"; Sweep also passes it to the
// limiter registry as the key to wait on.
func (s *SandboxesSource) Name() string { return "sandboxes" }

// RateLimit returns a limit of one event every six seconds.
func (s *SandboxesSource) RateLimit() rate.Limit { return rate.Every(6 * time.Second) }

// Burst returns the limiter burst size, which is 1.
func (s *SandboxesSource) Burst() int { return 1 }

// RespectsRobots always reports true.
// NOTE(review): presumably signals that robots.txt should be honored for this
// source — confirm against the recon.ReconSource contract.
func (s *SandboxesSource) RespectsRobots() bool { return true }

// Enabled always reports true; the supplied config is ignored.
func (s *SandboxesSource) Enabled(_ recon.Config) bool { return true }

// Sweep iterates each sub-platform across each provider keyword.
Per-platform // errors are logged and swallowed so one broken sub-source does not fail the // overall sweep. Ctx cancellation is honored between every request. func (s *SandboxesSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { plats := s.Platforms if plats == nil { plats = defaultPlatforms } client := s.Client if client == nil { client = NewClient() } queries := BuildQueries(s.Registry, "sandboxes") if len(queries) == 0 { return nil } for _, p := range plats { if err := ctx.Err(); err != nil { return err } for _, q := range queries { if err := ctx.Err(); err != nil { return err } if s.Limiters != nil { if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { return err } } if err := s.sweepPlatform(ctx, client, p, q, out); err != nil { if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { return err } log.Printf("sandboxes: platform %q failed (skipping): %v", p.Name, err) // Move to next platform — no point retrying more queries on a dead endpoint. break } } } return nil } // sweepPlatform performs one search request for one sub-platform and emits // matching Findings to out. 
func (s *SandboxesSource) sweepPlatform( ctx context.Context, client *Client, p subPlatform, query string, out chan<- recon.Finding, ) error { rawURL := fmt.Sprintf(p.SearchPath, url.QueryEscape(query)) if s.BaseURL != "" && strings.HasPrefix(rawURL, "/") { rawURL = s.BaseURL + rawURL } req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil) if err != nil { return fmt.Errorf("build req: %w", err) } resp, err := client.Do(ctx, req) if err != nil { return fmt.Errorf("fetch: %w", err) } defer resp.Body.Close() var sources []string if p.IsJSON { sources, err = extractJSONURLs(resp.Body, p.JSONItemsKey, p.JSONURLKey) if err != nil { return fmt.Errorf("parse json: %w", err) } } else { re, err := regexp.Compile(p.ResultLinkRegex) if err != nil { return fmt.Errorf("bad regex: %w", err) } hrefs, err := extractAnchorHrefs(resp.Body, re) if err != nil { return fmt.Errorf("parse html: %w", err) } // Absolute-ize hrefs using request URL's scheme+host. scheme := req.URL.Scheme host := req.URL.Host for _, h := range hrefs { sources = append(sources, fmt.Sprintf("%s://%s%s", scheme, host, h)) } } for _, src := range sources { if err := ctx.Err(); err != nil { return err } out <- recon.Finding{ Source: src, SourceType: "recon:sandboxes", KeyMasked: "platform=" + p.Name, Confidence: "low", DetectedAt: time.Now(), } } return nil } // extractJSONURLs decodes a response body of the shape // `{ "": [ { "": "https://..." }, ... ] }` and returns the // list of URL strings. Missing keys return an empty slice, not an error. 
//
// At most 1 MiB of body is read. Entries whose urlKey is absent, is not a
// JSON string, or is the empty string are skipped. A body that is not a JSON
// object, or an itemsKey value that is not an array of objects, yields an
// error.
func extractJSONURLs(body io.Reader, itemsKey, urlKey string) ([]string, error) {
	const maxBody = 1 << 20 // 1 MiB cap

	payload, err := io.ReadAll(io.LimitReader(body, maxBody))
	if err != nil {
		return nil, err
	}

	// Decode lazily: only the items array and the URL fields are inspected.
	var top map[string]json.RawMessage
	if err = json.Unmarshal(payload, &top); err != nil {
		return nil, err
	}
	rawItems, present := top[itemsKey]
	if !present {
		return nil, nil
	}

	var entries []map[string]json.RawMessage
	if err = json.Unmarshal(rawItems, &entries); err != nil {
		return nil, err
	}

	urls := make([]string, 0, len(entries))
	for i := range entries {
		rawLink, found := entries[i][urlKey]
		if !found {
			continue
		}
		var link string
		// Non-string values fail to decode and are dropped silently.
		if json.Unmarshal(rawLink, &link) == nil && link != "" {
			urls = append(urls, link)
		}
	}
	return urls, nil
}