Files
keyhunter/pkg/recon/sources/sandboxes.go
salvacybersec ecebffd27d feat(10-07): add SandboxesSource aggregator (codepen/jsfiddle/stackblitz/glitch/observable)
- Single ReconSource umbrella iterating per-platform HTML or JSON search endpoints
- Per-platform failures logged and skipped (log-and-continue); ctx cancel aborts fast
- Sub-platform identifier encoded in Finding.KeyMasked as 'platform=<name>' (pragmatic slot)
- Gitpod intentionally omitted (no public search)
- 5 httptest-backed tests covering HTML+JSON extraction, platform-failure tolerance, ctx cancel
2026-04-06 01:18:15 +03:00

249 lines
7.0 KiB
Go

package sources
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"log"
"net/http"
"net/url"
"regexp"
"strings"
"time"
"golang.org/x/time/rate"
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
// subPlatform describes one sandbox/IDE sub-source aggregated under the
// "sandboxes" umbrella. Each sub-platform is either HTML (ResultLinkRegex
// anchors) or JSON (JSONItemsKey → array of objects with JSONURLKey strings).
//
// SearchPath is a printf format string containing exactly one %s placeholder
// for the URL-escaped query keyword. It may be either:
//   - an absolute URL (e.g. "https://codepen.io/search/pens?q=%s") used in
//     production; or
//   - a relative path (e.g. "/codepen-search?q=%s") used in tests that inject
//     BaseURL pointing at an httptest server.
type subPlatform struct {
// Name identifies the sub-platform. It appears in skip-log messages and is
// encoded into each Finding's KeyMasked as "platform=<Name>".
Name string
// SearchPath is the printf format used to build the search request URL.
SearchPath string
// ResultLinkRegex is compiled on each sweep and handed to the HTML anchor
// extractor. It is only consulted when IsJSON is false.
ResultLinkRegex string
// IsJSON selects the JSON extraction path instead of the HTML one.
IsJSON bool
// JSONItemsKey names the top-level key holding the result array (JSON only).
JSONItemsKey string
// JSONURLKey names the string field holding each result's URL (JSON only).
JSONURLKey string
}
// defaultPlatforms is the sub-platform list used when SandboxesSource.Platforms
// is nil. Entries that leave IsJSON unset are scraped as HTML; the others are
// decoded as JSON.
//
// Gitpod is intentionally omitted: gitpod.io exposes no public search index
// at time of writing (verified 2026-04). When a search endpoint appears, add
// it here — no other code changes required.
var defaultPlatforms = []subPlatform{
	// CodePen: HTML search page.
	{
		Name:            "codepen",
		SearchPath:      "https://codepen.io/search/pens?q=%s",
		ResultLinkRegex: `^/[^/]+/pen/[a-zA-Z0-9]+$`,
	},
	// JSFiddle: JSON search endpoint.
	{
		Name:         "jsfiddle",
		SearchPath:   "https://jsfiddle.net/api/search/?q=%s",
		IsJSON:       true,
		JSONItemsKey: "results",
		JSONURLKey:   "url",
	},
	// StackBlitz: HTML search page.
	{
		Name:            "stackblitz",
		SearchPath:      "https://stackblitz.com/search?q=%s",
		ResultLinkRegex: `^/edit/[a-zA-Z0-9-]+$`,
	},
	// Glitch: JSON search endpoint.
	{
		Name:         "glitch",
		SearchPath:   "https://glitch.com/api/search/projects?q=%s",
		IsJSON:       true,
		JSONItemsKey: "results",
		JSONURLKey:   "url",
	},
	// Observable: HTML search page.
	{
		Name:            "observable",
		SearchPath:      "https://observablehq.com/search?query=%s",
		ResultLinkRegex: `^/@[^/]+/[^/]+$`,
	},
}
// SandboxesSource aggregates several sandbox/IDE platforms into a single
// ReconSource. Each sub-platform is scraped independently; failures in one
// are logged and skipped without aborting the others.
//
// Every emitted Finding carries SourceType="recon:sandboxes" and encodes the
// originating sub-platform in KeyMasked as "platform=<name>" (pragmatic slot
// until engine.Finding exposes a structured Metadata field).
type SandboxesSource struct {
// Platforms is the list to iterate. When nil, defaultPlatforms is used.
Platforms []subPlatform
// Registry is passed to BuildQueries to derive the search keywords.
Registry *providers.Registry
// Limiters, when non-nil, is waited on before every search request.
Limiters *recon.LimiterRegistry
// Client performs the HTTP requests. When nil, NewClient() is used.
Client *Client
// BaseURL, when non-empty, is prefixed to any relative SearchPath (tests).
BaseURL string
}
// Compile-time assertion that *SandboxesSource satisfies recon.ReconSource;
// the build fails here if a required method is missing or mistyped.
var _ recon.ReconSource = (*SandboxesSource)(nil)
// Name returns the identifier of this source, "sandboxes".
func (s *SandboxesSource) Name() string {
	return "sandboxes"
}
// RateLimit returns the request rate for this source: one event every six
// seconds.
func (s *SandboxesSource) RateLimit() rate.Limit {
	const interval = 6 * time.Second
	return rate.Every(interval)
}
// Burst returns the limiter burst size for this source, which is 1.
func (s *SandboxesSource) Burst() int {
	return 1
}
// RespectsRobots reports true for this source.
func (s *SandboxesSource) RespectsRobots() bool {
	return true
}
// Enabled reports true unconditionally; the supplied configuration is not
// consulted.
func (s *SandboxesSource) Enabled(_ recon.Config) bool {
	return true
}
// Sweep runs every keyword produced by BuildQueries against every
// sub-platform, emitting Findings on out. The second parameter is ignored.
//
// A failure on one sub-platform is logged and the remaining keywords for that
// platform are skipped, so a single broken sub-source never fails the sweep.
// Context cancellation is checked before each platform and each request; a
// cancellation/deadline error or a limiter error is returned immediately.
// Sweep returns nil when there are no keywords or when all platforms have
// been visited.
func (s *SandboxesSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
	platforms := s.Platforms
	if platforms == nil {
		platforms = defaultPlatforms
	}

	httpClient := s.Client
	if httpClient == nil {
		httpClient = NewClient()
	}

	keywords := BuildQueries(s.Registry, "sandboxes")
	if len(keywords) == 0 {
		return nil
	}

platformLoop:
	for _, platform := range platforms {
		if cancelErr := ctx.Err(); cancelErr != nil {
			return cancelErr
		}

		for _, keyword := range keywords {
			if cancelErr := ctx.Err(); cancelErr != nil {
				return cancelErr
			}

			if s.Limiters != nil {
				waitErr := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false)
				if waitErr != nil {
					return waitErr
				}
			}

			sweepErr := s.sweepPlatform(ctx, httpClient, platform, keyword, out)
			switch {
			case sweepErr == nil:
				continue
			case errors.Is(sweepErr, context.Canceled), errors.Is(sweepErr, context.DeadlineExceeded):
				return sweepErr
			}

			log.Printf("sandboxes: platform %q failed (skipping): %v", platform.Name, sweepErr)
			// A dead endpoint will not recover for the next keyword; go
			// straight to the next platform.
			continue platformLoop
		}
	}
	return nil
}
// sweepPlatform performs one search request for one sub-platform and emits
// one Finding per extracted result URL to out.
//
// The request URL is p.SearchPath formatted with the URL-escaped query; when
// s.BaseURL is set and the formatted URL is relative (starts with "/"),
// BaseURL is prefixed. JSON platforms are decoded with extractJSONURLs; HTML
// platforms have their anchor hrefs filtered by p.ResultLinkRegex and made
// absolute using the request's scheme and host.
//
// It returns a wrapped error when building the request, fetching, compiling
// the regex, or parsing the response fails, and the context's error when ctx
// is done while findings are being emitted.
func (s *SandboxesSource) sweepPlatform(
	ctx context.Context,
	client *Client,
	p subPlatform,
	query string,
	out chan<- recon.Finding,
) error {
	rawURL := fmt.Sprintf(p.SearchPath, url.QueryEscape(query))
	if s.BaseURL != "" && strings.HasPrefix(rawURL, "/") {
		rawURL = s.BaseURL + rawURL
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
	if err != nil {
		return fmt.Errorf("build req: %w", err)
	}
	resp, err := client.Do(ctx, req)
	if err != nil {
		return fmt.Errorf("fetch: %w", err)
	}
	defer resp.Body.Close()

	var sources []string
	if p.IsJSON {
		sources, err = extractJSONURLs(resp.Body, p.JSONItemsKey, p.JSONURLKey)
		if err != nil {
			return fmt.Errorf("parse json: %w", err)
		}
	} else {
		re, err := regexp.Compile(p.ResultLinkRegex)
		if err != nil {
			return fmt.Errorf("bad regex: %w", err)
		}
		hrefs, err := extractAnchorHrefs(resp.Body, re)
		if err != nil {
			return fmt.Errorf("parse html: %w", err)
		}
		// Absolute-ize hrefs using request URL's scheme+host.
		scheme := req.URL.Scheme
		host := req.URL.Host
		for _, h := range hrefs {
			sources = append(sources, fmt.Sprintf("%s://%s%s", scheme, host, h))
		}
	}

	for _, src := range sources {
		// Never emit once the context is already done.
		if err := ctx.Err(); err != nil {
			return err
		}
		finding := recon.Finding{
			Source:     src,
			SourceType: "recon:sandboxes",
			KeyMasked:  "platform=" + p.Name,
			Confidence: "low",
			DetectedAt: time.Now(),
		}
		// Send with a cancellation escape hatch: a bare send would block
		// forever if the consumer stopped reading after ctx was cancelled.
		select {
		case out <- finding:
		case <-ctx.Done():
			return ctx.Err()
		}
	}
	return nil
}
// extractJSONURLs reads at most 1 MiB from body and decodes a document of the
// shape `{ "<itemsKey>": [ { "<urlKey>": "https://..." }, ... ] }`, returning
// the non-empty URL strings in document order.
//
// A document without itemsKey yields (nil, nil). Array entries that lack
// urlKey, or whose urlKey value is not a JSON string, are skipped. An error is
// returned when the body cannot be read, the document is not a JSON object, or
// the itemsKey value is not an array of objects.
func extractJSONURLs(body io.Reader, itemsKey, urlKey string) ([]string, error) {
	const maxBody = 1 << 20 // 1 MiB cap

	payload, err := io.ReadAll(io.LimitReader(body, maxBody))
	if err != nil {
		return nil, err
	}

	var doc map[string]json.RawMessage
	if err = json.Unmarshal(payload, &doc); err != nil {
		return nil, err
	}

	rawItems, found := doc[itemsKey]
	if !found {
		return nil, nil
	}

	var entries []map[string]json.RawMessage
	if err = json.Unmarshal(rawItems, &entries); err != nil {
		return nil, err
	}

	urls := make([]string, 0, len(entries))
	for i := range entries {
		field, present := entries[i][urlKey]
		if !present {
			continue
		}
		var link string
		// Non-string values fail to decode and are dropped, as are empty strings.
		if json.Unmarshal(field, &link) == nil && link != "" {
			urls = append(urls, link)
		}
	}
	return urls, nil
}