feat(10-07): add SandboxesSource aggregator (codepen/jsfiddle/stackblitz/glitch/observable)
- Single ReconSource umbrella iterating per-platform HTML or JSON search endpoints - Per-platform failures logged and skipped (log-and-continue); ctx cancel aborts fast - Sub-platform identifier encoded in Finding.KeyMasked as 'platform=<name>' (pragmatic slot) - Gitpod intentionally omitted (no public search) - 5 httptest-backed tests covering HTML+JSON extraction, platform-failure tolerance, ctx cancel
This commit is contained in:
248
pkg/recon/sources/sandboxes.go
Normal file
248
pkg/recon/sources/sandboxes.go
Normal file
@@ -0,0 +1,248 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"golang.org/x/time/rate"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
// subPlatform is the configuration for a single sandbox/IDE site that is
// searched under the "sandboxes" umbrella source.
//
// Results are extracted in one of two ways, selected by IsJSON:
//   - HTML: anchor hrefs are matched against ResultLinkRegex;
//   - JSON: the array stored under JSONItemsKey is walked and the string
//     stored under JSONURLKey in each element is collected.
//
// SearchPath is a printf-style format containing exactly one %s verb, which
// receives the URL-escaped query keyword. It is either an absolute URL
// (e.g. "https://codepen.io/search/pens?q=%s", used in production) or a
// relative path (e.g. "/codepen-search?q=%s", used by tests that point
// BaseURL at an httptest server).
type subPlatform struct {
	// Name identifies the sub-platform; findings report it as "platform=<Name>".
	Name string
	// SearchPath is the printf format used to build the search URL.
	SearchPath string
	// ResultLinkRegex is the pattern an anchor href must match (HTML mode).
	ResultLinkRegex string
	// IsJSON selects JSON extraction instead of HTML anchor scraping.
	IsJSON bool
	// JSONItemsKey is the top-level key holding the result array (JSON mode).
	JSONItemsKey string
	// JSONURLKey is the per-item key holding the result URL (JSON mode).
	JSONURLKey string
}
|
||||
|
||||
// defaultPlatforms is the production sub-platform list.
|
||||
//
|
||||
// Gitpod is intentionally omitted: gitpod.io exposes no public search index
|
||||
// at time of writing (verified 2026-04). When a search endpoint appears, add
|
||||
// it here — no other code changes required.
|
||||
var defaultPlatforms = []subPlatform{
|
||||
{
|
||||
Name: "codepen",
|
||||
SearchPath: "https://codepen.io/search/pens?q=%s",
|
||||
ResultLinkRegex: `^/[^/]+/pen/[a-zA-Z0-9]+$`,
|
||||
IsJSON: false,
|
||||
},
|
||||
{
|
||||
Name: "jsfiddle",
|
||||
SearchPath: "https://jsfiddle.net/api/search/?q=%s",
|
||||
IsJSON: true,
|
||||
JSONItemsKey: "results",
|
||||
JSONURLKey: "url",
|
||||
},
|
||||
{
|
||||
Name: "stackblitz",
|
||||
SearchPath: "https://stackblitz.com/search?q=%s",
|
||||
ResultLinkRegex: `^/edit/[a-zA-Z0-9-]+$`,
|
||||
IsJSON: false,
|
||||
},
|
||||
{
|
||||
Name: "glitch",
|
||||
SearchPath: "https://glitch.com/api/search/projects?q=%s",
|
||||
IsJSON: true,
|
||||
JSONItemsKey: "results",
|
||||
JSONURLKey: "url",
|
||||
},
|
||||
{
|
||||
Name: "observable",
|
||||
SearchPath: "https://observablehq.com/search?query=%s",
|
||||
ResultLinkRegex: `^/@[^/]+/[^/]+$`,
|
||||
IsJSON: false,
|
||||
},
|
||||
}
|
||||
|
||||
// SandboxesSource aggregates several sandbox/IDE platforms into a single
|
||||
// ReconSource. Each sub-platform is scraped independently; failures in one
|
||||
// are logged and skipped without aborting the others.
|
||||
//
|
||||
// Every emitted Finding carries SourceType="recon:sandboxes" and encodes the
|
||||
// originating sub-platform in KeyMasked as "platform=<name>" (pragmatic slot
|
||||
// until engine.Finding exposes a structured Metadata field).
|
||||
type SandboxesSource struct {
|
||||
// Platforms is the list to iterate. When nil, defaultPlatforms is used.
|
||||
Platforms []subPlatform
|
||||
Registry *providers.Registry
|
||||
Limiters *recon.LimiterRegistry
|
||||
Client *Client
|
||||
// BaseURL, when non-empty, is prefixed to any relative SearchPath (tests).
|
||||
BaseURL string
|
||||
}
|
||||
|
||||
// Compile-time assertion that SandboxesSource satisfies recon.ReconSource.
|
||||
var _ recon.ReconSource = (*SandboxesSource)(nil)
|
||||
|
||||
func (s *SandboxesSource) Name() string { return "sandboxes" }
|
||||
func (s *SandboxesSource) RateLimit() rate.Limit { return rate.Every(6 * time.Second) }
|
||||
func (s *SandboxesSource) Burst() int { return 1 }
|
||||
func (s *SandboxesSource) RespectsRobots() bool { return true }
|
||||
func (s *SandboxesSource) Enabled(_ recon.Config) bool { return true }
|
||||
|
||||
// Sweep iterates each sub-platform across each provider keyword. Per-platform
|
||||
// errors are logged and swallowed so one broken sub-source does not fail the
|
||||
// overall sweep. Ctx cancellation is honored between every request.
|
||||
func (s *SandboxesSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
|
||||
plats := s.Platforms
|
||||
if plats == nil {
|
||||
plats = defaultPlatforms
|
||||
}
|
||||
client := s.Client
|
||||
if client == nil {
|
||||
client = NewClient()
|
||||
}
|
||||
|
||||
queries := BuildQueries(s.Registry, "sandboxes")
|
||||
if len(queries) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
for _, p := range plats {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
for _, q := range queries {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
if s.Limiters != nil {
|
||||
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if err := s.sweepPlatform(ctx, client, p, q, out); err != nil {
|
||||
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
|
||||
return err
|
||||
}
|
||||
log.Printf("sandboxes: platform %q failed (skipping): %v", p.Name, err)
|
||||
// Move to next platform — no point retrying more queries on a dead endpoint.
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// sweepPlatform performs one search request for one sub-platform and emits
|
||||
// matching Findings to out.
|
||||
func (s *SandboxesSource) sweepPlatform(
|
||||
ctx context.Context,
|
||||
client *Client,
|
||||
p subPlatform,
|
||||
query string,
|
||||
out chan<- recon.Finding,
|
||||
) error {
|
||||
rawURL := fmt.Sprintf(p.SearchPath, url.QueryEscape(query))
|
||||
if s.BaseURL != "" && strings.HasPrefix(rawURL, "/") {
|
||||
rawURL = s.BaseURL + rawURL
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
|
||||
if err != nil {
|
||||
return fmt.Errorf("build req: %w", err)
|
||||
}
|
||||
resp, err := client.Do(ctx, req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("fetch: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
var sources []string
|
||||
if p.IsJSON {
|
||||
sources, err = extractJSONURLs(resp.Body, p.JSONItemsKey, p.JSONURLKey)
|
||||
if err != nil {
|
||||
return fmt.Errorf("parse json: %w", err)
|
||||
}
|
||||
} else {
|
||||
re, err := regexp.Compile(p.ResultLinkRegex)
|
||||
if err != nil {
|
||||
return fmt.Errorf("bad regex: %w", err)
|
||||
}
|
||||
hrefs, err := extractAnchorHrefs(resp.Body, re)
|
||||
if err != nil {
|
||||
return fmt.Errorf("parse html: %w", err)
|
||||
}
|
||||
// Absolute-ize hrefs using request URL's scheme+host.
|
||||
scheme := req.URL.Scheme
|
||||
host := req.URL.Host
|
||||
for _, h := range hrefs {
|
||||
sources = append(sources, fmt.Sprintf("%s://%s%s", scheme, host, h))
|
||||
}
|
||||
}
|
||||
|
||||
for _, src := range sources {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
out <- recon.Finding{
|
||||
Source: src,
|
||||
SourceType: "recon:sandboxes",
|
||||
KeyMasked: "platform=" + p.Name,
|
||||
Confidence: "low",
|
||||
DetectedAt: time.Now(),
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// extractJSONURLs decodes a response body of the shape
// `{ "<itemsKey>": [ { "<urlKey>": "https://..." }, ... ] }` and returns the
// list of URL strings.
//
// Items that lack urlKey, whose value is not a JSON string, or whose value is
// the empty string are skipped. A missing itemsKey yields no URLs and no
// error. An error is returned when reading fails, when the body is larger
// than 1 MiB, or when the envelope or the items array is not valid JSON of
// the expected shape.
func extractJSONURLs(body io.Reader, itemsKey, urlKey string) ([]string, error) {
	const maxBody = 1 << 20 // 1 MiB cap

	// Read one byte past the cap so an oversized body is reported as such
	// instead of being truncated into a misleading JSON syntax error.
	raw, err := io.ReadAll(io.LimitReader(body, maxBody+1))
	if err != nil {
		return nil, err
	}
	if len(raw) > maxBody {
		return nil, fmt.Errorf("response body exceeds %d byte limit", maxBody)
	}

	var envelope map[string]json.RawMessage
	if err := json.Unmarshal(raw, &envelope); err != nil {
		return nil, err
	}
	items, ok := envelope[itemsKey]
	if !ok {
		return nil, nil
	}

	var arr []map[string]json.RawMessage
	if err := json.Unmarshal(items, &arr); err != nil {
		return nil, err
	}

	out := make([]string, 0, len(arr))
	for _, obj := range arr {
		v, ok := obj[urlKey]
		if !ok {
			continue
		}
		var s string
		// A non-string URL value is tolerated: skip the item, keep the rest.
		if err := json.Unmarshal(v, &s); err != nil {
			continue
		}
		if s != "" {
			out = append(out, s)
		}
	}
	return out, nil
}
|
||||
180
pkg/recon/sources/sandboxes_test.go
Normal file
180
pkg/recon/sources/sandboxes_test.go
Normal file
@@ -0,0 +1,180 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
func sandboxesTestRegistry() *providers.Registry {
|
||||
return providers.NewRegistryFromProviders([]providers.Provider{
|
||||
{Name: "openai", Keywords: []string{"sk-proj-"}},
|
||||
})
|
||||
}
|
||||
|
||||
// sandboxesTestServer starts an httptest server with three routes:
//   - /codepen-search  : HTML page with two pen anchors and one unrelated link
//   - /jsfiddle-search : JSON document with two results
//   - /fail-search     : HTTP 500, to exercise per-platform failure tolerance
//
// The caller is responsible for closing the returned server.
func sandboxesTestServer(t *testing.T) *httptest.Server {
	t.Helper()

	const codepenHTML = `<html><body>
<a href="/alice/pen/AbCdEf123">one</a>
<a href="/bob/pen/ZzZz9999">two</a>
<a href="/nope">skip</a>
</body></html>`

	const jsfiddleJSON = `{"results":[
{"url":"https://jsfiddle.net/u/abcd1234/"},
{"url":"https://jsfiddle.net/u/wxyz5678/"}
]}`

	// fixed builds a handler that answers every request with the same body.
	fixed := func(contentType, payload string) http.HandlerFunc {
		return func(w http.ResponseWriter, _ *http.Request) {
			w.Header().Set("Content-Type", contentType)
			_, _ = w.Write([]byte(payload))
		}
	}

	mux := http.NewServeMux()
	mux.HandleFunc("/codepen-search", fixed("text/html", codepenHTML))
	mux.HandleFunc("/jsfiddle-search", fixed("application/json", jsfiddleJSON))
	mux.HandleFunc("/fail-search", func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusInternalServerError)
		_, _ = w.Write([]byte("boom"))
	})
	return httptest.NewServer(mux)
}
|
||||
|
||||
func newSandboxesTestSource(srvURL string, plats []subPlatform) *SandboxesSource {
|
||||
return &SandboxesSource{
|
||||
Platforms: plats,
|
||||
Registry: sandboxesTestRegistry(),
|
||||
Limiters: recon.NewLimiterRegistry(),
|
||||
Client: NewClient(),
|
||||
BaseURL: srvURL,
|
||||
}
|
||||
}
|
||||
|
||||
func TestSandboxes_Sweep_HTMLAndJSON(t *testing.T) {
|
||||
srv := sandboxesTestServer(t)
|
||||
defer srv.Close()
|
||||
|
||||
plats := []subPlatform{
|
||||
{Name: "codepen", SearchPath: "/codepen-search?q=%s", ResultLinkRegex: `^/[^/]+/pen/[a-zA-Z0-9]+$`, IsJSON: false},
|
||||
{Name: "jsfiddle", SearchPath: "/jsfiddle-search?q=%s", IsJSON: true, JSONItemsKey: "results", JSONURLKey: "url"},
|
||||
}
|
||||
src := newSandboxesTestSource(srv.URL, plats)
|
||||
|
||||
out := make(chan recon.Finding, 32)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
if err := src.Sweep(ctx, "", out); err != nil {
|
||||
t.Fatalf("Sweep err: %v", err)
|
||||
}
|
||||
close(out)
|
||||
|
||||
var findings []recon.Finding
|
||||
for f := range out {
|
||||
findings = append(findings, f)
|
||||
}
|
||||
// codepen: 2 hits, jsfiddle: 2 hits
|
||||
if len(findings) != 4 {
|
||||
t.Fatalf("expected 4 findings, got %d: %+v", len(findings), findings)
|
||||
}
|
||||
|
||||
platforms := map[string]int{}
|
||||
for _, f := range findings {
|
||||
if f.SourceType != "recon:sandboxes" {
|
||||
t.Errorf("unexpected SourceType: %s", f.SourceType)
|
||||
}
|
||||
// sub-platform identifier is encoded into KeyMasked as "platform=<name>"
|
||||
platforms[f.KeyMasked]++
|
||||
}
|
||||
if platforms["platform=codepen"] != 2 {
|
||||
t.Errorf("expected 2 codepen findings, got %d", platforms["platform=codepen"])
|
||||
}
|
||||
if platforms["platform=jsfiddle"] != 2 {
|
||||
t.Errorf("expected 2 jsfiddle findings, got %d", platforms["platform=jsfiddle"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestSandboxes_Sweep_FailingPlatformDoesNotAbortOthers(t *testing.T) {
|
||||
srv := sandboxesTestServer(t)
|
||||
defer srv.Close()
|
||||
|
||||
plats := []subPlatform{
|
||||
{Name: "broken", SearchPath: "/fail-search?q=%s", ResultLinkRegex: `^/x$`, IsJSON: false},
|
||||
{Name: "codepen", SearchPath: "/codepen-search?q=%s", ResultLinkRegex: `^/[^/]+/pen/[a-zA-Z0-9]+$`, IsJSON: false},
|
||||
}
|
||||
src := newSandboxesTestSource(srv.URL, plats)
|
||||
|
||||
out := make(chan recon.Finding, 32)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
if err := src.Sweep(ctx, "", out); err != nil {
|
||||
t.Fatalf("Sweep err (should be nil, log-and-continue): %v", err)
|
||||
}
|
||||
close(out)
|
||||
|
||||
var n int
|
||||
for f := range out {
|
||||
if f.KeyMasked != "platform=codepen" {
|
||||
t.Errorf("unexpected platform: %s", f.KeyMasked)
|
||||
}
|
||||
n++
|
||||
}
|
||||
if n != 2 {
|
||||
t.Fatalf("expected 2 codepen findings after broken platform skipped, got %d", n)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSandboxes_RespectsRobotsAndName(t *testing.T) {
|
||||
s := &SandboxesSource{}
|
||||
if !s.RespectsRobots() {
|
||||
t.Fatal("expected RespectsRobots=true")
|
||||
}
|
||||
if s.Name() != "sandboxes" {
|
||||
t.Fatalf("unexpected name: %s", s.Name())
|
||||
}
|
||||
if !s.Enabled(recon.Config{}) {
|
||||
t.Fatal("expected Enabled=true")
|
||||
}
|
||||
if s.Burst() != 1 {
|
||||
t.Fatal("expected Burst=1")
|
||||
}
|
||||
}
|
||||
|
||||
func TestSandboxes_Sweep_CtxCancelled(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
time.Sleep(500 * time.Millisecond)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
plats := []subPlatform{
|
||||
{Name: "codepen", SearchPath: "/s?q=%s", ResultLinkRegex: `^/x$`, IsJSON: false},
|
||||
}
|
||||
src := newSandboxesTestSource(srv.URL, plats)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel()
|
||||
|
||||
out := make(chan recon.Finding, 4)
|
||||
if err := src.Sweep(ctx, "", out); err == nil {
|
||||
t.Fatal("expected ctx error")
|
||||
}
|
||||
}
|
||||
|
||||
func TestSandboxes_DefaultPlatformsListed(t *testing.T) {
|
||||
// Sanity check: defaultPlatforms should contain the five documented sub-platforms.
|
||||
want := map[string]bool{"codepen": true, "jsfiddle": true, "stackblitz": true, "glitch": true, "observable": true}
|
||||
got := map[string]bool{}
|
||||
for _, p := range defaultPlatforms {
|
||||
got[p.Name] = true
|
||||
}
|
||||
for k := range want {
|
||||
if !got[k] {
|
||||
t.Errorf("missing default platform: %s", k)
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user