feat(10-07): add Replit and CodeSandbox scraping sources
- ReplitSource scrapes /search HTML extracting /@user/repl anchors
- CodeSandboxSource scrapes /search HTML extracting /s/slug anchors
- Both use golang.org/x/net/html parser, 10 req/min rate, RespectsRobots=true
- 10 httptest-backed tests covering extraction, ctx cancel, rate/name assertions
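Only replit.go appears in the diff below; codesandbox.go is not shown. As a one-line sketch of its anchor filter, the regexp below is an assumption inferred from the "/s/slug" wording above, by analogy with replitLinkRE in the file that follows (codesandboxLinkRE is a hypothetical name, not confirmed by this commit):

	// Hypothetical: CodeSandbox result links assumed to look like /s/<slug>
	// with no further path segments, mirroring replitLinkRE below.
	var codesandboxLinkRE = regexp.MustCompile(`^/s/[^/]+$`)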
pkg/recon/sources/replit.go (new file, +141 lines)
@@ -0,0 +1,141 @@
package sources

import (
	"context"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"regexp"
	"time"

	"golang.org/x/net/html"
	"golang.org/x/time/rate"

	"github.com/salvacybersec/keyhunter/pkg/providers"
	"github.com/salvacybersec/keyhunter/pkg/recon"
)

// ReplitSource scrapes replit.com search HTML for references to provider
// keywords. Replit has no public search API, so we honor robots.txt and run
// at a very conservative 10 req/min rate.
//
// Emits one Finding per extracted result link, tagged SourceType=recon:replit.
type ReplitSource struct {
	// BaseURL defaults to https://replit.com. Tests override with an httptest URL.
	BaseURL string
	// Registry drives the keyword query list via BuildQueries.
	Registry *providers.Registry
	// Limiters is the shared recon.LimiterRegistry used to coordinate rate.
	// SweepAll wires this for callers; tests may pass a fresh registry.
	Limiters *recon.LimiterRegistry
	// Client is the shared retry HTTP wrapper. If nil, a default is used.
	Client *Client
}

// replitLinkRE matches /@<user>/<repl-name> result links (no further slashes).
var replitLinkRE = regexp.MustCompile(`^/@[^/]+/[^/]+$`)

// Compile-time assertion that ReplitSource satisfies recon.ReconSource.
var _ recon.ReconSource = (*ReplitSource)(nil)

func (s *ReplitSource) Name() string { return "replit" }
func (s *ReplitSource) RateLimit() rate.Limit { return rate.Every(6 * time.Second) }
func (s *ReplitSource) Burst() int { return 1 }
func (s *ReplitSource) RespectsRobots() bool { return true }

// Enabled always returns true: Replit scraping requires no credentials.
func (s *ReplitSource) Enabled(_ recon.Config) bool { return true }

// Sweep iterates each provider keyword, performs a Replit search query, and
// extracts result anchors from the returned HTML. The engine coordinates
// robots.txt at a higher level; per-request rate limiting is delegated to
// Limiters when available so individual sweeps play nicely with SweepAll.
func (s *ReplitSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
	base := s.BaseURL
	if base == "" {
		base = "https://replit.com"
	}
	client := s.Client
	if client == nil {
		client = NewClient()
	}

	queries := BuildQueries(s.Registry, "replit")
	if len(queries) == 0 {
		// No registry → emit nothing, not an error.
		return nil
	}

	for _, q := range queries {
		if err := ctx.Err(); err != nil {
			return err
		}

		if s.Limiters != nil {
			if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
				return err
			}
		}

		searchURL := fmt.Sprintf("%s/search?q=%s&type=repls", base, url.QueryEscape(q))
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
		if err != nil {
			return fmt.Errorf("replit: build req: %w", err)
		}
		resp, err := client.Do(ctx, req)
		if err != nil {
			return fmt.Errorf("replit: fetch: %w", err)
		}
		links, err := extractAnchorHrefs(resp.Body, replitLinkRE)
		_ = resp.Body.Close()
		if err != nil {
			return fmt.Errorf("replit: parse html: %w", err)
		}

		for _, href := range links {
			if err := ctx.Err(); err != nil {
				return err
			}
			absURL := base + href
			out <- recon.Finding{
				ProviderName: "",
				Source:       absURL,
				SourceType:   "recon:replit",
				Confidence:   "low",
				DetectedAt:   time.Now(),
			}
		}
	}
	return nil
}

// extractAnchorHrefs walks parsed HTML and returns every <a href> attribute
// value that matches the given regexp. Results are deduplicated, keeping the
// first occurrence in document order.
func extractAnchorHrefs(body io.Reader, re *regexp.Regexp) ([]string, error) {
	doc, err := html.Parse(body)
	if err != nil {
		return nil, err
	}
	var out []string
	seen := make(map[string]struct{})
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, a := range n.Attr {
				if a.Key == "href" && re.MatchString(a.Val) {
					if _, ok := seen[a.Val]; !ok {
						seen[a.Val] = struct{}{}
						out = append(out, a.Val)
					}
					break
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)
	return out, nil
}
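The httptest-backed tests referenced in the commit message are not part of this diff. Below is a minimal sketch of the extraction case only: it exercises extractAnchorHrefs against inline HTML (the test name and fixture markup are invented here; the Sweep, ctx-cancel, and rate/name cases would additionally need the repo's Registry and BuildQueries wiring, which is not shown):

package sources

import (
	"strings"
	"testing"
)

// TestExtractAnchorHrefs_ReplitLinks checks matching, filtering of non-result
// links, and first-seen deduplication against a small inline page.
func TestExtractAnchorHrefs_ReplitLinks(t *testing.T) {
	page := `<html><body>
		<a href="/@alice/leaky-repl">result</a>
		<a href="/@alice/leaky-repl">duplicate</a>
		<a href="/@bob/demo/extra">too deep</a>
		<a href="/login">nav</a>
	</body></html>`

	links, err := extractAnchorHrefs(strings.NewReader(page), replitLinkRE)
	if err != nil {
		t.Fatalf("parse: %v", err)
	}
	if len(links) != 1 || links[0] != "/@alice/leaky-repl" {
		t.Fatalf("got %v, want [/@alice/leaky-repl]", links)
	}
}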