// keyhunter/pkg/recon/sources/replit.go

package sources

import (
	"context"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"regexp"
	"time"

	"golang.org/x/net/html"
	"golang.org/x/time/rate"

	"github.com/salvacybersec/keyhunter/pkg/providers"
	"github.com/salvacybersec/keyhunter/pkg/recon"
)

// ReplitSource scrapes replit.com search HTML for references to provider
// keywords. Replit has no public search API, so the source declares that it
// respects robots.txt and advertises a very conservative rate of one request
// every six seconds (10 req/min).
//
// Emits one Finding per extracted result link, tagged SourceType=recon:replit.
type ReplitSource struct {
	// BaseURL is the origin that search requests are sent to and that result
	// links are appended to. When empty, Sweep falls back to
	// https://replit.com. Tests override it with an httptest URL.
	BaseURL string
	// Registry drives the keyword query list via BuildQueries. When
	// BuildQueries yields no queries, Sweep emits nothing and returns nil.
	Registry *providers.Registry
	// Limiters is the shared recon.LimiterRegistry used to coordinate rate.
	// SweepAll wires this in for its callers; tests may pass a fresh registry.
	// When nil, Sweep does not wait on any limiter before a request.
	Limiters *recon.LimiterRegistry
	// Client is the shared retry HTTP wrapper. If nil, Sweep obtains a default
	// one from NewClient.
	Client *Client
}

// replitLinkRE matches root-relative result links of the form
// /@<user>/<repl-name>. The pattern is anchored at both ends, so the whole
// href must consist of exactly those two non-empty segments with no further
// slashes.
var replitLinkRE = regexp.MustCompile(`^/@[^/]+/[^/]+$`)

// Compile-time assertion that *ReplitSource satisfies recon.ReconSource; the
// build fails here if a required method is missing or has the wrong signature.
var _ recon.ReconSource = (*ReplitSource)(nil)

// Name returns the identifier of this source, "replit".
func (s *ReplitSource) Name() string {
	return "replit"
}

// RateLimit returns the declared request rate: one request every six seconds,
// i.e. 10 requests per minute.
func (s *ReplitSource) RateLimit() rate.Limit {
	const interval = 6 * time.Second
	return rate.Every(interval)
}

// Burst returns the declared limiter burst size, which is 1.
func (s *ReplitSource) Burst() int {
	return 1
}

// RespectsRobots always reports true: this source declares that robots.txt
// applies to it.
func (s *ReplitSource) RespectsRobots() bool {
	return true
}

// Enabled reports whether the source may run. It is unconditionally true
// because Replit scraping requires no credentials; the configuration argument
// is ignored.
func (s *ReplitSource) Enabled(_ recon.Config) bool {
	return true
}

// Sweep runs one Replit search per provider keyword query and emits a Finding
// on out for every result link extracted from the returned HTML.
//
// The unnamed string parameter is ignored: the query list comes from
// BuildQueries(s.Registry, "replit"). An empty BaseURL falls back to
// https://replit.com and a nil Client falls back to NewClient().
//
// The engine coordinates robots.txt at a higher level; per-request rate
// limiting is delegated to Limiters when available so individual sweeps play
// nicely with SweepAll.
//
// Sweep returns nil when there are no queries or once every query has been
// processed. It returns the context's error as soon as cancellation is
// observed, including while it is blocked sending on out, so a consumer that
// stops reading cannot wedge the sweep. The first request-construction, fetch,
// or HTML-parse failure is returned wrapped with a "replit:" prefix and
// abandons the remaining queries.
func (s *ReplitSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
	base := s.BaseURL
	if base == "" {
		base = "https://replit.com"
	}
	client := s.Client
	if client == nil {
		client = NewClient()
	}

	queries := BuildQueries(s.Registry, "replit")
	if len(queries) == 0 {
		// No registry → emit nothing, not an error.
		return nil
	}

	for _, q := range queries {
		if err := ctx.Err(); err != nil {
			return err
		}

		if s.Limiters != nil {
			if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
				return err
			}
		}

		searchURL := fmt.Sprintf("%s/search?q=%s&type=repls", base, url.QueryEscape(q))
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
		if err != nil {
			return fmt.Errorf("replit: build req: %w", err)
		}
		resp, err := client.Do(ctx, req)
		if err != nil {
			return fmt.Errorf("replit: fetch: %w", err)
		}
		links, err := extractAnchorHrefs(resp.Body, replitLinkRE)
		// The body has been consumed (or parsing already failed); a Close
		// error carries no actionable information here.
		_ = resp.Body.Close()
		if err != nil {
			return fmt.Errorf("replit: parse html: %w", err)
		}

		for _, href := range links {
			// Check first so an already-cancelled context deterministically
			// stops the sweep even when out has room.
			if err := ctx.Err(); err != nil {
				return err
			}
			finding := recon.Finding{
				ProviderName: "",
				Source:       base + href,
				SourceType:   "recon:replit",
				Confidence:   "low",
				DetectedAt:   time.Now(),
			}
			// A bare send would block forever if the consumer went away;
			// racing it against ctx.Done keeps the sweep cancellable.
			select {
			case out <- finding:
			case <-ctx.Done():
				return ctx.Err()
			}
		}
	}
	return nil
}

// extractAnchorHrefs parses body as HTML and collects the href attribute of
// every <a> element whose href value matches re. Values are returned in
// document order and de-duplicated: only the first occurrence of a given href
// is kept. A parse failure is returned as-is with a nil slice.
func extractAnchorHrefs(body io.Reader, re *regexp.Regexp) ([]string, error) {
	root, err := html.Parse(body)
	if err != nil {
		return nil, err
	}

	var hrefs []string
	known := make(map[string]struct{})

	// Explicit-stack pre-order traversal. Children are pushed last-to-first so
	// they are popped first-to-last, which visits nodes in document order.
	pending := []*html.Node{root}
	for len(pending) > 0 {
		node := pending[len(pending)-1]
		pending = pending[:len(pending)-1]

		if node.Type == html.ElementNode && node.Data == "a" {
			for _, attr := range node.Attr {
				if attr.Key != "href" || !re.MatchString(attr.Val) {
					continue
				}
				if _, dup := known[attr.Val]; !dup {
					known[attr.Val] = struct{}{}
					hrefs = append(hrefs, attr.Val)
				}
				// Only the first matching href of an anchor is considered.
				break
			}
		}

		for child := node.LastChild; child != nil; child = child.PrevSibling {
			pending = append(pending, child)
		}
	}
	return hrefs, nil
}