keyhunter/pkg/recon/sources/duckduckgo.go

package sources

import (
	"context"
	"fmt"
	"net/http"
	"net/url"
	"regexp"
	"time"

	"golang.org/x/time/rate"

	"github.com/salvacybersec/keyhunter/pkg/providers"
	"github.com/salvacybersec/keyhunter/pkg/recon"
)

// DuckDuckGoSource implements recon.ReconSource by scraping DuckDuckGo's HTML
// search endpoint. No API key is required -- this source is always enabled.
//
// It operates conservatively (2s per request) and declares RespectsRobots=true.
type DuckDuckGoSource struct {
	BaseURL  string
	Registry *providers.Registry
	Limiters *recon.LimiterRegistry
	client   *Client
}

// Compile-time assertion.
var _ recon.ReconSource = (*DuckDuckGoSource)(nil)

// ddgResultRE matches DuckDuckGo HTML result links. The HTML search page uses
// <a class="result__a" href="..."> anchors for organic results.
var ddgResultRE = regexp.MustCompile(`^https?://`)

// NewDuckDuckGoSource constructs a DuckDuckGoSource with the shared retry client.
func NewDuckDuckGoSource(reg *providers.Registry, lim *recon.LimiterRegistry) *DuckDuckGoSource {
	return &DuckDuckGoSource{
		BaseURL:  "https://html.duckduckgo.com",
		Registry: reg,
		Limiters: lim,
		client:   NewClient(),
	}
}

func (s *DuckDuckGoSource) Name() string          { return "duckduckgo" }
func (s *DuckDuckGoSource) RateLimit() rate.Limit { return rate.Every(2 * time.Second) }
func (s *DuckDuckGoSource) Burst() int            { return 1 }
func (s *DuckDuckGoSource) RespectsRobots() bool  { return true }

// Enabled always returns true -- DuckDuckGo HTML scraping requires no credentials.
func (s *DuckDuckGoSource) Enabled(_ recon.Config) bool { return true }

// Sweep iterates provider keywords, scrapes DuckDuckGo HTML search, and emits
// a Finding per result link.
func (s *DuckDuckGoSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
	base := s.BaseURL
	if base == "" {
		base = "https://html.duckduckgo.com"
	}
	client := s.client
	if client == nil {
		client = NewClient()
	}

	queries := BuildQueries(s.Registry, "duckduckgo")
	if len(queries) == 0 {
		return nil
	}

	for _, q := range queries {
		if err := ctx.Err(); err != nil {
			return err
		}
		if s.Limiters != nil {
			if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
				return err
			}
		}

		searchURL := fmt.Sprintf("%s/html/?q=%s", base, url.QueryEscape(q))
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
		if err != nil {
			return fmt.Errorf("duckduckgo: build req: %w", err)
		}
		req.Header.Set("User-Agent", "keyhunter-recon")

		resp, err := client.Do(ctx, req)
		if err != nil {
			// Transient failures: continue to next query.
			continue
		}
		links, parseErr := extractAnchorHrefs(resp.Body, ddgResultRE)
		_ = resp.Body.Close()
		if parseErr != nil {
			continue
		}

		for _, href := range links {
			if err := ctx.Err(); err != nil {
				return err
			}
			f := recon.Finding{
				Source:     href,
				SourceType: "recon:duckduckgo",
				Confidence: "low",
				DetectedAt: time.Now(),
			}
			select {
			case out <- f:
			case <-ctx.Done():
				return ctx.Err()
			}
		}
	}
	return nil
}