keyhunter/pkg/recon/sources/commoncrawl.go

package sources

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"time"

	"golang.org/x/time/rate"

	"github.com/salvacybersec/keyhunter/pkg/providers"
	"github.com/salvacybersec/keyhunter/pkg/recon"
)

// CommonCrawlSource searches the Common Crawl index for web pages that may
// contain leaked API keys. Common Crawl archives petabytes of web content;
// its CDX API allows searching by URL pattern to find pages that historically
// exposed secrets.
type CommonCrawlSource struct {
	BaseURL  string
	Registry *providers.Registry
	Limiters *recon.LimiterRegistry
	Client   *Client
}

var _ recon.ReconSource = (*CommonCrawlSource)(nil)

func (s *CommonCrawlSource) Name() string              { return "commoncrawl" }
func (s *CommonCrawlSource) RateLimit() rate.Limit      { return rate.Every(5 * time.Second) }
func (s *CommonCrawlSource) Burst() int                 { return 1 }
func (s *CommonCrawlSource) RespectsRobots() bool       { return true }
func (s *CommonCrawlSource) Enabled(_ recon.Config) bool { return true }

// ccIndexResult represents a single Common Crawl CDX index record.
type ccIndexResult struct {
	URL       string `json:"url"`
	Timestamp string `json:"timestamp"`
	Status    string `json:"status"`
	Filename  string `json:"filename"`
	Length    string `json:"length"`
	Offset    string `json:"offset"`
}

func (s *CommonCrawlSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
	base := s.BaseURL
	if base == "" {
		base = "https://index.commoncrawl.org/CC-MAIN-2024-10-index"
	}
	client := s.Client
	if client == nil {
		client = NewClient()
	}

	queries := BuildQueries(s.Registry, "commoncrawl")
	if len(queries) == 0 {
		return nil
	}

	for _, q := range queries {
		if err := ctx.Err(); err != nil {
			return err
		}

		if s.Limiters != nil {
			if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
				return err
			}
		}

		// CDX API: search for URLs matching the query.
		searchURL := fmt.Sprintf("%s?url=*%s*&output=json&limit=10", base, q)
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
		if err != nil {
			continue
		}
		req.Header.Set("Accept", "application/json")

		resp, err := client.Do(ctx, req)
		if err != nil {
			continue
		}

		body, err := io.ReadAll(io.LimitReader(resp.Body, 128*1024))
		_ = resp.Body.Close()
		if err != nil {
			continue
		}

		// Common Crawl returns NDJSON (newline-delimited JSON).
		// Parse each line as a separate JSON object.
		var results []ccIndexResult
		dec := json.NewDecoder(bytes.NewReader(body))
		for dec.More() {
			var r ccIndexResult
			if err := dec.Decode(&r); err != nil {
				break
			}
			results = append(results, r)
		}

		for _, r := range results {
			if err := ctx.Err(); err != nil {
				return err
			}

			// Each indexed URL is a potential leak location; emit as finding.
			out <- recon.Finding{
				ProviderName: q,
				Source:       r.URL,
				SourceType:   "recon:commoncrawl",
				Confidence:   "low",
				DetectedAt:   time.Now(),
			}
		}
	}
	return nil
}