package sources

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"time"

	"golang.org/x/time/rate"

	"github.com/salvacybersec/keyhunter/pkg/providers"
	"github.com/salvacybersec/keyhunter/pkg/recon"
)

// CommonCrawlSource searches the Common Crawl index for web pages that may
// contain leaked API keys. Common Crawl archives petabytes of web content;
// its CDX API allows searching by URL pattern to find pages that historically
// exposed secrets.
type CommonCrawlSource struct {
	// BaseURL is the CDX index endpoint to query. When empty, Sweep falls
	// back to a built-in default index URL.
	BaseURL string
	// Registry is handed to BuildQueries to produce the search terms.
	Registry *providers.Registry
	// Limiters, when non-nil, is waited on before every request.
	Limiters *recon.LimiterRegistry
	// Client performs the HTTP requests. When nil, Sweep uses NewClient().
	Client *Client
}

var _ recon.ReconSource = (*CommonCrawlSource)(nil)

// Name returns the identifier of this source, "commoncrawl".
func (s *CommonCrawlSource) Name() string { return "commoncrawl" }

// RateLimit returns the request rate for this source: one event every five
// seconds.
func (s *CommonCrawlSource) RateLimit() rate.Limit { return rate.Every(5 * time.Second) }

// Burst returns the limiter burst size for this source, which is 1.
func (s *CommonCrawlSource) Burst() int { return 1 }

// RespectsRobots reports true for this source.
func (s *CommonCrawlSource) RespectsRobots() bool { return true }

// Enabled reports true regardless of the supplied configuration.
func (s *CommonCrawlSource) Enabled(_ recon.Config) bool { return true }

// ccIndexResult represents a single Common Crawl CDX index record. Every
// field is decoded from a JSON string.
type ccIndexResult struct {
	URL       string `json:"url"`
	Timestamp string `json:"timestamp"`
	Status    string `json:"status"`
	Filename  string `json:"filename"`
	Length    string `json:"length"`
	Offset    string `json:"offset"`
}

// Sweep queries the CDX index once per query produced by
// BuildQueries(s.Registry, "commoncrawl") and sends one low-confidence
// recon.Finding on out for every index record that carries a URL.
//
// The second parameter is unused. Failures to build a request, perform it, or
// read its body are treated as best-effort: that query is skipped and the
// sweep continues with the next one. Sweep returns a non-nil error only when
// ctx is done or when the rate limiter wait fails.
func (s *CommonCrawlSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
	const (
		defaultIndexURL = "https://index.commoncrawl.org/CC-MAIN-2024-10-index"
		maxBodyBytes    = 128 * 1024
	)

	base := s.BaseURL
	if base == "" {
		base = defaultIndexURL
	}
	client := s.Client
	if client == nil {
		client = NewClient()
	}

	queries := BuildQueries(s.Registry, "commoncrawl")
	if len(queries) == 0 {
		return nil
	}

	for _, q := range queries {
		if err := ctx.Err(); err != nil {
			return err
		}
		if s.Limiters != nil {
			if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
				return err
			}
		}

		// CDX API: search for URLs matching the query. The query is escaped so
		// that characters such as spaces, '&' or '#' cannot corrupt the
		// request URL; the surrounding '*' wildcards stay literal.
		searchURL := fmt.Sprintf("%s?url=*%s*&output=json&limit=10", base, url.QueryEscape(q))
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, searchURL, nil)
		if err != nil {
			continue
		}
		req.Header.Set("Accept", "application/json")

		resp, err := client.Do(ctx, req)
		if err != nil {
			continue
		}
		body, err := io.ReadAll(io.LimitReader(resp.Body, maxBodyBytes))
		// The close error is ignored on purpose: the body has already been
		// consumed (or abandoned) and there is nothing useful to do with it.
		_ = resp.Body.Close()
		if err != nil {
			continue
		}

		// Common Crawl returns NDJSON (newline-delimited JSON). The decoder
		// reads consecutive JSON values, so each line is decoded as its own
		// object. Decoding stops at the first malformed value, which also
		// covers a final record cut off by the size limit above.
		dec := json.NewDecoder(bytes.NewReader(body))
		for dec.More() {
			var rec ccIndexResult
			if err := dec.Decode(&rec); err != nil {
				break
			}
			// A record without a URL gives no location to report.
			if rec.URL == "" {
				continue
			}
			if err := ctx.Err(); err != nil {
				return err
			}

			// Each indexed URL is a potential leak location; emit as finding.
			finding := recon.Finding{
				ProviderName: q,
				Source:       rec.URL,
				SourceType:   "recon:commoncrawl",
				Confidence:   "low",
				DetectedAt:   time.Now(),
			}
			// Never block forever on a consumer that has stopped reading.
			select {
			case out <- finding:
			case <-ctx.Done():
				return ctx.Err()
			}
		}
	}
	return nil
}