package sources

import (
	"bufio"
	"context"
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
	"time"

	"golang.org/x/time/rate"

	"github.com/salvacybersec/keyhunter/pkg/providers"
	"github.com/salvacybersec/keyhunter/pkg/recon"
)

// CommonCrawlSource implements recon.ReconSource against the CommonCrawl
// Index Server API. It queries index.commoncrawl.org for pages matching
// provider keywords in the CC index.
//
// RECON-ARCH-02: Each matching index record yields a Finding pointing at the
// original URL discovered in the crawl. The source is credentialless and
// always enabled.
type CommonCrawlSource struct {
	// BaseURL is the index server root. When empty, Sweep falls back to
	// https://index.commoncrawl.org. Tests override it with an httptest URL.
	BaseURL string
	// IndexName selects the crawl index to query. When empty, Sweep falls
	// back to CC-MAIN-2024-10. Tests may override.
	IndexName string
	// Registry drives the keyword query list via BuildQueries.
	Registry *providers.Registry
	// Limiters is the shared recon.LimiterRegistry. When nil, Sweep issues
	// requests without waiting on a limiter.
	Limiters *recon.LimiterRegistry
	// Client is the shared retry HTTP wrapper. If nil, Sweep creates one with
	// NewClient.
	Client *Client
}

// Compile-time assertion that CommonCrawlSource satisfies recon.ReconSource.
var _ recon.ReconSource = (*CommonCrawlSource)(nil)

// Name returns the source identifier "commoncrawl". Sweep passes the same
// value to the limiter registry as the limiter key.
func (s *CommonCrawlSource) Name() string { return "commoncrawl" }

// RateLimit returns the request rate for this source: one event every five
// seconds.
func (s *CommonCrawlSource) RateLimit() rate.Limit { return rate.Every(5 * time.Second) }

// Burst returns the limiter burst size, which is 1 for this source.
func (s *CommonCrawlSource) Burst() int { return 1 }

// RespectsRobots always reports true for this source.
// NOTE(review): presumably consumed by the recon engine to decide whether
// robots.txt handling applies — confirm against recon.ReconSource.
func (s *CommonCrawlSource) RespectsRobots() bool { return true }

// Enabled always returns true: CommonCrawl index is unauthenticated, so the
// supplied recon.Config is not consulted.
func (s *CommonCrawlSource) Enabled(_ recon.Config) bool { return true }

// Sweep iterates provider keywords, queries the CC index for each, and emits
// a Finding for every matched URL. The CC Index API returns NDJSON (one JSON
// object per line) with fields like url, timestamp, status, mime, etc.
func (s *CommonCrawlSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error { base := s.BaseURL if base == "" { base = "https://index.commoncrawl.org" } idx := s.IndexName if idx == "" { idx = "CC-MAIN-2024-10" } client := s.Client if client == nil { client = NewClient() } queries := BuildQueries(s.Registry, "commoncrawl") if len(queries) == 0 { return nil } for _, q := range queries { if err := ctx.Err(); err != nil { return err } if s.Limiters != nil { if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil { return err } } // CC Index API: output=json returns NDJSON, limit=50 bounds the response. endpoint := fmt.Sprintf("%s/%s-index?url=*&output=json&limit=50&filter=status:200&query=%s", base, idx, url.QueryEscape(q)) req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) if err != nil { return fmt.Errorf("commoncrawl: build req: %w", err) } req.Header.Set("Accept", "application/json") resp, err := client.Do(ctx, req) if err != nil { // Non-fatal: skip this keyword on transient errors. continue } scanner := bufio.NewScanner(resp.Body) for scanner.Scan() { line := scanner.Bytes() if len(line) == 0 { continue } var rec ccIndexRecord if err := json.Unmarshal(line, &rec); err != nil { continue } if rec.URL == "" { continue } f := recon.Finding{ ProviderName: "", Source: rec.URL, SourceType: "recon:commoncrawl", Confidence: "low", DetectedAt: time.Now(), } select { case out <- f: case <-ctx.Done(): _ = resp.Body.Close() return ctx.Err() } } _ = resp.Body.Close() } return nil } // ccIndexRecord mirrors the subset of fields returned by the CommonCrawl Index // API that this source consumes. Additional fields (mime, status, digest, etc.) // are ignored to keep the decoder tolerant. type ccIndexRecord struct { URL string `json:"url"` Timestamp string `json:"timestamp"` Status string `json:"status"` }