feat(14-02): add WaybackMachine + CommonCrawl recon sources
- WaybackMachineSource queries CDX API for historical snapshots
- CommonCrawlSource queries CC Index API for matching pages
- Both credentialless, rate-limited at 1 req/5s, RespectsRobots=true
- RegisterAll extended to 42 sources (40 Phase 10-13 + 2 Phase 14)
- Full httptest-based test coverage for both sources
This commit is contained in:
138
pkg/recon/sources/commoncrawl.go
Normal file
138
pkg/recon/sources/commoncrawl.go
Normal file
@@ -0,0 +1,138 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"time"
|
||||
|
||||
"golang.org/x/time/rate"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
// CommonCrawlSource implements recon.ReconSource against the CommonCrawl
// Index Server API. It queries index.commoncrawl.org for pages matching
// provider keywords in the CC index.
//
// RECON-ARCH-02: Each matching index record yields a Finding pointing at the
// original URL discovered in the crawl. The source is credentialless and
// always enabled.
//
// The zero value is usable: BaseURL, IndexName, and Client all fall back to
// defaults inside Sweep, and a nil Limiters simply disables rate limiting.
type CommonCrawlSource struct {
	// BaseURL defaults to https://index.commoncrawl.org. Tests override with httptest URL.
	BaseURL string
	// IndexName defaults to CC-MAIN-2024-10 (recent crawl). Tests may override.
	IndexName string
	// Registry drives the keyword query list via BuildQueries.
	Registry *providers.Registry
	// Limiters is the shared recon.LimiterRegistry; nil means no rate limiting.
	Limiters *recon.LimiterRegistry
	// Client is the shared retry HTTP wrapper. If nil, a default is used.
	Client *Client
}
|
||||
|
||||
// Compile-time assertion that *CommonCrawlSource satisfies recon.ReconSource;
// a signature drift in any interface method becomes a build error here.
var _ recon.ReconSource = (*CommonCrawlSource)(nil)
|
||||
|
||||
// Name returns the registry identifier under which this source is keyed.
func (s *CommonCrawlSource) Name() string {
	return "commoncrawl"
}
|
||||
// RateLimit caps this source at one request every five seconds, keeping
// load on the public CommonCrawl index polite.
func (s *CommonCrawlSource) RateLimit() rate.Limit {
	return rate.Every(5 * time.Second)
}
|
||||
// Burst permits no bursting beyond the steady one-request rate.
func (s *CommonCrawlSource) Burst() int {
	return 1
}
|
||||
// RespectsRobots reports true, signalling to the recon engine that
// robots.txt exclusions should be honored for this source.
func (s *CommonCrawlSource) RespectsRobots() bool {
	return true
}
|
||||
|
||||
// Enabled always returns true: CommonCrawl index is unauthenticated.
|
||||
func (s *CommonCrawlSource) Enabled(_ recon.Config) bool { return true }
|
||||
|
||||
// Sweep iterates provider keywords, queries the CC index for each, and emits
|
||||
// a Finding for every matched URL. The CC Index API returns NDJSON (one JSON
|
||||
// object per line) with fields like url, timestamp, status, mime, etc.
|
||||
func (s *CommonCrawlSource) Sweep(ctx context.Context, _ string, out chan<- recon.Finding) error {
|
||||
base := s.BaseURL
|
||||
if base == "" {
|
||||
base = "https://index.commoncrawl.org"
|
||||
}
|
||||
idx := s.IndexName
|
||||
if idx == "" {
|
||||
idx = "CC-MAIN-2024-10"
|
||||
}
|
||||
client := s.Client
|
||||
if client == nil {
|
||||
client = NewClient()
|
||||
}
|
||||
|
||||
queries := BuildQueries(s.Registry, "commoncrawl")
|
||||
if len(queries) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
for _, q := range queries {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if s.Limiters != nil {
|
||||
if err := s.Limiters.Wait(ctx, s.Name(), s.RateLimit(), s.Burst(), false); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
// CC Index API: output=json returns NDJSON, limit=50 bounds the response.
|
||||
endpoint := fmt.Sprintf("%s/%s-index?url=*&output=json&limit=50&filter=status:200&query=%s",
|
||||
base, idx, url.QueryEscape(q))
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
|
||||
if err != nil {
|
||||
return fmt.Errorf("commoncrawl: build req: %w", err)
|
||||
}
|
||||
req.Header.Set("Accept", "application/json")
|
||||
|
||||
resp, err := client.Do(ctx, req)
|
||||
if err != nil {
|
||||
// Non-fatal: skip this keyword on transient errors.
|
||||
continue
|
||||
}
|
||||
|
||||
scanner := bufio.NewScanner(resp.Body)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Bytes()
|
||||
if len(line) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
var rec ccIndexRecord
|
||||
if err := json.Unmarshal(line, &rec); err != nil {
|
||||
continue
|
||||
}
|
||||
if rec.URL == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
f := recon.Finding{
|
||||
ProviderName: "",
|
||||
Source: rec.URL,
|
||||
SourceType: "recon:commoncrawl",
|
||||
Confidence: "low",
|
||||
DetectedAt: time.Now(),
|
||||
}
|
||||
select {
|
||||
case out <- f:
|
||||
case <-ctx.Done():
|
||||
_ = resp.Body.Close()
|
||||
return ctx.Err()
|
||||
}
|
||||
}
|
||||
_ = resp.Body.Close()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ccIndexRecord mirrors the subset of fields returned by the CommonCrawl Index
// API that this source consumes. Additional fields (mime, status, digest, etc.)
// are ignored by encoding/json, keeping the decoder tolerant of schema growth.
type ccIndexRecord struct {
	// URL is the original page URL recorded in the crawl index; records with
	// an empty URL are skipped by Sweep.
	URL string `json:"url"`
	// Timestamp is the capture timestamp string (presumably the CDX-style
	// 14-digit YYYYMMDDhhmmss form — confirm against API output).
	Timestamp string `json:"timestamp"`
	// Status is the HTTP status code of the crawled response, as a string.
	Status string `json:"status"`
}
|
||||
Reference in New Issue
Block a user