package sources

import (
	"context"
	"encoding/json"
	"errors"
	"net/http"
	"net/http/httptest"
	"sync/atomic"
	"testing"
	"time"

	"github.com/salvacybersec/keyhunter/pkg/recon"
)

// commonCrawlStubHandler returns an HTTP handler used as the server behind
// CommonCrawlSource.BaseURL in tests. For every request it atomically
// increments *calls, reports a test error if the "query" URL parameter is
// empty, and responds with two ccIndexRecord values, one JSON object per line.
func commonCrawlStubHandler(t *testing.T, calls *int32) http.HandlerFunc {
	t.Helper()
	return func(w http.ResponseWriter, r *http.Request) {
		atomic.AddInt32(calls, 1)
		if r.URL.Query().Get("query") == "" {
			t.Errorf("missing query param")
		}
		w.Header().Set("Content-Type", "application/json")
		// NDJSON: one JSON object per line
		enc := json.NewEncoder(w)
		// Encode errors are deliberately ignored: this is a best-effort stub.
		_ = enc.Encode(ccIndexRecord{URL: "https://example.com/api/config", Timestamp: "20240301120000", Status: "200"})
		_ = enc.Encode(ccIndexRecord{URL: "https://example.com/env.js", Timestamp: "20240301130000", Status: "200"})
	}
}

// TestCommonCrawl_SweepEmitsFindings runs Sweep against the stub server and
// checks that four findings are emitted, that each carries the
// "recon:commoncrawl" source type, and that the server saw exactly two
// requests.
func TestCommonCrawl_SweepEmitsFindings(t *testing.T) {
	reg := syntheticRegistry()
	lim := recon.NewLimiterRegistry()
	_ = lim.For("commoncrawl", 1000, 100)

	var calls int32
	srv := httptest.NewServer(commonCrawlStubHandler(t, &calls))
	defer srv.Close()

	src := &CommonCrawlSource{
		BaseURL:   srv.URL,
		IndexName: "CC-MAIN-2024-10",
		Registry:  reg,
		Limiters:  lim,
		Client:    NewClient(),
	}

	out := make(chan recon.Finding, 32)
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// done is buffered so the goroutine can hand over Sweep's result without
	// waiting for a receiver; closing out afterwards ends the range loop below.
	done := make(chan error, 1)
	go func() {
		done <- src.Sweep(ctx, "", out)
		close(out)
	}()

	var findings []recon.Finding
	for f := range out {
		findings = append(findings, f)
	}
	if err := <-done; err != nil {
		t.Fatalf("Sweep error: %v", err)
	}

	// 2 keywords * 2 results = 4 findings
	if len(findings) != 4 {
		t.Fatalf("expected 4 findings, got %d", len(findings))
	}
	for _, f := range findings {
		if f.SourceType != "recon:commoncrawl" {
			t.Errorf("SourceType=%q want recon:commoncrawl", f.SourceType)
		}
	}
	if got := atomic.LoadInt32(&calls); got != 2 {
		t.Errorf("expected 2 server calls, got %d", got)
	}
}

// TestCommonCrawl_FindingURLs checks that every finding emitted by Sweep
// carries the URL of the index record served by the stub as its Source. The
// test fails if no findings are emitted at all, so the per-finding check can
// never pass vacuously.
func TestCommonCrawl_FindingURLs(t *testing.T) {
	reg := syntheticRegistry()
	lim := recon.NewLimiterRegistry()
	_ = lim.For("commoncrawl", 1000, 100)

	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		enc := json.NewEncoder(w)
		// Encode errors are deliberately ignored: this is a best-effort stub.
		_ = enc.Encode(ccIndexRecord{URL: "https://target.com/leak.js", Timestamp: "20240101000000", Status: "200"})
	}))
	defer srv.Close()

	src := &CommonCrawlSource{
		BaseURL:   srv.URL,
		IndexName: "CC-MAIN-2024-10",
		Registry:  reg,
		Limiters:  lim,
		Client:    NewClient(),
	}

	out := make(chan recon.Finding, 32)
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// done is buffered so the goroutine can hand over Sweep's result without
	// waiting for a receiver; closing out afterwards ends the range loop below.
	done := make(chan error, 1)
	go func() {
		done <- src.Sweep(ctx, "", out)
		close(out)
	}()

	var findings []recon.Finding
	for f := range out {
		findings = append(findings, f)
	}
	if err := <-done; err != nil {
		t.Fatalf("Sweep error: %v", err)
	}

	// Without this guard the loop below would assert nothing when Sweep
	// emits no findings, and the test would pass without checking any URL.
	if len(findings) == 0 {
		t.Fatal("expected at least one finding, got none")
	}
	for _, f := range findings {
		if f.Source != "https://target.com/leak.js" {
			t.Errorf("Source=%q want https://target.com/leak.js", f.Source)
		}
	}
}

// TestCommonCrawl_EnabledAlwaysTrue checks that a zero-value CommonCrawlSource
// reports Enabled for an empty recon.Config.
func TestCommonCrawl_EnabledAlwaysTrue(t *testing.T) {
	s := &CommonCrawlSource{}
	if !s.Enabled(recon.Config{}) {
		t.Fatal("expected Enabled=true")
	}
}

// TestCommonCrawl_NameAndRate checks the static metadata of a zero-value
// CommonCrawlSource: its name, its burst size, and that it respects robots.
func TestCommonCrawl_NameAndRate(t *testing.T) {
	s := &CommonCrawlSource{}
	if s.Name() != "commoncrawl" {
		t.Errorf("unexpected name: %s", s.Name())
	}
	if s.Burst() != 1 {
		t.Errorf("burst: %d", s.Burst())
	}
	if !s.RespectsRobots() {
		t.Error("expected RespectsRobots=true")
	}
}

// TestCommonCrawl_CtxCancelled checks that Sweep returns an error matching
// context.Canceled when it is called with an already-cancelled context.
func TestCommonCrawl_CtxCancelled(t *testing.T) {
	reg := syntheticRegistry()
	lim := recon.NewLimiterRegistry()
	_ = lim.For("commoncrawl", 1000, 100)

	// No test server is started for this BaseURL; the context is cancelled
	// before Sweep is called.
	src := &CommonCrawlSource{
		BaseURL:   "http://127.0.0.1:1",
		IndexName: "CC-MAIN-2024-10",
		Registry:  reg,
		Limiters:  lim,
		Client:    NewClient(),
	}

	ctx, cancel := context.WithCancel(context.Background())
	cancel()

	out := make(chan recon.Finding, 1)
	err := src.Sweep(ctx, "", out)
	if !errors.Is(err, context.Canceled) {
		t.Fatalf("expected context.Canceled, got %v", err)
	}
}

// TestCommonCrawl_NilRegistryNoError checks that Sweep returns nil when the
// source has no Registry configured.
func TestCommonCrawl_NilRegistryNoError(t *testing.T) {
	src := &CommonCrawlSource{Client: NewClient()}
	out := make(chan recon.Finding, 1)
	if err := src.Sweep(context.Background(), "", out); err != nil {
		t.Fatalf("expected nil, got %v", err)
	}
}