package sources

import (
	"context"
	"net/http"
	"net/http/httptest"
	"testing"
	"time"

	"github.com/salvacybersec/keyhunter/pkg/providers"
	"github.com/salvacybersec/keyhunter/pkg/recon"
)

// TestCommonCrawl_Name verifies the source reports its canonical identifier.
func TestCommonCrawl_Name(t *testing.T) {
	src := &CommonCrawlSource{}
	if got := src.Name(); got != "commoncrawl" {
		t.Fatalf("expected commoncrawl, got %s", got)
	}
}

// TestCommonCrawl_Enabled verifies the source is usable with an empty config,
// since Common Crawl requires no credentials.
func TestCommonCrawl_Enabled(t *testing.T) {
	src := &CommonCrawlSource{}
	if enabled := src.Enabled(recon.Config{}); !enabled {
		t.Fatal("CommonCrawlSource should always be enabled (credentialless)")
	}
}

// TestCommonCrawl_Sweep runs a sweep against a stubbed index endpoint and
// checks that a finding is emitted and tagged with the expected source type.
func TestCommonCrawl_Sweep(t *testing.T) {
	// Stub server answering every path with a single NDJSON index record
	// (one JSON object per line).
	handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		_, _ = w.Write([]byte(`{"url":"https://example.com/.env","timestamp":"20240101000000","status":"200","filename":"CC-MAIN-2024.warc.gz","length":"1234","offset":"5678"}
`))
	})
	server := httptest.NewServer(handler)
	defer server.Close()

	registry := providers.NewRegistryFromProviders([]providers.Provider{
		{Name: "openai", Keywords: []string{"sk-proj-"}},
	})

	src := &CommonCrawlSource{
		BaseURL:  server.URL,
		Registry: registry,
		Client:   NewClient(),
	}

	// Buffered channel: Sweep runs synchronously and the stub yields at
	// most one record, so results are drained after Sweep returns.
	ch := make(chan recon.Finding, 10)
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	sweepErr := src.Sweep(ctx, "", ch)
	close(ch)
	if sweepErr != nil {
		t.Fatalf("Sweep error: %v", sweepErr)
	}

	var got []recon.Finding
	for f := range ch {
		got = append(got, f)
	}
	if len(got) == 0 {
		t.Fatal("expected at least one finding from Common Crawl index")
	}
	if got[0].SourceType != "recon:commoncrawl" {
		t.Fatalf("expected recon:commoncrawl, got %s", got[0].SourceType)
	}
}