package recon

import (
	"context"
	"net/http"
	"net/http/httptest"
	"testing"
	"time"

	"github.com/stretchr/testify/require"
	"golang.org/x/time/rate"
)

// testSource is an in-test ReconSource that emits a deterministic mix of
// unique and duplicate findings so we can exercise Engine + Dedup end-to-end.
type testSource struct{}

func (testSource) Name() string           { return "test" }
func (testSource) RateLimit() rate.Limit  { return rate.Limit(100) }
func (testSource) Burst() int             { return 10 }
func (testSource) RespectsRobots() bool   { return false }
func (testSource) Enabled(_ Config) bool  { return true }

// Sweep emits five findings in total, two of which are exact duplicates of
// each other (identical ProviderName, KeyMasked, and Source). After Dedup,
// four unique findings should remain — the duplicate pair collapses to one.
func (testSource) Sweep(ctx context.Context, _ string, out chan<- Finding) error {
	ts := time.Now()

	// mk builds a finding with the fixed SourceType/DetectedAt shared by
	// every emission, so each entry below only states what varies.
	mk := func(provider, masked, src string) Finding {
		return Finding{
			ProviderName: provider,
			KeyMasked:    masked,
			Source:       src,
			SourceType:   "recon:test",
			DetectedAt:   ts,
		}
	}

	emissions := []Finding{
		mk("openai", "sk-aaaa...1111", "https://test.invalid/a"),
		mk("anthropic", "sk-ant-b...2222", "https://test.invalid/b"),
		mk("openai", "sk-cccc...3333", "https://test.invalid/c"),
		mk("cohere", "co-dddd...4444", "https://test.invalid/d"),
		// Exact duplicate of the first emission — provider|masked|source all match.
		mk("openai", "sk-aaaa...1111", "https://test.invalid/a"),
	}

	for _, finding := range emissions {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case out <- finding:
		}
	}
	return nil
}

// testWebSource mirrors testSource but advertises RespectsRobots()==true so
// the robots-gated code path can be asserted.
type testWebSource struct{} func (testWebSource) Name() string { return "testweb" } func (testWebSource) RateLimit() rate.Limit { return rate.Limit(50) } func (testWebSource) Burst() int { return 5 } func (testWebSource) RespectsRobots() bool { return true } func (testWebSource) Enabled(_ Config) bool { return true } func (testWebSource) Sweep(ctx context.Context, _ string, out chan<- Finding) error { return nil } // TestReconPipelineIntegration wires Engine + LimiterRegistry + Stealth + Dedup // together against a synthetic source and asserts the full flow. // // Covers: // - RECON-INFRA-05: LimiterRegistry.Wait with jitter path returns without error // - RECON-INFRA-06: Stealth=true is threaded through SweepAll and RandomUserAgent works // - RECON-INFRA-08: Engine parallel fanout produces aggregated findings; Dedup trims them func TestReconPipelineIntegration(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() // Build the engine and register the synthetic source. e := NewEngine() e.Register(testSource{}) require.Equal(t, []string{"test"}, e.List()) // Exercise the limiter jitter path once (RECON-INFRA-05 + 06 partial). limiter := NewLimiterRegistry() require.NoError(t, limiter.Wait(ctx, "test", rate.Limit(100), 10, true)) // Stealth header helper must return a UA from the pool (RECON-INFRA-06). headers := StealthHeaders() require.NotEmpty(t, headers["User-Agent"]) require.Contains(t, userAgents, headers["User-Agent"]) // Fan out via Engine (RECON-INFRA-08). Stealth flag is threaded in cfg. raw, err := e.SweepAll(ctx, Config{Stealth: true}) require.NoError(t, err) require.Equal(t, 5, len(raw), "testSource must emit exactly 5 raw findings") // Every finding should be tagged with a recon: SourceType prefix. for _, f := range raw { require.Equal(t, "recon:test", f.SourceType) } // Dedup must collapse the two duplicates down to 4 unique findings. 
deduped := Dedup(raw) require.Equal(t, 4, len(deduped), "Dedup must collapse the two exact duplicates") } // TestRobotsOnlyWhenRespectsRobots asserts that the RobotsCache code path is // gated by ReconSource.RespectsRobots() and that RobotsCache.Allowed returns // true for a permissive robots.txt served from an httptest server // (RECON-INFRA-07). func TestRobotsOnlyWhenRespectsRobots(t *testing.T) { server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusOK) _, _ = w.Write([]byte("User-agent: *\nAllow: /\n")) })) defer server.Close() web := testWebSource{} api := testSource{} require.True(t, web.RespectsRobots(), "web scrapers must opt into robots") require.False(t, api.RespectsRobots(), "API sources must skip robots") rc := NewRobotsCache() rc.Client = server.Client() ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() // The robots path is only exercised for sources whose RespectsRobots()==true. // We invoke it directly for the web source to prove it works end-to-end. allowed, err := rc.Allowed(ctx, server.URL+"/foo") require.NoError(t, err) require.True(t, allowed, "permissive robots.txt must allow /foo") // For the API source we intentionally do NOT call rc.Allowed — mirroring // the real Engine behavior where RespectsRobots()==false skips the check. // Trivially satisfied: we simply never invoke the cache here. }