Files
keyhunter/pkg/recon/integration_test.go
salvacybersec a754ff7546 test(09-06): add recon pipeline integration test
- Exercises Engine + LimiterRegistry + Stealth + Dedup end-to-end
- testSource emits 5 findings with one duplicate pair (Dedup -> 4)
- TestRobotsOnlyWhenRespectsRobots asserts robots gating via httptest
- Covers RECON-INFRA-05/06/07/08
2026-04-06 00:51:08 +03:00

132 lines
5.3 KiB
Go

package recon
import (
"context"
"net/http"
"net/http/httptest"
"testing"
"time"
"github.com/stretchr/testify/require"
"golang.org/x/time/rate"
)
// testSource is an in-test ReconSource that emits a deterministic mix of
// unique and duplicate findings so we can exercise Engine + Dedup end-to-end.
type testSource struct{}

// Name identifies the source in Engine listings and limiter keys.
func (testSource) Name() string { return "test" }

// RateLimit is generous (100 rps) so the integration test never stalls on it.
func (testSource) RateLimit() rate.Limit { return rate.Limit(100) }

// Burst allows up to 10 immediate tokens from the limiter.
func (testSource) Burst() int { return 10 }

// RespectsRobots is false: this models an API-style source that skips robots.txt.
func (testSource) RespectsRobots() bool { return false }

// Enabled is unconditionally true so SweepAll always fans out to this source.
func (testSource) Enabled(_ Config) bool { return true }
// Sweep emits 5 findings total, of which 2 are exact duplicates of each
// other (same ProviderName + KeyMasked + Source). After Dedup, 4 unique
// findings should remain (one duplicate pair collapses to a single entry).
func (testSource) Sweep(ctx context.Context, _ string, out chan<- Finding) error {
	ts := time.Now()
	// mk builds a Finding with the fields shared by every emitted entry.
	mk := func(provider, masked, src string) Finding {
		return Finding{ProviderName: provider, KeyMasked: masked, Source: src, SourceType: "recon:test", DetectedAt: ts}
	}
	findings := []Finding{
		mk("openai", "sk-aaaa...1111", "https://test.invalid/a"),
		mk("anthropic", "sk-ant-b...2222", "https://test.invalid/b"),
		mk("openai", "sk-cccc...3333", "https://test.invalid/c"),
		mk("cohere", "co-dddd...4444", "https://test.invalid/d"),
		// Exact duplicate of the first entry — provider|masked|source all match.
		mk("openai", "sk-aaaa...1111", "https://test.invalid/a"),
	}
	for i := range findings {
		select {
		case <-ctx.Done():
			// Abort promptly if the caller cancels mid-emit.
			return ctx.Err()
		case out <- findings[i]:
		}
	}
	return nil
}
// testWebSource mirrors testSource but advertises RespectsRobots()==true so
// the robots-gated code path can be asserted.
type testWebSource struct{}

// Name identifies the source in Engine listings and limiter keys.
func (testWebSource) Name() string { return "testweb" }

// RateLimit is 50 rps — ample for a test that emits nothing.
func (testWebSource) RateLimit() rate.Limit { return rate.Limit(50) }

// Burst allows up to 5 immediate tokens from the limiter.
func (testWebSource) Burst() int { return 5 }

// RespectsRobots is true: this models a web scraper that must honor robots.txt.
func (testWebSource) RespectsRobots() bool { return true }

// Enabled is unconditionally true so the source is always eligible.
func (testWebSource) Enabled(_ Config) bool { return true }

// Sweep intentionally emits no findings; only the robots gating matters here.
func (testWebSource) Sweep(ctx context.Context, _ string, out chan<- Finding) error {
	return nil
}
// TestReconPipelineIntegration wires Engine + LimiterRegistry + Stealth + Dedup
// together against a synthetic source and asserts the full flow.
//
// Covers:
// - RECON-INFRA-05: LimiterRegistry.Wait with jitter path returns without error
// - RECON-INFRA-06: Stealth=true is threaded through SweepAll and RandomUserAgent works
// - RECON-INFRA-08: Engine parallel fanout produces aggregated findings; Dedup trims them
func TestReconPipelineIntegration(t *testing.T) {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	// Build the engine and register the synthetic source.
	e := NewEngine()
	e.Register(testSource{})
	require.Equal(t, []string{"test"}, e.List())
	// Exercise the limiter jitter path once (RECON-INFRA-05 + 06 partial).
	limiter := NewLimiterRegistry()
	require.NoError(t, limiter.Wait(ctx, "test", rate.Limit(100), 10, true))
	// Stealth header helper must return a UA from the pool (RECON-INFRA-06).
	headers := StealthHeaders()
	require.NotEmpty(t, headers["User-Agent"])
	require.Contains(t, userAgents, headers["User-Agent"])
	// Fan out via Engine (RECON-INFRA-08). Stealth flag is threaded in cfg.
	raw, err := e.SweepAll(ctx, Config{Stealth: true})
	require.NoError(t, err)
	// require.Len is the idiomatic testify length check (flagged by
	// testifylint for Equal(t, n, len(x))) and prints the collection on failure.
	require.Len(t, raw, 5, "testSource must emit exactly 5 raw findings")
	// Every finding should be tagged with a recon: SourceType prefix.
	for _, f := range raw {
		require.Equal(t, "recon:test", f.SourceType)
	}
	// Dedup must collapse the two duplicates down to 4 unique findings.
	deduped := Dedup(raw)
	require.Len(t, deduped, 4, "Dedup must collapse the two exact duplicates")
}
// TestRobotsOnlyWhenRespectsRobots asserts that the RobotsCache code path is
// gated by ReconSource.RespectsRobots() and that RobotsCache.Allowed returns
// true for a permissive robots.txt served from an httptest server
// (RECON-INFRA-07).
func TestRobotsOnlyWhenRespectsRobots(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
_, _ = w.Write([]byte("User-agent: *\nAllow: /\n"))
}))
defer server.Close()
web := testWebSource{}
api := testSource{}
require.True(t, web.RespectsRobots(), "web scrapers must opt into robots")
require.False(t, api.RespectsRobots(), "API sources must skip robots")
rc := NewRobotsCache()
rc.Client = server.Client()
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
// The robots path is only exercised for sources whose RespectsRobots()==true.
// We invoke it directly for the web source to prove it works end-to-end.
allowed, err := rc.Allowed(ctx, server.URL+"/foo")
require.NoError(t, err)
require.True(t, allowed, "permissive robots.txt must allow /foo")
// For the API source we intentionally do NOT call rc.Allowed — mirroring
// the real Engine behavior where RespectsRobots()==false skips the check.
// Trivially satisfied: we simply never invoke the cache here.
}