diff --git a/pkg/recon/sources/github_test.go b/pkg/recon/sources/github_test.go new file mode 100644 index 0000000..cbe8ce6 --- /dev/null +++ b/pkg/recon/sources/github_test.go @@ -0,0 +1,227 @@ +package sources + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "net/http" + "net/http/httptest" + "strings" + "sync/atomic" + "testing" + "time" + + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/recon" +) + +// syntheticRegistry builds a two-provider registry for tests. +func syntheticRegistry() *providers.Registry { + return providers.NewRegistryFromProviders([]providers.Provider{ + {Name: "openai", Keywords: []string{"sk-proj-"}}, + {Name: "anthropic", Keywords: []string{"sk-ant-"}}, + }) +} + +// ghStubHandler returns a handler that echoes the query back in two items. +func ghStubHandler(t *testing.T, calls *int32) http.HandlerFunc { + t.Helper() + return func(w http.ResponseWriter, r *http.Request) { + atomic.AddInt32(calls, 1) + if r.URL.Path != "/search/code" { + t.Errorf("unexpected path: %s", r.URL.Path) + } + if got := r.Header.Get("Authorization"); got != "Bearer testtoken" { + t.Errorf("missing bearer token: %q", got) + } + if got := r.Header.Get("Accept"); !strings.Contains(got, "text-match") { + t.Errorf("missing text-match accept header: %q", got) + } + q := r.URL.Query().Get("q") + body := map[string]any{ + "items": []map[string]any{ + { + "html_url": "https://github.com/org/repo/blob/main/a.env#" + q, + "repository": map[string]any{"full_name": "org/repo"}, + "text_matches": []map[string]any{ + {"fragment": "snippet for " + q}, + }, + }, + { + "html_url": "https://github.com/org/repo/blob/main/b.env#" + q, + "repository": map[string]any{"full_name": "org/repo"}, + }, + }, + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(body) + } +} + +func TestGitHubSource_EnabledReflectsToken(t *testing.T) { + reg := syntheticRegistry() + lim := recon.NewLimiterRegistry() + + if s := NewGitHubSource("", reg, lim); s.Enabled(recon.Config{}) { + t.Errorf("expected Enabled=false with empty token") + } + if s := NewGitHubSource("tok", reg, lim); !s.Enabled(recon.Config{}) { + t.Errorf("expected Enabled=true with token") + } +} + +func TestGitHubSource_SweepEmptyTokenReturnsNil(t *testing.T) { + reg := syntheticRegistry() + lim := recon.NewLimiterRegistry() + s := NewGitHubSource("", reg, lim) + + out := make(chan recon.Finding, 10) + if err := s.Sweep(context.Background(), "", out); err != nil { + t.Fatalf("expected nil err, got %v", err) + } + close(out) + if n := countFindings(out); n != 0 { + t.Fatalf("expected 0 findings, got %d", n) + } +} + +func TestGitHubSource_SweepEmitsFindings(t *testing.T) { + reg := syntheticRegistry() + lim := recon.NewLimiterRegistry() + + var calls int32 + srv := httptest.NewServer(ghStubHandler(t, &calls)) + defer srv.Close() + + s := NewGitHubSource("testtoken", reg, lim) + s.BaseURL = srv.URL + // Use a generous limiter so the test doesn't wait seconds for each query. + // Overwrite via the limiter registry by pre-registering at high rate. + _ = lim.For(s.Name(), 1000, 100) + + out := make(chan recon.Finding, 32) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + done := make(chan error, 1) + go func() { done <- s.Sweep(ctx, "", out); close(out) }() + + var findings []recon.Finding + for f := range out { + findings = append(findings, f) + } + if err := <-done; err != nil { + t.Fatalf("Sweep error: %v", err) + } + + // syntheticRegistry has 2 keywords -> 2 queries -> 2 items each = 4 findings. + if len(findings) != 4 { + t.Fatalf("expected 4 findings, got %d", len(findings)) + } + for _, f := range findings { + if f.SourceType != "recon:github" { + t.Errorf("SourceType=%q want recon:github", f.SourceType) + } + if !strings.HasPrefix(f.Source, "https://github.com/org/repo/blob/main/") { + t.Errorf("Source=%q unexpected", f.Source) + } + if f.ProviderName != "openai" && f.ProviderName != "anthropic" { + t.Errorf("ProviderName=%q unexpected", f.ProviderName) + } + } + + // Two queries -> two calls. + if got := atomic.LoadInt32(&calls); got != 2 { + t.Errorf("expected 2 calls, got %d", got) + } +} + +func TestGitHubSource_ProviderNameFromKeyword(t *testing.T) { + reg := syntheticRegistry() + lim := recon.NewLimiterRegistry() + _ = lim.For("github", 1000, 100) + + // Track which query came in so we can assert providerName mapping per-call. + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + q := r.URL.Query().Get("q") + body := fmt.Sprintf(`{"items":[{"html_url":"https://example/%s","repository":{"full_name":"o/r"}}]}`, q) + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(body)) + })) + defer srv.Close() + + s := NewGitHubSource("tok", reg, lim) + s.BaseURL = srv.URL + + out := make(chan recon.Finding, 8) + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + if err := s.Sweep(ctx, "", out); err != nil { + t.Fatalf("Sweep: %v", err) + } + close(out) + + // BuildQueries returns sorted keywords: sk-ant- then sk-proj-. + // So findings should arrive in that order. + var got []string + for f := range out { + got = append(got, f.ProviderName) + } + if len(got) != 2 { + t.Fatalf("expected 2 findings, got %d", len(got)) + } + if got[0] != "anthropic" || got[1] != "openai" { + t.Errorf("expected sorted provider names [anthropic openai], got %v", got) + } +} + +func TestGitHubSource_CtxCancelled(t *testing.T) { + reg := syntheticRegistry() + lim := recon.NewLimiterRegistry() + _ = lim.For("github", 1000, 100) + + s := NewGitHubSource("tok", reg, lim) + s.BaseURL = "http://127.0.0.1:1" // unused, should never be reached + + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + out := make(chan recon.Finding, 1) + err := s.Sweep(ctx, "", out) + if !errors.Is(err, context.Canceled) { + t.Fatalf("expected context.Canceled, got %v", err) + } +} + +func TestGitHubSource_Unauthorized(t *testing.T) { + reg := syntheticRegistry() + lim := recon.NewLimiterRegistry() + _ = lim.For("github", 1000, 100) + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusUnauthorized) + _, _ = w.Write([]byte("bad token")) + })) + defer srv.Close() + + s := NewGitHubSource("tok", reg, lim) + s.BaseURL = srv.URL + + out := make(chan recon.Finding, 1) + err := s.Sweep(context.Background(), "", out) + if !errors.Is(err, ErrUnauthorized) { + t.Fatalf("expected ErrUnauthorized, got %v", err) + } +} + +// Compile-time assertion that GitHubSource satisfies recon.ReconSource. +var _ recon.ReconSource = (*GitHubSource)(nil) + +func countFindings(ch <-chan recon.Finding) int { + n := 0 + for range ch { + n++ + } + return n +}