diff --git a/pkg/recon/sources/queries.go b/pkg/recon/sources/queries.go
new file mode 100644
index 0000000..6734054
--- /dev/null
+++ b/pkg/recon/sources/queries.go
@@ -0,0 +1,55 @@
+package sources
+
+import (
+	"fmt"
+	"sort"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+)
+
+// BuildQueries produces the search-string list a source should iterate for the
+// given provider registry. Each keyword is formatted per source-specific syntax
+// (e.g. GitHub's `"kw" in:file` qualifier). Keywords are deduped across
+// providers and returned sorted for deterministic test output.
+//
+// A nil registry returns nil. An unknown source name falls back to the bare
+// keyword form so new sources work safely until they register custom syntax.
+func BuildQueries(reg *providers.Registry, source string) []string {
+	if reg == nil {
+		return nil
+	}
+
+	seen := make(map[string]struct{})
+	for _, p := range reg.List() {
+		for _, k := range p.Keywords {
+			if k == "" {
+				continue
+			}
+			seen[k] = struct{}{}
+		}
+	}
+
+	keywords := make([]string, 0, len(seen))
+	for k := range seen {
+		keywords = append(keywords, k)
+	}
+	sort.Strings(keywords)
+
+	out := make([]string, 0, len(keywords))
+	for _, k := range keywords {
+		out = append(out, formatQuery(source, k))
+	}
+	return out
+}
+
+// formatQuery applies source-specific search syntax to a raw keyword.
+func formatQuery(source, keyword string) string {
+	switch source {
+	case "github", "gist":
+		return fmt.Sprintf("%q in:file", keyword)
+	default:
+		// GitLab, Bitbucket, Codeberg, HuggingFace, Kaggle, Replit,
+		// CodeSandbox, sandboxes, and unknown sources use bare keywords.
+		return keyword
+	}
+}
diff --git a/pkg/recon/sources/queries_test.go b/pkg/recon/sources/queries_test.go
new file mode 100644
index 0000000..d53095d
--- /dev/null
+++ b/pkg/recon/sources/queries_test.go
@@ -0,0 +1,124 @@
+package sources
+
+import (
+	"reflect"
+	"testing"
+
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+)
+
+func testRegistry() *providers.Registry {
+	return providers.NewRegistryFromProviders([]providers.Provider{
+		{
+			Name:     "openai",
+			Keywords: []string{"sk-proj-", "sk-"},
+		},
+		{
+			Name:     "anthropic",
+			Keywords: []string{"sk-ant-", "sk-"}, // shared "sk-" for dedup test
+		},
+		{
+			Name:     "empty-kw",
+			Keywords: []string{},
+		},
+	})
+}
+
+func TestBuildQueries_GitHub(t *testing.T) {
+	got := BuildQueries(testRegistry(), "github")
+	want := []string{
+		`"sk-" in:file`,
+		`"sk-ant-" in:file`,
+		`"sk-proj-" in:file`,
+	}
+	if !reflect.DeepEqual(got, want) {
+		t.Fatalf("github queries mismatch:\n got=%v\nwant=%v", got, want)
+	}
+}
+
+func TestBuildQueries_Gist(t *testing.T) {
+	got := BuildQueries(testRegistry(), "gist")
+	if len(got) != 3 {
+		t.Fatalf("expected 3 queries, got %d: %v", len(got), got)
+	}
+	if got[0] != `"sk-" in:file` {
+		t.Fatalf("gist should use in:file syntax, got %q", got[0])
+	}
+}
+
+func TestBuildQueries_GitLab(t *testing.T) {
+	got := BuildQueries(testRegistry(), "gitlab")
+	want := []string{"sk-", "sk-ant-", "sk-proj-"}
+	if !reflect.DeepEqual(got, want) {
+		t.Fatalf("gitlab queries mismatch:\n got=%v\nwant=%v", got, want)
+	}
+}
+
+func TestBuildQueries_HuggingFace(t *testing.T) {
+	got := BuildQueries(testRegistry(), "huggingface")
+	want := []string{"sk-", "sk-ant-", "sk-proj-"}
+	if !reflect.DeepEqual(got, want) {
+		t.Fatalf("huggingface queries mismatch:\n got=%v\nwant=%v", got, want)
+	}
+}
+
+func TestBuildQueries_UnknownSourceDefault(t *testing.T) {
+	got := BuildQueries(testRegistry(), "mystery-source")
+	want := []string{"sk-", "sk-ant-", "sk-proj-"}
+	if !reflect.DeepEqual(got, want) {
+		t.Fatalf("default queries mismatch:\n got=%v\nwant=%v", got, want)
+	}
+}
+
+func TestBuildQueries_NilRegistry(t *testing.T) {
+	if got := BuildQueries(nil, "github"); got != nil {
+		t.Fatalf("expected nil for nil registry, got %v", got)
+	}
+}
+
+func TestBuildQueries_DedupAcrossProviders(t *testing.T) {
+	// "sk-" is shared by openai and anthropic — should appear exactly once.
+	got := BuildQueries(testRegistry(), "gitlab")
+	count := 0
+	for _, q := range got {
+		if q == "sk-" {
+			count++
+		}
+	}
+	if count != 1 {
+		t.Fatalf("expected dedup of 'sk-' to 1 occurrence, got %d", count)
+	}
+}
+
+func TestBuildQueries_EmptyKeywordsSkipped(t *testing.T) {
+	reg := providers.NewRegistryFromProviders([]providers.Provider{
+		{Name: "only-empty", Keywords: []string{"", ""}},
+		{Name: "real", Keywords: []string{"xyz"}},
+	})
+	got := BuildQueries(reg, "gitlab")
+	want := []string{"xyz"}
+	if !reflect.DeepEqual(got, want) {
+		t.Fatalf("empty keywords should be skipped:\n got=%v\nwant=%v", got, want)
+	}
+}
+
+func TestRegisterAll_NilEngineNoPanic(t *testing.T) {
+	defer func() {
+		if r := recover(); r != nil {
+			t.Fatalf("RegisterAll(nil, ...) panicked: %v", r)
+		}
+	}()
+	RegisterAll(nil, SourcesConfig{})
+}
+
+func TestRegisterAll_EmptyCfgNoPanic(t *testing.T) {
+	defer func() {
+		if r := recover(); r != nil {
+			t.Fatalf("RegisterAll panicked with empty cfg: %v", r)
+		}
+	}()
+	// Use a non-nil engine so the nil-guard early-return doesn't mask issues.
+	// Import cycle prevention: pkg/recon is imported by register.go.
+	eng := newTestEngine()
+	RegisterAll(eng, SourcesConfig{})
+}
diff --git a/pkg/recon/sources/register.go b/pkg/recon/sources/register.go
new file mode 100644
index 0000000..0422cfd
--- /dev/null
+++ b/pkg/recon/sources/register.go
@@ -0,0 +1,32 @@
+package sources
+
+import (
+	"github.com/salvacybersec/keyhunter/pkg/providers"
+	"github.com/salvacybersec/keyhunter/pkg/recon"
+)
+
+// SourcesConfig carries per-source credentials and shared dependencies read
+// from viper/env by cmd/recon.go. Plan 10-09 fleshes this out; for now it is a
+// placeholder struct so Wave 2 plans can depend on its shape.
+type SourcesConfig struct {
+	GitHubToken      string
+	GitLabToken      string
+	BitbucketToken   string
+	HuggingFaceToken string
+	KaggleUser       string
+	KaggleKey        string
+
+	Registry *providers.Registry
+	Limiters *recon.LimiterRegistry
+}
+
+// RegisterAll registers every Phase 10 code-hosting source on engine.
+// Wave 2 plans append their source constructors here via additional
+// registerXxx helpers in this file. Plan 10-09 writes the final list.
+func RegisterAll(engine *recon.Engine, cfg SourcesConfig) {
+	if engine == nil {
+		return
+	}
+	_ = cfg // wired up in Wave 2 + Plan 10-09
+	// Populated by Plan 10-09 (after Wave 2 lands individual source files).
+}
diff --git a/pkg/recon/sources/testhelpers_test.go b/pkg/recon/sources/testhelpers_test.go
new file mode 100644
index 0000000..bc9018b
--- /dev/null
+++ b/pkg/recon/sources/testhelpers_test.go
@@ -0,0 +1,9 @@
+package sources
+
+import "github.com/salvacybersec/keyhunter/pkg/recon"
+
+// newTestEngine returns an empty recon.Engine for tests that need a non-nil
+// engine to exercise registration paths without touching real sources.
+func newTestEngine() *recon.Engine {
+	return recon.NewEngine()
+}