feat(10-01): add provider-driven query generator and RegisterAll skeleton

- BuildQueries(reg, source) dedups keywords and formats per-source syntax
- github/gist use "keyword" in:file; others use bare keyword
- SourcesConfig placeholder struct for Wave 2 plans to depend on
- RegisterAll no-op stub (Plan 10-09 will fill)
This commit is contained in:
salvacybersec
2026-04-06 01:09:57 +03:00
parent 75024e4701
commit 9273f356e6
4 changed files with 220 additions and 0 deletions

View File

@@ -0,0 +1,55 @@
package sources
import (
"fmt"
"sort"
"github.com/salvacybersec/keyhunter/pkg/providers"
)
// BuildQueries returns the list of search strings that the named source should
// run for every provider in reg.
//
// Keywords from all providers are merged into one set (empty keywords are
// dropped, duplicates collapse to a single entry), sorted lexicographically so
// the output is deterministic, and then passed through formatQuery to apply the
// syntax of the given source.
//
// A nil registry yields nil. A registry with no usable keywords yields an
// empty, non-nil slice.
func BuildQueries(reg *providers.Registry, source string) []string {
	if reg == nil {
		return nil
	}

	// Set of distinct, non-empty keywords across every provider.
	unique := make(map[string]struct{})
	for _, provider := range reg.List() {
		for _, kw := range provider.Keywords {
			if kw != "" {
				unique[kw] = struct{}{}
			}
		}
	}

	// Map iteration order is random, so sort the raw keywords before
	// formatting them.
	queries := make([]string, 0, len(unique))
	for kw := range unique {
		queries = append(queries, kw)
	}
	sort.Strings(queries)

	// Rewrite each keyword in place into its source-specific query form.
	for i, kw := range queries {
		queries[i] = formatQuery(source, kw)
	}
	return queries
}
// formatQuery turns a raw keyword into the query string for one source.
//
// For the "github" and "gist" sources the keyword is quoted with %q and
// followed by the in:file qualifier. Every other source name, including
// unrecognised ones, gets the keyword back unchanged. The source name is
// matched exactly (case-sensitive).
func formatQuery(source, keyword string) string {
	if source == "github" || source == "gist" {
		return fmt.Sprintf("%q in:file", keyword)
	}
	// GitLab, Bitbucket, Codeberg, HuggingFace, Kaggle, Replit, CodeSandbox,
	// and any unknown source fall through to the bare keyword.
	return keyword
}

View File

@@ -0,0 +1,124 @@
package sources
import (
"reflect"
"testing"
"github.com/salvacybersec/keyhunter/pkg/providers"
)
// testRegistry builds the fixture registry shared by the BuildQueries tests:
// two providers that both list the "sk-" keyword (so deduplication can be
// observed) and one provider with an empty keyword list.
func testRegistry() *providers.Registry {
	fixtures := []providers.Provider{
		{Name: "openai", Keywords: []string{"sk-proj-", "sk-"}},
		// "sk-" is intentionally repeated here for the dedup test.
		{Name: "anthropic", Keywords: []string{"sk-ant-", "sk-"}},
		{Name: "empty-kw", Keywords: []string{}},
	}
	return providers.NewRegistryFromProviders(fixtures)
}
// TestBuildQueries_GitHub checks that the "github" source gets the sorted,
// deduplicated keywords, each quoted and suffixed with in:file.
func TestBuildQueries_GitHub(t *testing.T) {
	expected := []string{
		`"sk-" in:file`,
		`"sk-ant-" in:file`,
		`"sk-proj-" in:file`,
	}
	actual := BuildQueries(testRegistry(), "github")
	if reflect.DeepEqual(actual, expected) {
		return
	}
	t.Fatalf("github queries mismatch:\n got=%v\nwant=%v", actual, expected)
}
// TestBuildQueries_Gist checks that the "gist" source uses the same quoted
// in:file syntax as "github" for every generated query, not just the first
// one, by comparing the complete sorted result.
func TestBuildQueries_Gist(t *testing.T) {
	got := BuildQueries(testRegistry(), "gist")
	want := []string{
		`"sk-" in:file`,
		`"sk-ant-" in:file`,
		`"sk-proj-" in:file`,
	}
	if !reflect.DeepEqual(got, want) {
		t.Fatalf("gist queries mismatch:\n got=%v\nwant=%v", got, want)
	}
}
// TestBuildQueries_GitLab checks that the "gitlab" source receives bare,
// sorted, deduplicated keywords.
func TestBuildQueries_GitLab(t *testing.T) {
	expected := []string{"sk-", "sk-ant-", "sk-proj-"}
	actual := BuildQueries(testRegistry(), "gitlab")
	if reflect.DeepEqual(actual, expected) {
		return
	}
	t.Fatalf("gitlab queries mismatch:\n got=%v\nwant=%v", actual, expected)
}
// TestBuildQueries_HuggingFace checks that the "huggingface" source receives
// bare, sorted, deduplicated keywords.
func TestBuildQueries_HuggingFace(t *testing.T) {
	expected := []string{"sk-", "sk-ant-", "sk-proj-"}
	actual := BuildQueries(testRegistry(), "huggingface")
	if reflect.DeepEqual(actual, expected) {
		return
	}
	t.Fatalf("huggingface queries mismatch:\n got=%v\nwant=%v", actual, expected)
}
// TestBuildQueries_UnknownSourceDefault checks that a source name with no
// dedicated syntax falls back to bare keywords.
func TestBuildQueries_UnknownSourceDefault(t *testing.T) {
	expected := []string{"sk-", "sk-ant-", "sk-proj-"}
	actual := BuildQueries(testRegistry(), "mystery-source")
	if reflect.DeepEqual(actual, expected) {
		return
	}
	t.Fatalf("default queries mismatch:\n got=%v\nwant=%v", actual, expected)
}
// TestBuildQueries_NilRegistry checks that a nil registry produces a nil
// result.
func TestBuildQueries_NilRegistry(t *testing.T) {
	queries := BuildQueries(nil, "github")
	if queries == nil {
		return
	}
	t.Fatalf("expected nil for nil registry, got %v", queries)
}
// TestBuildQueries_DedupAcrossProviders checks that "sk-", which both the
// openai and anthropic fixtures list, appears exactly once in the output.
func TestBuildQueries_DedupAcrossProviders(t *testing.T) {
	queries := BuildQueries(testRegistry(), "gitlab")

	occurrences := 0
	for i := range queries {
		if queries[i] == "sk-" {
			occurrences++
		}
	}

	if occurrences == 1 {
		return
	}
	t.Fatalf("expected dedup of 'sk-' to 1 occurrence, got %d", occurrences)
}
// TestBuildQueries_EmptyKeywordsSkipped checks that empty-string keywords
// never become queries, even when a provider lists nothing else.
func TestBuildQueries_EmptyKeywordsSkipped(t *testing.T) {
	fixtures := []providers.Provider{
		{Name: "only-empty", Keywords: []string{"", ""}},
		{Name: "real", Keywords: []string{"xyz"}},
	}
	registry := providers.NewRegistryFromProviders(fixtures)

	expected := []string{"xyz"}
	actual := BuildQueries(registry, "gitlab")
	if reflect.DeepEqual(actual, expected) {
		return
	}
	t.Fatalf("empty keywords should be skipped:\n got=%v\nwant=%v", actual, expected)
}
func TestRegisterAll_NilEngineNoPanic(t *testing.T) {
defer func() {
if r := recover(); r != nil {
t.Fatalf("RegisterAll(nil, ...) panicked: %v", r)
}
}()
RegisterAll(nil, SourcesConfig{})
}
func TestRegisterAll_EmptyCfgNoPanic(t *testing.T) {
defer func() {
if r := recover(); r != nil {
t.Fatalf("RegisterAll panicked with empty cfg: %v", r)
}
}()
// Use a non-nil engine so the nil-guard early-return doesn't mask issues.
// Import cycle prevention: pkg/recon is imported by register.go.
eng := newTestEngine()
RegisterAll(eng, SourcesConfig{})
}

View File

@@ -0,0 +1,32 @@
package sources
import (
"github.com/salvacybersec/keyhunter/pkg/providers"
"github.com/salvacybersec/keyhunter/pkg/recon"
)
// SourcesConfig bundles the per-source credentials and the shared dependencies
// that are handed to RegisterAll. The values are read from viper/env by
// cmd/recon.go. It is currently a placeholder whose shape Wave 2 plans can
// depend on; Plan 10-09 fleshes it out.
type SourcesConfig struct {
	// Credentials, one group per hosting service (named after the service).
	GitHubToken      string
	GitLabToken      string
	BitbucketToken   string
	HuggingFaceToken string
	KaggleUser       string
	KaggleKey        string

	// Shared dependencies.
	Registry *providers.Registry    // provider registry (the type BuildQueries consumes)
	Limiters *recon.LimiterRegistry // limiter registry from pkg/recon
}
// RegisterAll is the single entry point for registering every Phase 10
// code-hosting source on engine.
//
// It is currently a stub: it returns immediately for a nil engine and
// otherwise does nothing. Wave 2 plans add their source constructors here via
// registerXxx helpers in this file, and Plan 10-09 writes the final list.
func RegisterAll(engine *recon.Engine, cfg SourcesConfig) {
	_ = cfg // not consumed yet; wired up in Wave 2 + Plan 10-09

	if engine == nil {
		return
	}
	// Source registrations are added by Plan 10-09, once Wave 2 has landed
	// the individual source files.
}

View File

@@ -0,0 +1,9 @@
package sources
import "github.com/salvacybersec/keyhunter/pkg/recon"
// newTestEngine hands tests a freshly constructed recon.Engine, so that
// registration paths can be exercised with a non-nil engine and without any
// real sources attached.
func newTestEngine() *recon.Engine {
	engine := recon.NewEngine()
	return engine
}