feat(10-01): add provider-driven query generator and RegisterAll skeleton
- BuildQueries(reg, source) deduplicates keywords and formats them in per-source syntax - github/gist use "keyword" in:file; others use the bare keyword - SourcesConfig placeholder struct for Wave 2 plans to depend on - RegisterAll no-op stub (Plan 10-09 will fill it in)
This commit is contained in:
55
pkg/recon/sources/queries.go
Normal file
55
pkg/recon/sources/queries.go
Normal file
@@ -0,0 +1,55 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
)
|
||||
|
||||
// BuildQueries produces the search-string list a source should iterate for the
|
||||
// given provider registry. Each keyword is formatted per source-specific syntax
|
||||
// (e.g. GitHub's `"kw" in:file` qualifier). Keywords are deduped across
|
||||
// providers and returned sorted for deterministic test output.
|
||||
//
|
||||
// A nil registry returns nil. An unknown source name falls back to the bare
|
||||
// keyword form so new sources work safely until they register custom syntax.
|
||||
func BuildQueries(reg *providers.Registry, source string) []string {
|
||||
if reg == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
seen := make(map[string]struct{})
|
||||
for _, p := range reg.List() {
|
||||
for _, k := range p.Keywords {
|
||||
if k == "" {
|
||||
continue
|
||||
}
|
||||
seen[k] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
keywords := make([]string, 0, len(seen))
|
||||
for k := range seen {
|
||||
keywords = append(keywords, k)
|
||||
}
|
||||
sort.Strings(keywords)
|
||||
|
||||
out := make([]string, 0, len(keywords))
|
||||
for _, k := range keywords {
|
||||
out = append(out, formatQuery(source, k))
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// formatQuery renders keyword in the search syntax used by source.
//
// The "github" and "gist" sources get the keyword quoted (via %q) followed by
// the in:file qualifier. Any other source name, including unknown ones, gets
// the keyword back unchanged. Source names are compared case-sensitively.
func formatQuery(source, keyword string) string {
	if source == "github" || source == "gist" {
		return fmt.Sprintf("%q in:file", keyword)
	}

	// GitLab, Bitbucket, Codeberg, HuggingFace, Kaggle, Replit,
	// CodeSandbox, sandboxes, and unknown sources use bare keywords.
	return keyword
}
|
||||
124
pkg/recon/sources/queries_test.go
Normal file
124
pkg/recon/sources/queries_test.go
Normal file
@@ -0,0 +1,124 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
)
|
||||
|
||||
func testRegistry() *providers.Registry {
|
||||
return providers.NewRegistryFromProviders([]providers.Provider{
|
||||
{
|
||||
Name: "openai",
|
||||
Keywords: []string{"sk-proj-", "sk-"},
|
||||
},
|
||||
{
|
||||
Name: "anthropic",
|
||||
Keywords: []string{"sk-ant-", "sk-"}, // shared "sk-" for dedup test
|
||||
},
|
||||
{
|
||||
Name: "empty-kw",
|
||||
Keywords: []string{},
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
func TestBuildQueries_GitHub(t *testing.T) {
|
||||
got := BuildQueries(testRegistry(), "github")
|
||||
want := []string{
|
||||
`"sk-" in:file`,
|
||||
`"sk-ant-" in:file`,
|
||||
`"sk-proj-" in:file`,
|
||||
}
|
||||
if !reflect.DeepEqual(got, want) {
|
||||
t.Fatalf("github queries mismatch:\n got=%v\nwant=%v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildQueries_Gist(t *testing.T) {
|
||||
got := BuildQueries(testRegistry(), "gist")
|
||||
if len(got) != 3 {
|
||||
t.Fatalf("expected 3 queries, got %d: %v", len(got), got)
|
||||
}
|
||||
if got[0] != `"sk-" in:file` {
|
||||
t.Fatalf("gist should use in:file syntax, got %q", got[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildQueries_GitLab(t *testing.T) {
|
||||
got := BuildQueries(testRegistry(), "gitlab")
|
||||
want := []string{"sk-", "sk-ant-", "sk-proj-"}
|
||||
if !reflect.DeepEqual(got, want) {
|
||||
t.Fatalf("gitlab queries mismatch:\n got=%v\nwant=%v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildQueries_HuggingFace(t *testing.T) {
|
||||
got := BuildQueries(testRegistry(), "huggingface")
|
||||
want := []string{"sk-", "sk-ant-", "sk-proj-"}
|
||||
if !reflect.DeepEqual(got, want) {
|
||||
t.Fatalf("huggingface queries mismatch:\n got=%v\nwant=%v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildQueries_UnknownSourceDefault(t *testing.T) {
|
||||
got := BuildQueries(testRegistry(), "mystery-source")
|
||||
want := []string{"sk-", "sk-ant-", "sk-proj-"}
|
||||
if !reflect.DeepEqual(got, want) {
|
||||
t.Fatalf("default queries mismatch:\n got=%v\nwant=%v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildQueries_NilRegistry(t *testing.T) {
|
||||
if got := BuildQueries(nil, "github"); got != nil {
|
||||
t.Fatalf("expected nil for nil registry, got %v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildQueries_DedupAcrossProviders(t *testing.T) {
|
||||
// "sk-" is shared by openai and anthropic — should appear exactly once.
|
||||
got := BuildQueries(testRegistry(), "gitlab")
|
||||
count := 0
|
||||
for _, q := range got {
|
||||
if q == "sk-" {
|
||||
count++
|
||||
}
|
||||
}
|
||||
if count != 1 {
|
||||
t.Fatalf("expected dedup of 'sk-' to 1 occurrence, got %d", count)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildQueries_EmptyKeywordsSkipped(t *testing.T) {
|
||||
reg := providers.NewRegistryFromProviders([]providers.Provider{
|
||||
{Name: "only-empty", Keywords: []string{"", ""}},
|
||||
{Name: "real", Keywords: []string{"xyz"}},
|
||||
})
|
||||
got := BuildQueries(reg, "gitlab")
|
||||
want := []string{"xyz"}
|
||||
if !reflect.DeepEqual(got, want) {
|
||||
t.Fatalf("empty keywords should be skipped:\n got=%v\nwant=%v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRegisterAll_NilEngineNoPanic(t *testing.T) {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
t.Fatalf("RegisterAll(nil, ...) panicked: %v", r)
|
||||
}
|
||||
}()
|
||||
RegisterAll(nil, SourcesConfig{})
|
||||
}
|
||||
|
||||
func TestRegisterAll_EmptyCfgNoPanic(t *testing.T) {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
t.Fatalf("RegisterAll panicked with empty cfg: %v", r)
|
||||
}
|
||||
}()
|
||||
// Use a non-nil engine so the nil-guard early-return doesn't mask issues.
|
||||
// Import cycle prevention: pkg/recon is imported by register.go.
|
||||
eng := newTestEngine()
|
||||
RegisterAll(eng, SourcesConfig{})
|
||||
}
|
||||
32
pkg/recon/sources/register.go
Normal file
32
pkg/recon/sources/register.go
Normal file
@@ -0,0 +1,32 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
)
|
||||
|
||||
// SourcesConfig carries per-source credentials and shared dependencies read
|
||||
// from viper/env by cmd/recon.go. Plan 10-09 fleshes this out; for now it is a
|
||||
// placeholder struct so Wave 2 plans can depend on its shape.
|
||||
type SourcesConfig struct {
|
||||
GitHubToken string
|
||||
GitLabToken string
|
||||
BitbucketToken string
|
||||
HuggingFaceToken string
|
||||
KaggleUser string
|
||||
KaggleKey string
|
||||
|
||||
Registry *providers.Registry
|
||||
Limiters *recon.LimiterRegistry
|
||||
}
|
||||
|
||||
// RegisterAll registers every Phase 10 code-hosting source on engine.
|
||||
// Wave 2 plans append their source constructors here via additional
|
||||
// registerXxx helpers in this file. Plan 10-09 writes the final list.
|
||||
func RegisterAll(engine *recon.Engine, cfg SourcesConfig) {
|
||||
if engine == nil {
|
||||
return
|
||||
}
|
||||
_ = cfg // wired up in Wave 2 + Plan 10-09
|
||||
// Populated by Plan 10-09 (after Wave 2 lands individual source files).
|
||||
}
|
||||
9
pkg/recon/sources/testhelpers_test.go
Normal file
9
pkg/recon/sources/testhelpers_test.go
Normal file
@@ -0,0 +1,9 @@
|
||||
package sources
|
||||
|
||||
import "github.com/salvacybersec/keyhunter/pkg/recon"
|
||||
|
||||
// newTestEngine returns an empty recon.Engine for tests that need a non-nil
|
||||
// engine to exercise registration paths without touching real sources.
|
||||
func newTestEngine() *recon.Engine {
|
||||
return recon.NewEngine()
|
||||
}
|
||||
Reference in New Issue
Block a user