feat(10-01): add provider-driven query generator and RegisterAll skeleton
- BuildQueries(reg, source) dedups keywords and formats per-source syntax
- github/gist use "keyword" in:file; others use the bare keyword
- SourcesConfig placeholder struct for Wave 2 plans to depend on
- RegisterAll no-op stub (Plan 10-09 will fill)
This commit is contained in:
55
pkg/recon/sources/queries.go
Normal file
55
pkg/recon/sources/queries.go
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
package sources
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"sort"
|
||||||
|
|
||||||
|
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||||
|
)
|
||||||
|
|
||||||
|
// BuildQueries produces the search-string list a source should iterate for the
|
||||||
|
// given provider registry. Each keyword is formatted per source-specific syntax
|
||||||
|
// (e.g. GitHub's `"kw" in:file` qualifier). Keywords are deduped across
|
||||||
|
// providers and returned sorted for deterministic test output.
|
||||||
|
//
|
||||||
|
// A nil registry returns nil. An unknown source name falls back to the bare
|
||||||
|
// keyword form so new sources work safely until they register custom syntax.
|
||||||
|
func BuildQueries(reg *providers.Registry, source string) []string {
|
||||||
|
if reg == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
seen := make(map[string]struct{})
|
||||||
|
for _, p := range reg.List() {
|
||||||
|
for _, k := range p.Keywords {
|
||||||
|
if k == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[k] = struct{}{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
keywords := make([]string, 0, len(seen))
|
||||||
|
for k := range seen {
|
||||||
|
keywords = append(keywords, k)
|
||||||
|
}
|
||||||
|
sort.Strings(keywords)
|
||||||
|
|
||||||
|
out := make([]string, 0, len(keywords))
|
||||||
|
for _, k := range keywords {
|
||||||
|
out = append(out, formatQuery(source, k))
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// formatQuery applies source-specific search syntax to a raw keyword.
//
// For "github" and "gist" the keyword is wrapped in double quotes and followed
// by the in:file qualifier. Every other source name, including unknown ones,
// gets the keyword back unchanged.
//
// The quoted form escapes only the double quote and the backslash. Go's %q
// verb is deliberately not used: it applies Go string-literal escaping, which
// would turn a tab or other control character into a literal `\t` / `\x..`
// sequence inside the search phrase.
func formatQuery(source, keyword string) string {
	switch source {
	case "github", "gist":
		return fmt.Sprintf(`"%s" in:file`, escapeSearchPhrase(keyword))
	default:
		// GitLab, Bitbucket, Codeberg, HuggingFace, Kaggle, Replit,
		// CodeSandbox, sandboxes, and unknown sources use bare keywords.
		return keyword
	}
}

// escapeSearchPhrase backslash-escapes double quotes and backslashes so the
// result can sit between double quotes in a search query. All other bytes are
// copied through untouched; scanning byte-wise is safe because both escaped
// characters are ASCII and never occur inside a multi-byte UTF-8 sequence.
func escapeSearchPhrase(s string) string {
	escaped := make([]byte, 0, len(s))
	for i := 0; i < len(s); i++ {
		if c := s[i]; c == '"' || c == '\\' {
			escaped = append(escaped, '\\')
		}
		escaped = append(escaped, s[i])
	}
	return string(escaped)
}
|
||||||
124
pkg/recon/sources/queries_test.go
Normal file
124
pkg/recon/sources/queries_test.go
Normal file
@@ -0,0 +1,124 @@
|
|||||||
|
package sources
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||||
|
)
|
||||||
|
|
||||||
|
func testRegistry() *providers.Registry {
|
||||||
|
return providers.NewRegistryFromProviders([]providers.Provider{
|
||||||
|
{
|
||||||
|
Name: "openai",
|
||||||
|
Keywords: []string{"sk-proj-", "sk-"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "anthropic",
|
||||||
|
Keywords: []string{"sk-ant-", "sk-"}, // shared "sk-" for dedup test
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "empty-kw",
|
||||||
|
Keywords: []string{},
|
||||||
|
},
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBuildQueries_GitHub(t *testing.T) {
|
||||||
|
got := BuildQueries(testRegistry(), "github")
|
||||||
|
want := []string{
|
||||||
|
`"sk-" in:file`,
|
||||||
|
`"sk-ant-" in:file`,
|
||||||
|
`"sk-proj-" in:file`,
|
||||||
|
}
|
||||||
|
if !reflect.DeepEqual(got, want) {
|
||||||
|
t.Fatalf("github queries mismatch:\n got=%v\nwant=%v", got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBuildQueries_Gist(t *testing.T) {
|
||||||
|
got := BuildQueries(testRegistry(), "gist")
|
||||||
|
if len(got) != 3 {
|
||||||
|
t.Fatalf("expected 3 queries, got %d: %v", len(got), got)
|
||||||
|
}
|
||||||
|
if got[0] != `"sk-" in:file` {
|
||||||
|
t.Fatalf("gist should use in:file syntax, got %q", got[0])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBuildQueries_GitLab(t *testing.T) {
|
||||||
|
got := BuildQueries(testRegistry(), "gitlab")
|
||||||
|
want := []string{"sk-", "sk-ant-", "sk-proj-"}
|
||||||
|
if !reflect.DeepEqual(got, want) {
|
||||||
|
t.Fatalf("gitlab queries mismatch:\n got=%v\nwant=%v", got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBuildQueries_HuggingFace(t *testing.T) {
|
||||||
|
got := BuildQueries(testRegistry(), "huggingface")
|
||||||
|
want := []string{"sk-", "sk-ant-", "sk-proj-"}
|
||||||
|
if !reflect.DeepEqual(got, want) {
|
||||||
|
t.Fatalf("huggingface queries mismatch:\n got=%v\nwant=%v", got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBuildQueries_UnknownSourceDefault(t *testing.T) {
|
||||||
|
got := BuildQueries(testRegistry(), "mystery-source")
|
||||||
|
want := []string{"sk-", "sk-ant-", "sk-proj-"}
|
||||||
|
if !reflect.DeepEqual(got, want) {
|
||||||
|
t.Fatalf("default queries mismatch:\n got=%v\nwant=%v", got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBuildQueries_NilRegistry(t *testing.T) {
|
||||||
|
if got := BuildQueries(nil, "github"); got != nil {
|
||||||
|
t.Fatalf("expected nil for nil registry, got %v", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBuildQueries_DedupAcrossProviders(t *testing.T) {
|
||||||
|
// "sk-" is shared by openai and anthropic — should appear exactly once.
|
||||||
|
got := BuildQueries(testRegistry(), "gitlab")
|
||||||
|
count := 0
|
||||||
|
for _, q := range got {
|
||||||
|
if q == "sk-" {
|
||||||
|
count++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if count != 1 {
|
||||||
|
t.Fatalf("expected dedup of 'sk-' to 1 occurrence, got %d", count)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBuildQueries_EmptyKeywordsSkipped(t *testing.T) {
|
||||||
|
reg := providers.NewRegistryFromProviders([]providers.Provider{
|
||||||
|
{Name: "only-empty", Keywords: []string{"", ""}},
|
||||||
|
{Name: "real", Keywords: []string{"xyz"}},
|
||||||
|
})
|
||||||
|
got := BuildQueries(reg, "gitlab")
|
||||||
|
want := []string{"xyz"}
|
||||||
|
if !reflect.DeepEqual(got, want) {
|
||||||
|
t.Fatalf("empty keywords should be skipped:\n got=%v\nwant=%v", got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRegisterAll_NilEngineNoPanic(t *testing.T) {
|
||||||
|
defer func() {
|
||||||
|
if r := recover(); r != nil {
|
||||||
|
t.Fatalf("RegisterAll(nil, ...) panicked: %v", r)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
RegisterAll(nil, SourcesConfig{})
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRegisterAll_EmptyCfgNoPanic(t *testing.T) {
|
||||||
|
defer func() {
|
||||||
|
if r := recover(); r != nil {
|
||||||
|
t.Fatalf("RegisterAll panicked with empty cfg: %v", r)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
// Use a non-nil engine so the nil-guard early-return doesn't mask issues.
|
||||||
|
// Import cycle prevention: pkg/recon is imported by register.go.
|
||||||
|
eng := newTestEngine()
|
||||||
|
RegisterAll(eng, SourcesConfig{})
|
||||||
|
}
|
||||||
32
pkg/recon/sources/register.go
Normal file
32
pkg/recon/sources/register.go
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
package sources
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||||
|
"github.com/salvacybersec/keyhunter/pkg/recon"
|
||||||
|
)
|
||||||
|
|
||||||
|
// SourcesConfig carries per-source credentials and shared dependencies read
|
||||||
|
// from viper/env by cmd/recon.go. Plan 10-09 fleshes this out; for now it is a
|
||||||
|
// placeholder struct so Wave 2 plans can depend on its shape.
|
||||||
|
type SourcesConfig struct {
|
||||||
|
GitHubToken string
|
||||||
|
GitLabToken string
|
||||||
|
BitbucketToken string
|
||||||
|
HuggingFaceToken string
|
||||||
|
KaggleUser string
|
||||||
|
KaggleKey string
|
||||||
|
|
||||||
|
Registry *providers.Registry
|
||||||
|
Limiters *recon.LimiterRegistry
|
||||||
|
}
|
||||||
|
|
||||||
|
// RegisterAll registers every Phase 10 code-hosting source on engine.
|
||||||
|
// Wave 2 plans append their source constructors here via additional
|
||||||
|
// registerXxx helpers in this file. Plan 10-09 writes the final list.
|
||||||
|
func RegisterAll(engine *recon.Engine, cfg SourcesConfig) {
|
||||||
|
if engine == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_ = cfg // wired up in Wave 2 + Plan 10-09
|
||||||
|
// Populated by Plan 10-09 (after Wave 2 lands individual source files).
|
||||||
|
}
|
||||||
9
pkg/recon/sources/testhelpers_test.go
Normal file
9
pkg/recon/sources/testhelpers_test.go
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
package sources
|
||||||
|
|
||||||
|
import "github.com/salvacybersec/keyhunter/pkg/recon"
|
||||||
|
|
||||||
|
// newTestEngine returns an empty recon.Engine for tests that need a non-nil
|
||||||
|
// engine to exercise registration paths without touching real sources.
|
||||||
|
func newTestEngine() *recon.Engine {
|
||||||
|
return recon.NewEngine()
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user