diff --git a/dorks/.gitkeep b/dorks/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/pkg/dorks/definitions/.gitkeep b/pkg/dorks/definitions/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/pkg/dorks/executor.go b/pkg/dorks/executor.go new file mode 100644 index 0000000..7c87855 --- /dev/null +++ b/pkg/dorks/executor.go @@ -0,0 +1,75 @@ +package dorks + +import ( + "context" + "errors" + "fmt" +) + +// ErrSourceNotImplemented is returned by Runner.Run when no executor has +// been registered for a dork's source. Most sources arrive in Phase 9-16 +// (OSINT) — this phase only wires the GitHub executor live (Plan 08-05). +var ErrSourceNotImplemented = errors.New("dork source not yet implemented") + +// ErrMissingAuth is returned by an Executor when the source requires +// credentials (e.g. GITHUB_TOKEN) that are not configured. +var ErrMissingAuth = errors.New("dork source requires auth credentials") + +// Match is a single hit produced by an Executor. Downstream code feeds +// Snippet into the engine detection pipeline. +type Match struct { + DorkID string + Source string + URL string + Snippet string // content chunk forwarded to the detector + Path string // file path inside the source (repo, URL, etc.) +} + +// Executor runs a single dork against its source backend. Implementations +// live in per-source files (executor_github.go, executor_shodan.go, ...). +// All executors except GitHub are stubs in Phase 8 and return +// ErrSourceNotImplemented. +type Executor interface { + // Source returns the dork source identifier this executor handles, + // e.g. "github". Must match one of ValidSources. + Source() string + // Execute runs the dork and returns matches. limit bounds the number + // of results to request from the backend (zero means backend default). + Execute(ctx context.Context, d Dork, limit int) ([]Match, error) +} + +// Runner dispatches dorks to the correct per-source Executor. +type Runner struct { + executors map[string]Executor +} + +// NewRunner returns an empty Runner. Call Register to wire in per-source +// executors. In Phase 8, only the GitHub executor is registered by +// Plan 08-05; every other source returns ErrSourceNotImplemented. +func NewRunner() *Runner { + return &Runner{executors: make(map[string]Executor)} +} + +// Register installs an Executor, replacing any prior executor for the same +// source. +func (r *Runner) Register(e Executor) { + r.executors[e.Source()] = e +} + +// Executor returns the registered Executor for the given source along with a +// boolean indicating whether one was found. +func (r *Runner) Executor(source string) (Executor, bool) { + e, ok := r.executors[source] + return e, ok +} + +// Run locates the Executor for the dork's source and invokes it. If no +// Executor has been registered, it returns an error wrapping +// ErrSourceNotImplemented. +func (r *Runner) Run(ctx context.Context, d Dork, limit int) ([]Match, error) { + ex, ok := r.executors[d.Source] + if !ok { + return nil, fmt.Errorf("%w: %s (coming Phase 9-16)", ErrSourceNotImplemented, d.Source) + } + return ex.Execute(ctx, d, limit) +} diff --git a/pkg/dorks/loader.go b/pkg/dorks/loader.go new file mode 100644 index 0000000..c28b3f4 --- /dev/null +++ b/pkg/dorks/loader.go @@ -0,0 +1,60 @@ +package dorks + +import ( + "embed" + "errors" + "fmt" + "io/fs" + "path/filepath" + "strings" + + "gopkg.in/yaml.v3" +) + +// definitionsFS embeds every file under pkg/dorks/definitions. The trailing +// `/*` is deliberate: it tolerates an empty tree containing only a .gitkeep +// placeholder, which is the case for this foundation plan before Wave 2 +// plans drop in 150+ real dork YAML files. +// +//go:embed definitions/* +var definitionsFS embed.FS + +// loadDorks walks the embedded definitions tree and returns every Dork found +// in a *.yaml file. Non-YAML files (e.g. .gitkeep) are ignored, empty trees +// return (nil, nil), and parse or validation errors are wrapped with the +// offending file path. +func loadDorks() ([]Dork, error) { + var dorks []Dork + err := fs.WalkDir(definitionsFS, "definitions", func(path string, d fs.DirEntry, err error) error { + if err != nil { + // Empty definitions directory (only .gitkeep) is valid. + if errors.Is(err, fs.ErrNotExist) { + return fs.SkipAll + } + return err + } + if d.IsDir() { + return nil + } + if !strings.EqualFold(filepath.Ext(path), ".yaml") && !strings.EqualFold(filepath.Ext(path), ".yml") { + return nil + } + data, err := definitionsFS.ReadFile(path) + if err != nil { + return fmt.Errorf("reading dork file %s: %w", path, err) + } + var dk Dork + if err := yaml.Unmarshal(data, &dk); err != nil { + return fmt.Errorf("parsing dork %s: %w", path, err) + } + if err := dk.Validate(); err != nil { + return fmt.Errorf("validating dork %s: %w", path, err) + } + dorks = append(dorks, dk) + return nil + }) + if err != nil { + return nil, err + } + return dorks, nil +} diff --git a/pkg/dorks/registry.go b/pkg/dorks/registry.go new file mode 100644 index 0000000..5fed588 --- /dev/null +++ b/pkg/dorks/registry.go @@ -0,0 +1,92 @@ +package dorks + +import "fmt" + +// Registry is the in-memory store of loaded dork definitions. It is built +// once at startup (via NewRegistry) and is safe for concurrent reads. +type Registry struct { + dorks []Dork + byID map[string]int + bySource map[string][]int + byCategory map[string][]int +} + +// NewRegistry loads every embedded dork YAML file, validates them, and +// returns a ready-to-query Registry. An empty definitions tree is tolerated +// and yields an empty (but non-nil) Registry. +func NewRegistry() (*Registry, error) { + ds, err := loadDorks() + if err != nil { + return nil, fmt.Errorf("loading dorks: %w", err) + } + return NewRegistryFromDorks(ds), nil +} + +// NewRegistryFromDorks builds a Registry from an explicit slice of dorks +// without touching the embedded filesystem. Intended for tests. +func NewRegistryFromDorks(ds []Dork) *Registry { + r := &Registry{ + dorks: make([]Dork, len(ds)), + byID: make(map[string]int, len(ds)), + bySource: make(map[string][]int), + byCategory: make(map[string][]int), + } + copy(r.dorks, ds) + for i, d := range r.dorks { + r.byID[d.ID] = i + r.bySource[d.Source] = append(r.bySource[d.Source], i) + r.byCategory[d.Category] = append(r.byCategory[d.Category], i) + } + return r +} + +// List returns all loaded dorks. The returned slice must not be mutated. +func (r *Registry) List() []Dork { + return r.dorks +} + +// Get returns the dork with the given id and a boolean indicating whether it +// was found. +func (r *Registry) Get(id string) (Dork, bool) { + idx, ok := r.byID[id] + if !ok { + return Dork{}, false + } + return r.dorks[idx], true +} + +// ListBySource returns every dork declared for the given source. +func (r *Registry) ListBySource(source string) []Dork { + idxs := r.bySource[source] + out := make([]Dork, 0, len(idxs)) + for _, i := range idxs { + out = append(out, r.dorks[i]) + } + return out +} + +// ListByCategory returns every dork tagged with the given category. +func (r *Registry) ListByCategory(category string) []Dork { + idxs := r.byCategory[category] + out := make([]Dork, 0, len(idxs)) + for _, i := range idxs { + out = append(out, r.dorks[i]) + } + return out +} + +// Stats returns aggregate counts grouped by source and category. +func (r *Registry) Stats() Stats { + s := Stats{ + Total: len(r.dorks), + BySource: make(map[string]int, len(r.bySource)), + ByCategory: make(map[string]int, len(r.byCategory)), + } + for src, idxs := range r.bySource { + s.BySource[src] = len(idxs) + } + for cat, idxs := range r.byCategory { + s.ByCategory[cat] = len(idxs) + } + return s +} diff --git a/pkg/dorks/registry_test.go b/pkg/dorks/registry_test.go new file mode 100644 index 0000000..b484006 --- /dev/null +++ b/pkg/dorks/registry_test.go @@ -0,0 +1,190 @@ +package dorks_test + +import ( + "context" + "errors" + "testing" + + "github.com/salvacybersec/keyhunter/pkg/dorks" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func fixture() []dorks.Dork { + return []dorks.Dork{ + { + ID: "openai-github-envfile", + Name: "OpenAI API Key in .env files", + Source: "github", + Category: "frontier", + Query: "sk-proj- extension:env", + Description: "Finds OpenAI project keys exposed in committed .env files", + Tags: []string{"openai", "env", "tier1"}, + }, + { + ID: "anthropic-github-env", + Name: "Anthropic Key in .env", + Source: "github", + Category: "frontier", + Query: "sk-ant-api03- extension:env", + }, + { + ID: "shodan-openai-banner", + Name: "OpenAI banner on Shodan", + Source: "shodan", + Category: "infrastructure", + Query: "product:openai", + }, + } +} + +func TestRegistry_LoadsAndIndexesDorks(t *testing.T) { + r := dorks.NewRegistryFromDorks(fixture()) + require.NotNil(t, r) + assert.Len(t, r.List(), 3) +} + +func TestRegistry_Get(t *testing.T) { + r := dorks.NewRegistryFromDorks(fixture()) + + d, ok := r.Get("openai-github-envfile") + require.True(t, ok) + assert.Equal(t, "github", d.Source) + assert.Equal(t, "frontier", d.Category) + assert.Equal(t, "sk-proj- extension:env", d.Query) + + _, ok = r.Get("does-not-exist") + assert.False(t, ok) +} + +func TestRegistry_ListBySource(t *testing.T) { + r := dorks.NewRegistryFromDorks(fixture()) + + gh := r.ListBySource("github") + assert.Len(t, gh, 2) + for _, d := range gh { + assert.Equal(t, "github", d.Source) + } + + shodan := r.ListBySource("shodan") + assert.Len(t, shodan, 1) + + assert.Empty(t, r.ListBySource("fofa")) +} + +func TestRegistry_ListByCategory(t *testing.T) { + r := dorks.NewRegistryFromDorks(fixture()) + + frontier := r.ListByCategory("frontier") + assert.Len(t, frontier, 2) + for _, d := range frontier { + assert.Equal(t, "frontier", d.Category) + } + + infra := r.ListByCategory("infrastructure") + assert.Len(t, infra, 1) + + assert.Empty(t, r.ListByCategory("emerging")) +} + +func TestRegistry_Stats(t *testing.T) { + r := dorks.NewRegistryFromDorks(fixture()) + + s := r.Stats() + assert.Equal(t, 3, s.Total) + assert.Equal(t, 2, s.BySource["github"]) + assert.Equal(t, 1, s.BySource["shodan"]) + assert.Equal(t, 2, s.ByCategory["frontier"]) + assert.Equal(t, 1, s.ByCategory["infrastructure"]) +} + +func TestNewRegistry_EmptyDefinitionsTreeOK(t *testing.T) { + // The embedded definitions tree contains only .gitkeep in this plan. + // NewRegistry must tolerate that and return an empty but usable Registry. + r, err := dorks.NewRegistry() + require.NoError(t, err) + require.NotNil(t, r) + assert.GreaterOrEqual(t, len(r.List()), 0) +} + +func TestDork_Validate(t *testing.T) { + tests := []struct { + name string + dork dorks.Dork + wantErr bool + }{ + { + name: "valid", + dork: dorks.Dork{ID: "x", Source: "github", Query: "foo"}, + wantErr: false, + }, + { + name: "missing id", + dork: dorks.Dork{Source: "github", Query: "foo"}, + wantErr: true, + }, + { + name: "missing source", + dork: dorks.Dork{ID: "x", Query: "foo"}, + wantErr: true, + }, + { + name: "missing query", + dork: dorks.Dork{ID: "x", Source: "github"}, + wantErr: true, + }, + { + name: "unknown source", + dork: dorks.Dork{ID: "x", Source: "reddit", Query: "foo"}, + wantErr: true, + }, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + err := tc.dork.Validate() + if tc.wantErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + } + }) + } +} + +func TestRunner_UnknownSourceReturnsErrSourceNotImplemented(t *testing.T) { + runner := dorks.NewRunner() + d := dorks.Dork{ID: "x", Source: "shodan", Category: "infrastructure", Query: "product:openai"} + + _, err := runner.Run(context.Background(), d, 10) + require.Error(t, err) + assert.True(t, errors.Is(err, dorks.ErrSourceNotImplemented)) +} + +// stubExecutor is a test-only Executor that records invocations. +type stubExecutor struct { + source string + calls int +} + +func (s *stubExecutor) Source() string { return s.source } +func (s *stubExecutor) Execute(_ context.Context, d dorks.Dork, _ int) ([]dorks.Match, error) { + s.calls++ + return []dorks.Match{{DorkID: d.ID, Source: s.source, Snippet: "hit"}}, nil +} + +func TestRunner_RegisterAndDispatch(t *testing.T) { + runner := dorks.NewRunner() + stub := &stubExecutor{source: "github"} + runner.Register(stub) + + got, ok := runner.Executor("github") + require.True(t, ok) + assert.Equal(t, "github", got.Source()) + + d := dorks.Dork{ID: "t1", Source: "github", Query: "sk-"} + matches, err := runner.Run(context.Background(), d, 5) + require.NoError(t, err) + assert.Equal(t, 1, stub.calls) + require.Len(t, matches, 1) + assert.Equal(t, "t1", matches[0].DorkID) +} diff --git a/pkg/dorks/schema.go b/pkg/dorks/schema.go new file mode 100644 index 0000000..3c1ab9c --- /dev/null +++ b/pkg/dorks/schema.go @@ -0,0 +1,80 @@ +// Package dorks provides YAML-embedded dork definitions and per-source +// executors for the KeyHunter dork engine. The package mirrors the +// pkg/providers pattern: dorks are authored as YAML files under +// pkg/dorks/definitions/{source}/*.yaml and compiled into the binary via +// go:embed, then loaded into an in-memory Registry at startup. +package dorks + +import ( + "fmt" + "strings" +) + +// Dork is a single dork definition loaded from a YAML file under +// pkg/dorks/definitions/{source}/*.yaml. +type Dork struct { + ID string `yaml:"id"` + Name string `yaml:"name"` + Source string `yaml:"source"` // github|google|shodan|censys|zoomeye|fofa|gitlab|bing + Category string `yaml:"category"` // frontier|specialized|infrastructure|emerging|enterprise + Query string `yaml:"query"` + Description string `yaml:"description"` + Tags []string `yaml:"tags"` +} + +// ValidSources enumerates the dork source backends recognised by the engine. +// Executors for most of these arrive in later phases (OSINT 9-16); the +// foundation plan only wires the GitHub executor live. +var ValidSources = []string{ + "github", + "google", + "shodan", + "censys", + "zoomeye", + "fofa", + "gitlab", + "bing", +} + +// ValidCategories enumerates the dork taxonomy buckets used for filtering. +var ValidCategories = []string{ + "frontier", + "specialized", + "infrastructure", + "emerging", + "enterprise", +} + +// Validate returns a non-nil error when the dork is missing required fields +// or declares an unknown source. +func (d Dork) Validate() error { + if strings.TrimSpace(d.ID) == "" { + return fmt.Errorf("dork: id is required") + } + if strings.TrimSpace(d.Source) == "" { + return fmt.Errorf("dork %q: source is required", d.ID) + } + if strings.TrimSpace(d.Query) == "" { + return fmt.Errorf("dork %q: query is required", d.ID) + } + if !contains(ValidSources, d.Source) { + return fmt.Errorf("dork %q: source %q is not one of %v", d.ID, d.Source, ValidSources) + } + return nil +} + +// Stats holds aggregate counts over a Registry, returned by Registry.Stats. +type Stats struct { + Total int + BySource map[string]int + ByCategory map[string]int +} + +func contains(haystack []string, needle string) bool { + for _, h := range haystack { + if h == needle { + return true + } + } + return false +}