feat(08-01): add pkg/dorks foundation (schema, loader, registry, executor)

- Dork schema with Validate() mirroring provider YAML pattern
- go:embed loader tolerating empty definitions tree
- Registry with List/Get/Stats/ListBySource/ListByCategory
- Executor interface + Runner dispatch + ErrSourceNotImplemented
- Placeholder definitions/.gitkeep and repo-root dorks/.gitkeep
- Full unit test coverage for registry, validation, and runner dispatch
This commit is contained in:
salvacybersec
2026-04-06 00:15:32 +03:00
parent 46cf55ad37
commit fd6efbb4c2
7 changed files with 497 additions and 0 deletions

View File

75
pkg/dorks/executor.go Normal file
View File

@@ -0,0 +1,75 @@
package dorks
import (
"context"
"errors"
"fmt"
)
// Sentinel errors surfaced by the dork execution layer. Match them with
// errors.Is rather than ==, because Runner.Run returns
// ErrSourceNotImplemented wrapped with extra context.
var (
	// ErrSourceNotImplemented is reported by Runner.Run when the dork's
	// source has no registered executor. Most sources arrive in Phase 9-16
	// (OSINT); this phase only wires the GitHub executor live (Plan 08-05).
	ErrSourceNotImplemented = errors.New("dork source not yet implemented")

	// ErrMissingAuth is the error an Executor returns when its source needs
	// credentials (e.g. GITHUB_TOKEN) that have not been configured.
	ErrMissingAuth = errors.New("dork source requires auth credentials")
)
// Match describes one hit returned by an Executor. Downstream code passes
// Snippet on to the engine detection pipeline.
type Match struct {
	// DorkID is the ID of the dork that produced this hit.
	DorkID string
	// Source is the source backend identifier the hit came from.
	Source string
	// URL is the location of the hit.
	URL string
	// Snippet is the content chunk forwarded to the detector.
	Snippet string
	// Path is the file path inside the source (repo, URL, etc.).
	Path string
}
// Executor runs a single dork against its source backend. Implementations
// live in per-source files (executor_github.go, executor_shodan.go, ...).
// All executors except GitHub are stubs in Phase 8 and return
// ErrSourceNotImplemented.
//
// A Runner indexes executors by the value returned from Source, so each
// implementation should report one stable identifier.
type Executor interface {
// Source returns the dork source identifier this executor handles,
// e.g. "github". Must match one of ValidSources. Runner.Register uses
// this value as the lookup key.
Source() string
// Execute runs the dork and returns matches. limit bounds the number
// of results to request from the backend (zero means backend default).
// ErrMissingAuth is the sentinel intended for sources whose credentials
// are not configured.
Execute(ctx context.Context, d Dork, limit int) ([]Match, error)
}
// Runner dispatches dorks to the correct per-source Executor. Construct one
// with NewRunner, then add executors with Register.
type Runner struct {
// executors maps a source identifier (Executor.Source) to the executor
// registered for it.
executors map[string]Executor
}
// NewRunner builds a Runner with no executors installed. Use Register to
// attach per-source executors. In Phase 8 only the GitHub executor is
// registered (Plan 08-05); Run reports ErrSourceNotImplemented for every
// other source.
func NewRunner() *Runner {
	runner := new(Runner)
	runner.executors = map[string]Executor{}
	return runner
}
// Register installs an Executor under the key returned by e.Source(),
// replacing any executor previously registered for that source.
//
// The executor map is allocated on first use, so a zero-value Runner
// (one not created through NewRunner) can be used safely instead of
// panicking on a write to a nil map.
func (r *Runner) Register(e Executor) {
	if r.executors == nil {
		r.executors = make(map[string]Executor)
	}
	r.executors[e.Source()] = e
}
// Executor looks up the Executor registered for source. The boolean result
// is true when one exists; otherwise the returned Executor is nil.
func (r *Runner) Executor(source string) (Executor, bool) {
	if found, present := r.executors[source]; present {
		return found, true
	}
	return nil, false
}
// Run hands the dork to the Executor registered for d.Source and returns
// whatever that executor produces. When no executor is registered for the
// source, the result is an error wrapping ErrSourceNotImplemented that also
// names the source.
func (r *Runner) Run(ctx context.Context, d Dork, limit int) ([]Match, error) {
	if handler, registered := r.executors[d.Source]; registered {
		return handler.Execute(ctx, d, limit)
	}
	return nil, fmt.Errorf("%w: %s (coming Phase 9-16)", ErrSourceNotImplemented, d.Source)
}

60
pkg/dorks/loader.go Normal file
View File

@@ -0,0 +1,60 @@
package dorks
import (
"embed"
"errors"
"fmt"
"io/fs"
"path/filepath"
"strings"
"gopkg.in/yaml.v3"
)
// definitionsFS embeds every file under pkg/dorks/definitions. The trailing
// `/*` is deliberate: it tolerates an empty tree containing only a .gitkeep
// placeholder, which is the case for this foundation plan before Wave 2
// plans drop in 150+ real dork YAML files. (A glob pattern matches dot-files
// such as .gitkeep, whereas naming the bare directory would exclude them and
// leave the embed with nothing to match.)
//
// loadDorks walks this filesystem starting at the "definitions" root.
//
//go:embed definitions/*
var definitionsFS embed.FS
// loadDorks walks the embedded definitions tree and returns every Dork found
// in a *.yaml or *.yml file (extension matched case-insensitively).
//
// Non-YAML files (e.g. .gitkeep) are ignored, and an empty tree yields
// (nil, nil). The walk stops at the first failure: read, parse, and
// validation errors are wrapped with the offending file path. Two files
// declaring the same dork ID are also rejected, because the Registry indexes
// dorks by ID and a duplicate would silently shadow the earlier definition
// while still being counted by List and Stats.
func loadDorks() ([]Dork, error) {
	var dorks []Dork
	// firstSeen records, per dork ID, the file that first declared it so a
	// duplicate can be reported together with both locations.
	firstSeen := make(map[string]string)

	err := fs.WalkDir(definitionsFS, "definitions", func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			// Empty definitions directory (only .gitkeep) is valid.
			if errors.Is(err, fs.ErrNotExist) {
				return fs.SkipAll
			}
			return err
		}
		if d.IsDir() {
			return nil
		}

		ext := filepath.Ext(path)
		if !strings.EqualFold(ext, ".yaml") && !strings.EqualFold(ext, ".yml") {
			return nil
		}

		data, err := definitionsFS.ReadFile(path)
		if err != nil {
			return fmt.Errorf("reading dork file %s: %w", path, err)
		}

		var dk Dork
		if err := yaml.Unmarshal(data, &dk); err != nil {
			return fmt.Errorf("parsing dork %s: %w", path, err)
		}
		if err := dk.Validate(); err != nil {
			return fmt.Errorf("validating dork %s: %w", path, err)
		}
		if prev, dup := firstSeen[dk.ID]; dup {
			return fmt.Errorf("validating dork %s: duplicate id %q (already defined in %s)", path, dk.ID, prev)
		}
		firstSeen[dk.ID] = path

		dorks = append(dorks, dk)
		return nil
	})
	if err != nil {
		return nil, err
	}
	return dorks, nil
}

92
pkg/dorks/registry.go Normal file
View File

@@ -0,0 +1,92 @@
package dorks
import "fmt"
// Registry is the in-memory store of loaded dork definitions. It is built
// once at startup (via NewRegistry) and is safe for concurrent reads.
// None of its methods modify the Registry after construction.
type Registry struct {
// dorks holds the definitions in load order; the index maps below store
// positions into this slice.
dorks []Dork
// byID maps a dork ID to its position in dorks.
byID map[string]int
// bySource maps a source identifier to the positions of its dorks.
bySource map[string][]int
// byCategory maps a category to the positions of its dorks.
byCategory map[string][]int
}
// NewRegistry reads and validates all embedded dork YAML files and indexes
// them into a Registry ready for queries. A definitions tree without any
// dork files is accepted and produces an empty, non-nil Registry. Loader
// failures are returned wrapped.
func NewRegistry() (*Registry, error) {
	loaded, loadErr := loadDorks()
	if loadErr != nil {
		return nil, fmt.Errorf("loading dorks: %w", loadErr)
	}
	reg := NewRegistryFromDorks(loaded)
	return reg, nil
}
// NewRegistryFromDorks indexes an explicit slice of dorks into a Registry
// without reading the embedded filesystem. It is intended for tests. The
// input slice is copied, so later changes to its elements do not reach the
// Registry.
func NewRegistryFromDorks(ds []Dork) *Registry {
	owned := make([]Dork, len(ds))
	copy(owned, ds)

	ids := make(map[string]int, len(ds))
	sources := make(map[string][]int)
	categories := make(map[string][]int)
	for pos := range owned {
		entry := &owned[pos]
		ids[entry.ID] = pos
		sources[entry.Source] = append(sources[entry.Source], pos)
		categories[entry.Category] = append(categories[entry.Category], pos)
	}

	return &Registry{
		dorks:      owned,
		byID:       ids,
		bySource:   sources,
		byCategory: categories,
	}
}
// List exposes every loaded dork in load order. The result is the
// Registry's own backing slice, not a copy, so callers must not mutate it.
func (r *Registry) List() []Dork {
	all := r.dorks
	return all
}
// Get looks a dork up by id. The boolean result is true when the id is
// known; otherwise the zero Dork is returned alongside false.
func (r *Registry) Get(id string) (Dork, bool) {
	if pos, known := r.byID[id]; known {
		return r.dorks[pos], true
	}
	return Dork{}, false
}
// ListBySource collects the dorks declared for source into a new slice.
// An unknown source yields an empty, non-nil slice.
func (r *Registry) ListBySource(source string) []Dork {
	positions := r.bySource[source]
	matched := make([]Dork, len(positions))
	for n, pos := range positions {
		matched[n] = r.dorks[pos]
	}
	return matched
}
// ListByCategory collects the dorks tagged with category into a new slice.
// An unknown category yields an empty, non-nil slice.
func (r *Registry) ListByCategory(category string) []Dork {
	positions := r.byCategory[category]
	matched := make([]Dork, len(positions))
	for n, pos := range positions {
		matched[n] = r.dorks[pos]
	}
	return matched
}
// Stats summarises the Registry: the total number of dorks plus per-source
// and per-category counts. The maps in the result are freshly allocated.
func (r *Registry) Stats() Stats {
	perSource := make(map[string]int, len(r.bySource))
	for name, members := range r.bySource {
		perSource[name] = len(members)
	}

	perCategory := make(map[string]int, len(r.byCategory))
	for name, members := range r.byCategory {
		perCategory[name] = len(members)
	}

	return Stats{
		Total:      len(r.dorks),
		BySource:   perSource,
		ByCategory: perCategory,
	}
}

190
pkg/dorks/registry_test.go Normal file
View File

@@ -0,0 +1,190 @@
package dorks_test
import (
"context"
"errors"
"testing"
"github.com/salvacybersec/keyhunter/pkg/dorks"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// fixture returns three sample dorks used across the registry tests: two
// GitHub "frontier" dorks and one Shodan "infrastructure" dork.
func fixture() []dorks.Dork {
	openaiEnv := dorks.Dork{
		ID:          "openai-github-envfile",
		Name:        "OpenAI API Key in .env files",
		Source:      "github",
		Category:    "frontier",
		Query:       "sk-proj- extension:env",
		Description: "Finds OpenAI project keys exposed in committed .env files",
		Tags:        []string{"openai", "env", "tier1"},
	}
	anthropicEnv := dorks.Dork{
		ID:       "anthropic-github-env",
		Name:     "Anthropic Key in .env",
		Source:   "github",
		Category: "frontier",
		Query:    "sk-ant-api03- extension:env",
	}
	shodanBanner := dorks.Dork{
		ID:       "shodan-openai-banner",
		Name:     "OpenAI banner on Shodan",
		Source:   "shodan",
		Category: "infrastructure",
		Query:    "product:openai",
	}
	return []dorks.Dork{openaiEnv, anthropicEnv, shodanBanner}
}
// TestRegistry_LoadsAndIndexesDorks checks that a Registry built from the
// fixture is non-nil and lists all three dorks.
func TestRegistry_LoadsAndIndexesDorks(t *testing.T) {
	reg := dorks.NewRegistryFromDorks(fixture())
	require.NotNil(t, reg)

	all := reg.List()
	assert.Len(t, all, 3)
}
// TestRegistry_Get checks lookup by ID for both a known and an unknown dork.
func TestRegistry_Get(t *testing.T) {
	reg := dorks.NewRegistryFromDorks(fixture())

	found, present := reg.Get("openai-github-envfile")
	require.True(t, present)
	assert.Equal(t, "github", found.Source)
	assert.Equal(t, "frontier", found.Category)
	assert.Equal(t, "sk-proj- extension:env", found.Query)

	_, present = reg.Get("does-not-exist")
	assert.False(t, present)
}
// TestRegistry_ListBySource checks source filtering, including a source with
// no dorks in the fixture.
func TestRegistry_ListBySource(t *testing.T) {
	reg := dorks.NewRegistryFromDorks(fixture())

	githubDorks := reg.ListBySource("github")
	assert.Len(t, githubDorks, 2)
	for i := range githubDorks {
		assert.Equal(t, "github", githubDorks[i].Source)
	}

	shodanDorks := reg.ListBySource("shodan")
	assert.Len(t, shodanDorks, 1)

	assert.Empty(t, reg.ListBySource("fofa"))
}
// TestRegistry_ListByCategory checks category filtering, including a
// category with no dorks in the fixture.
func TestRegistry_ListByCategory(t *testing.T) {
	reg := dorks.NewRegistryFromDorks(fixture())

	frontierDorks := reg.ListByCategory("frontier")
	assert.Len(t, frontierDorks, 2)
	for i := range frontierDorks {
		assert.Equal(t, "frontier", frontierDorks[i].Category)
	}

	infraDorks := reg.ListByCategory("infrastructure")
	assert.Len(t, infraDorks, 1)

	assert.Empty(t, reg.ListByCategory("emerging"))
}
// TestRegistry_Stats checks the total and the per-source and per-category
// counts computed over the fixture.
func TestRegistry_Stats(t *testing.T) {
	stats := dorks.NewRegistryFromDorks(fixture()).Stats()

	assert.Equal(t, 3, stats.Total)

	assert.Equal(t, 2, stats.BySource["github"])
	assert.Equal(t, 1, stats.BySource["shodan"])

	assert.Equal(t, 2, stats.ByCategory["frontier"])
	assert.Equal(t, 1, stats.ByCategory["infrastructure"])
}
// TestNewRegistry_EmptyDefinitionsTreeOK checks that NewRegistry succeeds on
// the embedded definitions tree, which contains only .gitkeep in this plan,
// and returns a usable Registry.
//
// The assertions hold for an empty tree and keep holding once real dork
// files are added: every listed dork must validate, be retrievable by its
// ID, and be counted in Stats. (The previous check, len(...) >= 0, could
// never fail.)
func TestNewRegistry_EmptyDefinitionsTreeOK(t *testing.T) {
	r, err := dorks.NewRegistry()
	require.NoError(t, err)
	require.NotNil(t, r)

	all := r.List()
	assert.Equal(t, len(all), r.Stats().Total)
	for _, d := range all {
		assert.NoError(t, d.Validate())
		got, ok := r.Get(d.ID)
		assert.True(t, ok)
		assert.Equal(t, d.ID, got.ID)
	}
}
// TestDork_Validate runs Dork.Validate over a table of cases covering a
// valid dork, each missing required field, and an unknown source.
func TestDork_Validate(t *testing.T) {
	type validateCase struct {
		name    string
		dork    dorks.Dork
		wantErr bool
	}
	cases := []validateCase{
		{name: "valid", dork: dorks.Dork{ID: "x", Source: "github", Query: "foo"}, wantErr: false},
		{name: "missing id", dork: dorks.Dork{Source: "github", Query: "foo"}, wantErr: true},
		{name: "missing source", dork: dorks.Dork{ID: "x", Query: "foo"}, wantErr: true},
		{name: "missing query", dork: dorks.Dork{ID: "x", Source: "github"}, wantErr: true},
		{name: "unknown source", dork: dorks.Dork{ID: "x", Source: "reddit", Query: "foo"}, wantErr: true},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			err := tc.dork.Validate()
			if !tc.wantErr {
				assert.NoError(t, err)
				return
			}
			assert.Error(t, err)
		})
	}
}
// TestRunner_UnknownSourceReturnsErrSourceNotImplemented checks that running
// a dork whose source has no registered executor yields an error matching
// ErrSourceNotImplemented.
func TestRunner_UnknownSourceReturnsErrSourceNotImplemented(t *testing.T) {
	shodanDork := dorks.Dork{
		ID:       "x",
		Source:   "shodan",
		Category: "infrastructure",
		Query:    "product:openai",
	}

	_, runErr := dorks.NewRunner().Run(context.Background(), shodanDork, 10)
	require.Error(t, runErr)

	isNotImplemented := errors.Is(runErr, dorks.ErrSourceNotImplemented)
	assert.True(t, isNotImplemented)
}
// stubExecutor is a test-only Executor. It reports a configurable source and
// counts how many times Execute has been called.
type stubExecutor struct {
	source string // identifier returned by Source
	calls  int    // number of Execute invocations so far
}

// Source returns the configured source identifier.
func (s *stubExecutor) Source() string {
	return s.source
}

// Execute records the call and returns a single canned match that carries
// the dork's ID and the stub's source.
func (s *stubExecutor) Execute(_ context.Context, d dorks.Dork, _ int) ([]dorks.Match, error) {
	s.calls++
	hit := dorks.Match{DorkID: d.ID, Source: s.source, Snippet: "hit"}
	return []dorks.Match{hit}, nil
}
func TestRunner_RegisterAndDispatch(t *testing.T) {
runner := dorks.NewRunner()
stub := &stubExecutor{source: "github"}
runner.Register(stub)
got, ok := runner.Executor("github")
require.True(t, ok)
assert.Equal(t, "github", got.Source())
d := dorks.Dork{ID: "t1", Source: "github", Query: "sk-"}
matches, err := runner.Run(context.Background(), d, 5)
require.NoError(t, err)
assert.Equal(t, 1, stub.calls)
require.Len(t, matches, 1)
assert.Equal(t, "t1", matches[0].DorkID)
}

80
pkg/dorks/schema.go Normal file
View File

@@ -0,0 +1,80 @@
// Package dorks provides YAML-embedded dork definitions and per-source
// executors for the KeyHunter dork engine. The package mirrors the
// pkg/providers pattern: dorks are authored as YAML files under
// pkg/dorks/definitions/{source}/*.yaml and compiled into the binary via
// go:embed, then loaded into an in-memory Registry at startup.
package dorks
import (
"fmt"
"strings"
)
// Dork is one dork definition, decoded from a YAML file under
// pkg/dorks/definitions/{source}/*.yaml.
type Dork struct {
	// ID identifies the dork; Validate requires it to be non-blank.
	ID string `yaml:"id"`
	// Name is the human-readable title.
	Name string `yaml:"name"`
	// Source is the backend the query targets:
	// github|google|shodan|censys|zoomeye|fofa|gitlab|bing.
	Source string `yaml:"source"`
	// Category is the taxonomy bucket:
	// frontier|specialized|infrastructure|emerging|enterprise.
	Category string `yaml:"category"`
	// Query is the search expression; Validate requires it to be non-blank.
	Query string `yaml:"query"`
	// Description is free-form explanatory text.
	Description string `yaml:"description"`
	// Tags are free-form labels attached to the dork.
	Tags []string `yaml:"tags"`
}
// Enumerations of the values recognised for Dork.Source and Dork.Category.
var (
	// ValidSources lists the dork source backends the engine recognises.
	// Executors for most of them arrive in later phases (OSINT 9-16); the
	// foundation plan only wires the GitHub executor live.
	ValidSources = []string{
		"github", "google", "shodan", "censys",
		"zoomeye", "fofa", "gitlab", "bing",
	}

	// ValidCategories lists the dork taxonomy buckets used for filtering.
	ValidCategories = []string{
		"frontier", "specialized", "infrastructure", "emerging", "enterprise",
	}
)
// Validate checks the dork and reports the first problem it finds, in this
// order: blank ID, blank source, blank query, then a source that is not
// listed in ValidSources. Fields are blank when they are empty or contain
// only whitespace. A nil result means the dork passed every check.
func (d Dork) Validate() error {
	switch {
	case strings.TrimSpace(d.ID) == "":
		return fmt.Errorf("dork: id is required")
	case strings.TrimSpace(d.Source) == "":
		return fmt.Errorf("dork %q: source is required", d.ID)
	case strings.TrimSpace(d.Query) == "":
		return fmt.Errorf("dork %q: query is required", d.ID)
	case !contains(ValidSources, d.Source):
		return fmt.Errorf("dork %q: source %q is not one of %v", d.ID, d.Source, ValidSources)
	}
	return nil
}
// Stats is the summary produced by Registry.Stats: aggregate counts over the
// dorks held in a Registry.
type Stats struct {
	// Total is the number of dorks in the Registry.
	Total int
	// BySource counts dorks per source identifier.
	BySource map[string]int
	// ByCategory counts dorks per category.
	ByCategory map[string]int
}
// contains reports whether needle is an element of haystack, compared by
// exact (case-sensitive) string equality. A nil or empty haystack never
// contains anything.
func contains(haystack []string, needle string) bool {
	for i := 0; i < len(haystack); i++ {
		if haystack[i] != needle {
			continue
		}
		return true
	}
	return false
}