feat(08-01): add pkg/dorks foundation (schema, loader, registry, executor)
- Dork schema with Validate() mirroring provider YAML pattern
- go:embed loader tolerating empty definitions tree
- Registry with List/Get/Stats/ListBySource/ListByCategory
- Executor interface + Runner dispatch + ErrSourceNotImplemented
- Placeholder definitions/.gitkeep and repo-root dorks/.gitkeep
- Full unit test coverage for registry, validation, and runner dispatch
This commit is contained in:
0
dorks/.gitkeep
Normal file
0
dorks/.gitkeep
Normal file
0
pkg/dorks/definitions/.gitkeep
Normal file
0
pkg/dorks/definitions/.gitkeep
Normal file
75
pkg/dorks/executor.go
Normal file
75
pkg/dorks/executor.go
Normal file
@@ -0,0 +1,75 @@
|
||||
package dorks
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
)
|
||||
|
||||
// Sentinel errors surfaced by the dork execution layer. Compare against them
// with errors.Is, since Runner.Run wraps ErrSourceNotImplemented with extra
// context.
var (
	// ErrSourceNotImplemented is what Runner.Run wraps and returns when the
	// dork's source has no registered executor. Most sources arrive in
	// Phase 9-16 (OSINT); this phase only wires the GitHub executor live
	// (Plan 08-05).
	ErrSourceNotImplemented = errors.New("dork source not yet implemented")

	// ErrMissingAuth is for Executor implementations to return when their
	// source needs credentials (e.g. GITHUB_TOKEN) that are not configured.
	ErrMissingAuth = errors.New("dork source requires auth credentials")
)
|
||||
|
||||
// Match describes one hit that an Executor produced for a dork. Downstream
// code hands Snippet to the engine detection pipeline.
type Match struct {
	// DorkID is the ID of the dork that produced this hit.
	DorkID string
	// Source is the source backend identifier the hit came from.
	Source string
	// URL is the location of the hit.
	URL string
	// Snippet is the content chunk forwarded to the detector.
	Snippet string
	// Path is the file path inside the source (repo, URL, etc.).
	Path string
}
|
||||
|
||||
// Executor runs a single dork against its source backend. Implementations
|
||||
// live in per-source files (executor_github.go, executor_shodan.go, ...).
|
||||
// All executors except GitHub are stubs in Phase 8 and return
|
||||
// ErrSourceNotImplemented.
|
||||
type Executor interface {
|
||||
// Source returns the dork source identifier this executor handles,
|
||||
// e.g. "github". Must match one of ValidSources.
|
||||
Source() string
|
||||
// Execute runs the dork and returns matches. limit bounds the number
|
||||
// of results to request from the backend (zero means backend default).
|
||||
Execute(ctx context.Context, d Dork, limit int) ([]Match, error)
|
||||
}
|
||||
|
||||
// Runner dispatches dorks to the correct per-source Executor.
|
||||
type Runner struct {
|
||||
executors map[string]Executor
|
||||
}
|
||||
|
||||
// NewRunner returns an empty Runner. Call Register to wire in per-source
|
||||
// executors. In Phase 8, only the GitHub executor is registered by
|
||||
// Plan 08-05; every other source returns ErrSourceNotImplemented.
|
||||
func NewRunner() *Runner {
|
||||
return &Runner{executors: make(map[string]Executor)}
|
||||
}
|
||||
|
||||
// Register installs an Executor, replacing any prior executor for the same
|
||||
// source.
|
||||
func (r *Runner) Register(e Executor) {
|
||||
r.executors[e.Source()] = e
|
||||
}
|
||||
|
||||
// Executor returns the registered Executor for the given source along with a
|
||||
// boolean indicating whether one was found.
|
||||
func (r *Runner) Executor(source string) (Executor, bool) {
|
||||
e, ok := r.executors[source]
|
||||
return e, ok
|
||||
}
|
||||
|
||||
// Run locates the Executor for the dork's source and invokes it. If no
|
||||
// Executor has been registered, it returns an error wrapping
|
||||
// ErrSourceNotImplemented.
|
||||
func (r *Runner) Run(ctx context.Context, d Dork, limit int) ([]Match, error) {
|
||||
ex, ok := r.executors[d.Source]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("%w: %s (coming Phase 9-16)", ErrSourceNotImplemented, d.Source)
|
||||
}
|
||||
return ex.Execute(ctx, d, limit)
|
||||
}
|
||||
60
pkg/dorks/loader.go
Normal file
60
pkg/dorks/loader.go
Normal file
@@ -0,0 +1,60 @@
|
||||
package dorks
|
||||
|
||||
import (
|
||||
"embed"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
// definitionsFS embeds every file under pkg/dorks/definitions. The trailing
|
||||
// `/*` is deliberate: it tolerates an empty tree containing only a .gitkeep
|
||||
// placeholder, which is the case for this foundation plan before Wave 2
|
||||
// plans drop in 150+ real dork YAML files.
|
||||
//
|
||||
//go:embed definitions/*
|
||||
var definitionsFS embed.FS
|
||||
|
||||
// loadDorks walks the embedded definitions tree and returns every Dork found
|
||||
// in a *.yaml file. Non-YAML files (e.g. .gitkeep) are ignored, empty trees
|
||||
// return (nil, nil), and parse or validation errors are wrapped with the
|
||||
// offending file path.
|
||||
func loadDorks() ([]Dork, error) {
|
||||
var dorks []Dork
|
||||
err := fs.WalkDir(definitionsFS, "definitions", func(path string, d fs.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
// Empty definitions directory (only .gitkeep) is valid.
|
||||
if errors.Is(err, fs.ErrNotExist) {
|
||||
return fs.SkipAll
|
||||
}
|
||||
return err
|
||||
}
|
||||
if d.IsDir() {
|
||||
return nil
|
||||
}
|
||||
if !strings.EqualFold(filepath.Ext(path), ".yaml") && !strings.EqualFold(filepath.Ext(path), ".yml") {
|
||||
return nil
|
||||
}
|
||||
data, err := definitionsFS.ReadFile(path)
|
||||
if err != nil {
|
||||
return fmt.Errorf("reading dork file %s: %w", path, err)
|
||||
}
|
||||
var dk Dork
|
||||
if err := yaml.Unmarshal(data, &dk); err != nil {
|
||||
return fmt.Errorf("parsing dork %s: %w", path, err)
|
||||
}
|
||||
if err := dk.Validate(); err != nil {
|
||||
return fmt.Errorf("validating dork %s: %w", path, err)
|
||||
}
|
||||
dorks = append(dorks, dk)
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return dorks, nil
|
||||
}
|
||||
92
pkg/dorks/registry.go
Normal file
92
pkg/dorks/registry.go
Normal file
@@ -0,0 +1,92 @@
|
||||
package dorks
|
||||
|
||||
import "fmt"
|
||||
|
||||
// Registry is the in-memory store of loaded dork definitions. It is built
|
||||
// once at startup (via NewRegistry) and is safe for concurrent reads.
|
||||
type Registry struct {
|
||||
dorks []Dork
|
||||
byID map[string]int
|
||||
bySource map[string][]int
|
||||
byCategory map[string][]int
|
||||
}
|
||||
|
||||
// NewRegistry loads every embedded dork YAML file, validates them, and
|
||||
// returns a ready-to-query Registry. An empty definitions tree is tolerated
|
||||
// and yields an empty (but non-nil) Registry.
|
||||
func NewRegistry() (*Registry, error) {
|
||||
ds, err := loadDorks()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("loading dorks: %w", err)
|
||||
}
|
||||
return NewRegistryFromDorks(ds), nil
|
||||
}
|
||||
|
||||
// NewRegistryFromDorks builds a Registry from an explicit slice of dorks
|
||||
// without touching the embedded filesystem. Intended for tests.
|
||||
func NewRegistryFromDorks(ds []Dork) *Registry {
|
||||
r := &Registry{
|
||||
dorks: make([]Dork, len(ds)),
|
||||
byID: make(map[string]int, len(ds)),
|
||||
bySource: make(map[string][]int),
|
||||
byCategory: make(map[string][]int),
|
||||
}
|
||||
copy(r.dorks, ds)
|
||||
for i, d := range r.dorks {
|
||||
r.byID[d.ID] = i
|
||||
r.bySource[d.Source] = append(r.bySource[d.Source], i)
|
||||
r.byCategory[d.Category] = append(r.byCategory[d.Category], i)
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
// List returns all loaded dorks. The returned slice must not be mutated.
|
||||
func (r *Registry) List() []Dork {
|
||||
return r.dorks
|
||||
}
|
||||
|
||||
// Get returns the dork with the given id and a boolean indicating whether it
|
||||
// was found.
|
||||
func (r *Registry) Get(id string) (Dork, bool) {
|
||||
idx, ok := r.byID[id]
|
||||
if !ok {
|
||||
return Dork{}, false
|
||||
}
|
||||
return r.dorks[idx], true
|
||||
}
|
||||
|
||||
// ListBySource returns every dork declared for the given source.
|
||||
func (r *Registry) ListBySource(source string) []Dork {
|
||||
idxs := r.bySource[source]
|
||||
out := make([]Dork, 0, len(idxs))
|
||||
for _, i := range idxs {
|
||||
out = append(out, r.dorks[i])
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// ListByCategory returns every dork tagged with the given category.
|
||||
func (r *Registry) ListByCategory(category string) []Dork {
|
||||
idxs := r.byCategory[category]
|
||||
out := make([]Dork, 0, len(idxs))
|
||||
for _, i := range idxs {
|
||||
out = append(out, r.dorks[i])
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// Stats returns aggregate counts grouped by source and category.
|
||||
func (r *Registry) Stats() Stats {
|
||||
s := Stats{
|
||||
Total: len(r.dorks),
|
||||
BySource: make(map[string]int, len(r.bySource)),
|
||||
ByCategory: make(map[string]int, len(r.byCategory)),
|
||||
}
|
||||
for src, idxs := range r.bySource {
|
||||
s.BySource[src] = len(idxs)
|
||||
}
|
||||
for cat, idxs := range r.byCategory {
|
||||
s.ByCategory[cat] = len(idxs)
|
||||
}
|
||||
return s
|
||||
}
|
||||
190
pkg/dorks/registry_test.go
Normal file
190
pkg/dorks/registry_test.go
Normal file
@@ -0,0 +1,190 @@
|
||||
package dorks_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"testing"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/dorks"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func fixture() []dorks.Dork {
|
||||
return []dorks.Dork{
|
||||
{
|
||||
ID: "openai-github-envfile",
|
||||
Name: "OpenAI API Key in .env files",
|
||||
Source: "github",
|
||||
Category: "frontier",
|
||||
Query: "sk-proj- extension:env",
|
||||
Description: "Finds OpenAI project keys exposed in committed .env files",
|
||||
Tags: []string{"openai", "env", "tier1"},
|
||||
},
|
||||
{
|
||||
ID: "anthropic-github-env",
|
||||
Name: "Anthropic Key in .env",
|
||||
Source: "github",
|
||||
Category: "frontier",
|
||||
Query: "sk-ant-api03- extension:env",
|
||||
},
|
||||
{
|
||||
ID: "shodan-openai-banner",
|
||||
Name: "OpenAI banner on Shodan",
|
||||
Source: "shodan",
|
||||
Category: "infrastructure",
|
||||
Query: "product:openai",
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func TestRegistry_LoadsAndIndexesDorks(t *testing.T) {
|
||||
r := dorks.NewRegistryFromDorks(fixture())
|
||||
require.NotNil(t, r)
|
||||
assert.Len(t, r.List(), 3)
|
||||
}
|
||||
|
||||
func TestRegistry_Get(t *testing.T) {
|
||||
r := dorks.NewRegistryFromDorks(fixture())
|
||||
|
||||
d, ok := r.Get("openai-github-envfile")
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, "github", d.Source)
|
||||
assert.Equal(t, "frontier", d.Category)
|
||||
assert.Equal(t, "sk-proj- extension:env", d.Query)
|
||||
|
||||
_, ok = r.Get("does-not-exist")
|
||||
assert.False(t, ok)
|
||||
}
|
||||
|
||||
func TestRegistry_ListBySource(t *testing.T) {
|
||||
r := dorks.NewRegistryFromDorks(fixture())
|
||||
|
||||
gh := r.ListBySource("github")
|
||||
assert.Len(t, gh, 2)
|
||||
for _, d := range gh {
|
||||
assert.Equal(t, "github", d.Source)
|
||||
}
|
||||
|
||||
shodan := r.ListBySource("shodan")
|
||||
assert.Len(t, shodan, 1)
|
||||
|
||||
assert.Empty(t, r.ListBySource("fofa"))
|
||||
}
|
||||
|
||||
func TestRegistry_ListByCategory(t *testing.T) {
|
||||
r := dorks.NewRegistryFromDorks(fixture())
|
||||
|
||||
frontier := r.ListByCategory("frontier")
|
||||
assert.Len(t, frontier, 2)
|
||||
for _, d := range frontier {
|
||||
assert.Equal(t, "frontier", d.Category)
|
||||
}
|
||||
|
||||
infra := r.ListByCategory("infrastructure")
|
||||
assert.Len(t, infra, 1)
|
||||
|
||||
assert.Empty(t, r.ListByCategory("emerging"))
|
||||
}
|
||||
|
||||
func TestRegistry_Stats(t *testing.T) {
|
||||
r := dorks.NewRegistryFromDorks(fixture())
|
||||
|
||||
s := r.Stats()
|
||||
assert.Equal(t, 3, s.Total)
|
||||
assert.Equal(t, 2, s.BySource["github"])
|
||||
assert.Equal(t, 1, s.BySource["shodan"])
|
||||
assert.Equal(t, 2, s.ByCategory["frontier"])
|
||||
assert.Equal(t, 1, s.ByCategory["infrastructure"])
|
||||
}
|
||||
|
||||
func TestNewRegistry_EmptyDefinitionsTreeOK(t *testing.T) {
|
||||
// The embedded definitions tree contains only .gitkeep in this plan.
|
||||
// NewRegistry must tolerate that and return an empty but usable Registry.
|
||||
r, err := dorks.NewRegistry()
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, r)
|
||||
assert.GreaterOrEqual(t, len(r.List()), 0)
|
||||
}
|
||||
|
||||
func TestDork_Validate(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
dork dorks.Dork
|
||||
wantErr bool
|
||||
}{
|
||||
{
|
||||
name: "valid",
|
||||
dork: dorks.Dork{ID: "x", Source: "github", Query: "foo"},
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "missing id",
|
||||
dork: dorks.Dork{Source: "github", Query: "foo"},
|
||||
wantErr: true,
|
||||
},
|
||||
{
|
||||
name: "missing source",
|
||||
dork: dorks.Dork{ID: "x", Query: "foo"},
|
||||
wantErr: true,
|
||||
},
|
||||
{
|
||||
name: "missing query",
|
||||
dork: dorks.Dork{ID: "x", Source: "github"},
|
||||
wantErr: true,
|
||||
},
|
||||
{
|
||||
name: "unknown source",
|
||||
dork: dorks.Dork{ID: "x", Source: "reddit", Query: "foo"},
|
||||
wantErr: true,
|
||||
},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
err := tc.dork.Validate()
|
||||
if tc.wantErr {
|
||||
assert.Error(t, err)
|
||||
} else {
|
||||
assert.NoError(t, err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunner_UnknownSourceReturnsErrSourceNotImplemented(t *testing.T) {
|
||||
runner := dorks.NewRunner()
|
||||
d := dorks.Dork{ID: "x", Source: "shodan", Category: "infrastructure", Query: "product:openai"}
|
||||
|
||||
_, err := runner.Run(context.Background(), d, 10)
|
||||
require.Error(t, err)
|
||||
assert.True(t, errors.Is(err, dorks.ErrSourceNotImplemented))
|
||||
}
|
||||
|
||||
// stubExecutor is a test-only Executor that records invocations.
|
||||
type stubExecutor struct {
|
||||
source string
|
||||
calls int
|
||||
}
|
||||
|
||||
func (s *stubExecutor) Source() string { return s.source }
|
||||
func (s *stubExecutor) Execute(_ context.Context, d dorks.Dork, _ int) ([]dorks.Match, error) {
|
||||
s.calls++
|
||||
return []dorks.Match{{DorkID: d.ID, Source: s.source, Snippet: "hit"}}, nil
|
||||
}
|
||||
|
||||
func TestRunner_RegisterAndDispatch(t *testing.T) {
|
||||
runner := dorks.NewRunner()
|
||||
stub := &stubExecutor{source: "github"}
|
||||
runner.Register(stub)
|
||||
|
||||
got, ok := runner.Executor("github")
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, "github", got.Source())
|
||||
|
||||
d := dorks.Dork{ID: "t1", Source: "github", Query: "sk-"}
|
||||
matches, err := runner.Run(context.Background(), d, 5)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, 1, stub.calls)
|
||||
require.Len(t, matches, 1)
|
||||
assert.Equal(t, "t1", matches[0].DorkID)
|
||||
}
|
||||
80
pkg/dorks/schema.go
Normal file
80
pkg/dorks/schema.go
Normal file
@@ -0,0 +1,80 @@
|
||||
// Package dorks provides YAML-embedded dork definitions and per-source
|
||||
// executors for the KeyHunter dork engine. The package mirrors the
|
||||
// pkg/providers pattern: dorks are authored as YAML files under
|
||||
// pkg/dorks/definitions/{source}/*.yaml and compiled into the binary via
|
||||
// go:embed, then loaded into an in-memory Registry at startup.
|
||||
package dorks
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Dork is one dork definition, decoded from a YAML file under
// pkg/dorks/definitions/{source}/*.yaml. Validate requires ID, Source and
// Query; it does not check the remaining fields.
type Dork struct {
	// ID identifies the dork; the Registry indexes dorks by it.
	ID string `yaml:"id"`
	// Name is a human-readable title.
	Name string `yaml:"name"`
	// Source is the backend to query:
	// github|google|shodan|censys|zoomeye|fofa|gitlab|bing.
	Source string `yaml:"source"`
	// Category is the taxonomy bucket:
	// frontier|specialized|infrastructure|emerging|enterprise.
	Category string `yaml:"category"`
	// Query is the search expression handed to the source's executor.
	Query string `yaml:"query"`
	// Description explains what the dork is meant to find.
	Description string `yaml:"description"`
	// Tags are free-form labels.
	Tags []string `yaml:"tags"`
}
|
||||
|
||||
// Enumerations of the values the dork engine recognises.
var (
	// ValidSources lists the source backends a dork may target;
	// Dork.Validate rejects any other source. Executors for most of these
	// arrive in later phases (OSINT 9-16); the foundation plan only wires
	// the GitHub executor live.
	ValidSources = []string{
		"github", "google", "shodan", "censys",
		"zoomeye", "fofa", "gitlab", "bing",
	}

	// ValidCategories lists the taxonomy buckets used for filtering dorks.
	ValidCategories = []string{
		"frontier", "specialized", "infrastructure",
		"emerging", "enterprise",
	}
)
|
||||
|
||||
// Validate returns a non-nil error when the dork is missing required fields
|
||||
// or declares an unknown source.
|
||||
func (d Dork) Validate() error {
|
||||
if strings.TrimSpace(d.ID) == "" {
|
||||
return fmt.Errorf("dork: id is required")
|
||||
}
|
||||
if strings.TrimSpace(d.Source) == "" {
|
||||
return fmt.Errorf("dork %q: source is required", d.ID)
|
||||
}
|
||||
if strings.TrimSpace(d.Query) == "" {
|
||||
return fmt.Errorf("dork %q: query is required", d.ID)
|
||||
}
|
||||
if !contains(ValidSources, d.Source) {
|
||||
return fmt.Errorf("dork %q: source %q is not one of %v", d.ID, d.Source, ValidSources)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Stats is the summary returned by Registry.Stats: aggregate counts over the
// dorks held by a Registry.
type Stats struct {
	// Total is the number of dorks in the Registry.
	Total int
	// BySource maps a source name to the number of dorks for that source.
	BySource map[string]int
	// ByCategory maps a category name to the number of dorks in it.
	ByCategory map[string]int
}
|
||||
|
||||
// contains reports whether needle is an element of haystack, using exact
// (case-sensitive) string comparison. A nil or empty haystack never matches.
func contains(haystack []string, needle string) bool {
	for i := 0; i < len(haystack); i++ {
		if haystack[i] != needle {
			continue
		}
		return true
	}
	return false
}
|
||||
Reference in New Issue
Block a user