diff --git a/go.mod b/go.mod index e3e9ef2..18a98af 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/salvacybersec/keyhunter go 1.26.1 require ( + github.com/petar-dambovaliev/aho-corasick v0.0.0-20250424160509-463d218d4745 github.com/stretchr/testify v1.11.1 gopkg.in/yaml.v3 v3.0.1 ) diff --git a/go.sum b/go.sum index c4c1710..ff10194 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,7 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/petar-dambovaliev/aho-corasick v0.0.0-20250424160509-463d218d4745 h1:Vpr4VgAizEgEZsaMohpw6JYDP+i9Of9dmdY4ufNP6HI= +github.com/petar-dambovaliev/aho-corasick v0.0.0-20250424160509-463d218d4745/go.mod h1:EHPiTAKtiFmrMldLUNswFwfZ2eJIYBHktdaUTZxYWRw= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= diff --git a/pkg/providers/definitions/anthropic.yaml b/pkg/providers/definitions/anthropic.yaml new file mode 100644 index 0000000..23aa116 --- /dev/null +++ b/pkg/providers/definitions/anthropic.yaml @@ -0,0 +1,20 @@ +format_version: 1 +name: anthropic +display_name: Anthropic +tier: 1 +last_verified: "2026-04-04" +keywords: + - "sk-ant-api03-" + - "anthropic" +patterns: + - regex: 'sk-ant-api03-[A-Za-z0-9_\-]{93,}' + entropy_min: 3.5 + confidence: high +verify: + method: GET + url: https://api.anthropic.com/v1/models + headers: + x-api-key: "{KEY}" + anthropic-version: "2023-06-01" + valid_status: [200] + invalid_status: [401, 403] diff --git a/pkg/providers/definitions/huggingface.yaml b/pkg/providers/definitions/huggingface.yaml new file mode 100644 index 0000000..db7a0b9 --- /dev/null +++ b/pkg/providers/definitions/huggingface.yaml @@ -0,0 +1,19 @@ +format_version: 1 +name: 
huggingface +display_name: HuggingFace +tier: 3 +last_verified: "2026-04-04" +keywords: + - "hf_" + - "huggingface" +patterns: + - regex: 'hf_[A-Za-z0-9]{34,}' + entropy_min: 3.5 + confidence: high +verify: + method: GET + url: https://huggingface.co/api/whoami-v2 + headers: + Authorization: "Bearer {KEY}" + valid_status: [200] + invalid_status: [401, 403] diff --git a/pkg/providers/definitions/openai.yaml b/pkg/providers/definitions/openai.yaml new file mode 100644 index 0000000..6a6fdd9 --- /dev/null +++ b/pkg/providers/definitions/openai.yaml @@ -0,0 +1,19 @@ +format_version: 1 +name: openai +display_name: OpenAI +tier: 1 +last_verified: "2026-04-04" +keywords: + - "sk-proj-" + - "openai" +patterns: + - regex: 'sk-proj-[A-Za-z0-9_\-]{48,}' + entropy_min: 3.5 + confidence: high +verify: + method: GET + url: https://api.openai.com/v1/models + headers: + Authorization: "Bearer {KEY}" + valid_status: [200] + invalid_status: [401, 403] diff --git a/pkg/providers/loader.go b/pkg/providers/loader.go new file mode 100644 index 0000000..f366753 --- /dev/null +++ b/pkg/providers/loader.go @@ -0,0 +1,37 @@ +package providers + +import ( + "embed" + "fmt" + "io/fs" + "path/filepath" + + "gopkg.in/yaml.v3" +) + +//go:embed definitions/*.yaml +var definitionsFS embed.FS + +// loadProviders reads all YAML files from the embedded definitions FS. 
+//
+// It walks the embedded "definitions" directory, skips directories and any
+// entry whose extension is not ".yaml", and unmarshals every remaining file
+// into a Provider. The first walk, read, or parse failure aborts the walk; read
+// and parse errors are wrapped with the offending path. On failure no providers
+// are returned, so callers never observe a partially loaded set.
+func loadProviders() ([]Provider, error) {
+	var loaded []Provider
+	visit := func(path string, d fs.DirEntry, walkErr error) error {
+		if walkErr != nil {
+			return walkErr
+		}
+		if d.IsDir() || filepath.Ext(path) != ".yaml" {
+			return nil
+		}
+		data, err := definitionsFS.ReadFile(path)
+		if err != nil {
+			return fmt.Errorf("reading provider file %s: %w", path, err)
+		}
+		var p Provider
+		if err := yaml.Unmarshal(data, &p); err != nil {
+			return fmt.Errorf("parsing provider %s: %w", path, err)
+		}
+		loaded = append(loaded, p)
+		return nil
+	}
+	if err := fs.WalkDir(definitionsFS, "definitions", visit); err != nil {
+		// Drop whatever was collected before the failure.
+		return nil, err
+	}
+	return loaded, nil
+}
diff --git a/pkg/providers/registry.go b/pkg/providers/registry.go
new file mode 100644
index 0000000..e7696c9
--- /dev/null
+++ b/pkg/providers/registry.go
@@ -0,0 +1,75 @@
+package providers
+
+import (
+	"fmt"
+
+	ahocorasick "github.com/petar-dambovaliev/aho-corasick"
+)
+
+// Registry is the in-memory store of all loaded provider definitions.
+// It is initialized once at startup and is safe for concurrent reads.
+type Registry struct {
+	providers []Provider              // definitions in the order they were loaded
+	index     map[string]int          // name -> slice index
+	ac        ahocorasick.AhoCorasick // pre-built automaton for keyword pre-filter
+}
+
+// NewRegistry loads all embedded provider YAML files, checks that every
+// provider name is non-empty and unique, builds the Aho-Corasick automaton
+// from all provider keywords, and returns the Registry.
+//
+// Names are the lookup key used by Get. An empty or repeated name is reported
+// as an error rather than letting a later definition silently replace an
+// earlier one in the index. Errors from loading are wrapped and returned.
+func NewRegistry() (*Registry, error) {
+	providers, err := loadProviders()
+	if err != nil {
+		return nil, fmt.Errorf("loading providers: %w", err)
+	}
+
+	index := make(map[string]int, len(providers))
+	var keywords []string
+	for i := range providers {
+		name := providers[i].Name
+		if name == "" {
+			return nil, fmt.Errorf("provider at position %d has an empty name", i)
+		}
+		if prev, dup := index[name]; dup {
+			return nil, fmt.Errorf("duplicate provider name %q (positions %d and %d)", name, prev, i)
+		}
+		index[name] = i
+		keywords = append(keywords, providers[i].Keywords...)
+	}
+
+	// The automaton is built once, from the keywords of every provider, and
+	// stored on the Registry for later retrieval through AC.
+	builder := ahocorasick.NewAhoCorasickBuilder(ahocorasick.Opts{DFA: true})
+	ac := builder.Build(keywords)
+
+	return &Registry{
+		providers: providers,
+		index:     index,
+		ac:        ac,
+	}, nil
+}
+
+// List returns all loaded providers.
+//
+// The result is a copy of the registry's slice, so callers may reorder or
+// overwrite its elements without disturbing the registry or its name index.
+// The copy is shallow: slice-typed fields inside each Provider, such as
+// Keywords, still share storage with the registry and must be treated as
+// read-only.
+func (r *Registry) List() []Provider {
+	return append([]Provider(nil), r.providers...)
+}
+
+// Get returns a provider by name and a boolean indicating whether it was found.
+// When the name is unknown, the returned Provider is the zero value.
+func (r *Registry) Get(name string) (Provider, bool) {
+	if idx, ok := r.index[name]; ok {
+		return r.providers[idx], true
+	}
+	return Provider{}, false
+}
+
+// Stats returns aggregate statistics about the loaded providers: the total
+// number of providers, the number of providers per tier, and the number of
+// patterns per confidence value.
+//
+// ByConfidence counts patterns rather than providers, so a provider with
+// several patterns contributes once per pattern.
+func (r *Registry) Stats() RegistryStats {
+	stats := RegistryStats{
+		Total:        len(r.providers),
+		ByTier:       make(map[int]int),
+		ByConfidence: make(map[string]int),
+	}
+	for _, p := range r.providers {
+		stats.ByTier[p.Tier]++
+		for _, pat := range p.Patterns {
+			stats.ByConfidence[pat.Confidence]++
+		}
+	}
+	return stats
+}
+
+// AC returns the pre-built Aho-Corasick automaton for keyword pre-filtering.
+// It is the automaton stored on the Registry, returned by value.
+func (r *Registry) AC() ahocorasick.AhoCorasick {
+	return r.ac
+}