diff --git a/.planning/phases/01-foundation/01-02-PLAN.md b/.planning/phases/01-foundation/01-02-PLAN.md index ea31e83..377e5aa 100644 --- a/.planning/phases/01-foundation/01-02-PLAN.md +++ b/.planning/phases/01-foundation/01-02-PLAN.md @@ -127,16 +127,15 @@ The embed directive must reference providers relative to loader.go location. loader.go is at pkg/providers/loader.go. providers/ directory is at project root. Use: //go:embed ../../providers/*.yaml -and embed.FS path will be "../../providers/openai.yaml" etc. -Actually: Go embed paths must be relative and cannot use "..". +Actually: Go embed paths must be relative and cannot use "..". Correct approach: place the embed in a file at project root level, or adjust. -Better approach from research: put loader in providers package, embed from pkg/providers, +Better approach from research: put loader in providers package, embed from pkg/providers, but reference the providers/ dir which sits at root. Resolution: The go:embed directive path is relative to the SOURCE FILE, not the module root. Since loader.go is at pkg/providers/loader.go, to embed ../../providers/*.yaml would work -syntactically but Go's embed restricts paths containing "..". +syntactically but Go's embed restricts paths containing "..". Use this instead: place a providers_embed.go at the PROJECT ROOT (same dir as go.mod): package main -- NO, this breaks package separation @@ -408,6 +407,8 @@ Create **pkg/providers/registry.go**: package providers import ( + "fmt" + ahocorasick "github.com/petar-dambovaliev/aho-corasick" ) @@ -480,8 +481,6 @@ func (r *Registry) AC() ahocorasick.AhoCorasick { } ``` -Note: registry.go needs `import "fmt"` added. 
- Then copy the three YAML files into the embed location: ```bash mkdir -p /home/salva/Documents/apikey/pkg/providers/definitions @@ -490,76 +489,8 @@ cp /home/salva/Documents/apikey/providers/anthropic.yaml /home/salva/Documents/a cp /home/salva/Documents/apikey/providers/huggingface.yaml /home/salva/Documents/apikey/pkg/providers/definitions/ ``` -Finally, fill in **pkg/providers/registry_test.go** (replacing the stubs from Plan 01): -```go -package providers_test - -import ( - "testing" - - "github.com/salvacybersec/keyhunter/pkg/providers" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestRegistryLoad(t *testing.T) { - reg, err := providers.NewRegistry() - require.NoError(t, err) - assert.GreaterOrEqual(t, len(reg.List()), 3, "expected at least 3 providers loaded") -} - -func TestRegistryGet(t *testing.T) { - reg, err := providers.NewRegistry() - require.NoError(t, err) - - p, ok := reg.Get("openai") - assert.True(t, ok) - assert.Equal(t, "openai", p.Name) - assert.Equal(t, 1, p.Tier) - - _, ok = reg.Get("nonexistent-provider") - assert.False(t, ok) -} - -func TestRegistryStats(t *testing.T) { - reg, err := providers.NewRegistry() - require.NoError(t, err) - - stats := reg.Stats() - assert.GreaterOrEqual(t, stats.Total, 3) - assert.GreaterOrEqual(t, stats.ByTier[1], 2, "expected at least 2 tier-1 providers") -} - -func TestAhoCorasickBuild(t *testing.T) { - reg, err := providers.NewRegistry() - require.NoError(t, err) - - ac := reg.AC() - - // Should match OpenAI keyword - matches := ac.FindAll("OPENAI_API_KEY=sk-proj-abc") - assert.NotEmpty(t, matches, "expected AC to find keyword in string containing 'sk-proj-'") - - // Should not match clean text - noMatches := ac.FindAll("hello world no secrets here") - assert.Empty(t, noMatches, "expected no AC matches in text with no provider keywords") -} - -func TestProviderSchemaValidation(t *testing.T) { - import_yaml := ` -format_version: 0 -name: invalid -last_verified: "" 
-` - // Directly test UnmarshalYAML via yaml.Unmarshal - var p providers.Provider - err := yaml.Unmarshal([]byte(import_yaml), &p) // NOTE: need import "gopkg.in/yaml.v3" - assert.Error(t, err, "expected validation error for format_version=0") -} -``` - -Note: The TestProviderSchemaValidation test needs `import "gopkg.in/yaml.v3"` added. -Add it to the imports. Full corrected test file with proper imports: +Finally, fill in **pkg/providers/registry_test.go** (replacing the stubs from Plan 01). +Write ONLY the following content — do not include any earlier draft versions: ```go package providers_test @@ -632,7 +563,7 @@ func TestProviderSchemaValidation(t *testing.T) { - TestRegistryStats passes — Total >= 3 - TestAhoCorasickBuild passes — "sk-proj-" match found, "hello world" empty - TestProviderSchemaValidation passes — error on format_version=0 - - `grep -r 'go:embed' pkg/providers/loader.go` exits 0 + - `grep -q 'go:embed' pkg/providers/loader.go` exits 0 - pkg/providers/definitions/ directory exists with 3 YAML files Registry loads providers from embedded YAML, builds Aho-Corasick automaton, exposes List/Get/Stats/AC. All 5 tests pass. 
diff --git a/.planning/phases/01-foundation/01-03-PLAN.md b/.planning/phases/01-foundation/01-03-PLAN.md index b081e0d..0746efb 100644 --- a/.planning/phases/01-foundation/01-03-PLAN.md +++ b/.planning/phases/01-foundation/01-03-PLAN.md @@ -20,7 +20,7 @@ must_haves: - "AES-256-GCM Encrypt/Decrypt roundtrip produces the original plaintext" - "Argon2id DeriveKey with the same passphrase and salt always returns the same 32-byte key" - "A Finding can be saved to the database with the key_value stored encrypted and retrieved as plaintext" - - "The raw database file does NOT contain plaintext API key values" + - "The raw database file does NOT contain plaintext API key values — verified by querying raw bytes from the BLOB column" artifacts: - path: "pkg/storage/encrypt.go" provides: "Encrypt(plaintext, key) and Decrypt(ciphertext, key) using AES-256-GCM" @@ -271,7 +271,7 @@ func NewSalt() ([]byte, error) { - Test 3: DeriveKey(passphrase, salt) twice returns identical 32 bytes - Test 4: NewSalt() twice returns different slices - Test 5: SaveFinding stores finding → ListFindings decrypts and returns KeyValue == "sk-proj-test" - - Test 6: Database file (when not :memory:) does NOT contain literal "sk-proj-test" in raw bytes + - Test 6: Raw BLOB bytes retrieved directly from the database do NOT contain the plaintext key string Create **pkg/storage/schema.sql**: @@ -473,6 +473,7 @@ Fill **pkg/storage/db_test.go** (replacing stubs from Plan 01): package storage_test import ( + "bytes" "testing" "github.com/salvacybersec/keyhunter/pkg/storage" @@ -567,9 +568,10 @@ func TestSaveFindingEncrypted(t *testing.T) { // Derive a test key key := storage.DeriveKey([]byte("testpassphrase"), []byte("testsalt1234xxxx")) + plainKey := "sk-proj-test1234567890abcdefghijklmnopqr" f := storage.Finding{ ProviderName: "openai", - KeyValue: "sk-proj-test1234567890abcdefghijklmnopqr", + KeyValue: plainKey, Confidence: "high", SourcePath: "/test/file.env", SourceType: "file", @@ -583,10 +585,17 @@ 
func TestSaveFindingEncrypted(t *testing.T) { findings, err := db.ListFindings(key) require.NoError(t, err) require.Len(t, findings, 1) - assert.Equal(t, "sk-proj-test1234567890abcdefghijklmnopqr", findings[0].KeyValue) + assert.Equal(t, plainKey, findings[0].KeyValue) assert.Equal(t, "openai", findings[0].ProviderName) // Verify masking assert.Equal(t, "sk-proj-...opqr", findings[0].KeyMasked) + + // Verify encryption contract: raw BLOB bytes in the database must NOT contain the plaintext key. + // This confirms Encrypt() was called before INSERT, not that the key was stored verbatim. + var rawBlob []byte + require.NoError(t, db.SQL().QueryRow("SELECT key_value FROM findings WHERE id = ?", id).Scan(&rawBlob)) + assert.False(t, bytes.Contains(rawBlob, []byte(plainKey)), + "raw database BLOB must not contain plaintext key — encryption was not applied") } ``` @@ -601,12 +610,12 @@ func TestSaveFindingEncrypted(t *testing.T) { - TestDecryptWrongKey passes — wrong key causes error - TestArgon2KeyDerivation passes — 32 bytes, deterministic - TestNewSalt passes — 16 bytes, non-deterministic - - TestSaveFindingEncrypted passes — stored and retrieved with correct KeyValue and KeyMasked + - TestSaveFindingEncrypted passes — stored and retrieved with correct KeyValue, KeyMasked, AND raw BLOB does not contain plaintext - `grep -q 'go:embed.*schema' pkg/storage/db.go` exits 0 - `grep -q 'modernc.org/sqlite' pkg/storage/db.go` exits 0 - `grep -q 'journal_mode=WAL' pkg/storage/db.go` exits 0 - Storage layer complete — SQLite opens with schema, AES-256-GCM encrypt/decrypt works, Argon2id key derivation works, SaveFinding/ListFindings encrypt/decrypt transparently. All 7 tests pass. + Storage layer complete — SQLite opens with schema, AES-256-GCM encrypt/decrypt works, Argon2id key derivation works, SaveFinding/ListFindings encrypt/decrypt transparently. Raw BLOB bytes verified to not contain plaintext. All 7 tests pass. 
@@ -619,6 +628,7 @@ After both tasks: - `grep -q 'cipher\.NewGCM' pkg/storage/encrypt.go` exits 0 - `grep -q 'journal_mode=WAL' pkg/storage/db.go` exits 0 - schema.sql contains CREATE TABLE for findings, scans, settings +- TestSaveFindingEncrypted asserts raw BLOB does not contain plaintext key @@ -626,6 +636,7 @@ After both tasks: - AES-256-GCM column encryption works: Encrypt + Decrypt roundtrip returns original (STOR-02) - Argon2id key derivation: DeriveKey deterministic, 32 bytes, RFC 9106 params (STOR-03) - FindingCRUD: SaveFinding encrypts before INSERT, ListFindings decrypts after SELECT +- Raw BLOB in database does not contain plaintext key — verified by automated test - All 7 storage tests pass diff --git a/.planning/phases/01-foundation/01-04-PLAN.md b/.planning/phases/01-foundation/01-04-PLAN.md index 0fc8b57..8703d8e 100644 --- a/.planning/phases/01-foundation/01-04-PLAN.md +++ b/.planning/phases/01-foundation/01-04-PLAN.md @@ -5,7 +5,7 @@ type: execute wave: 2 depends_on: [01-02] files_modified: - - pkg/engine/chunk.go + - pkg/types/chunk.go - pkg/engine/finding.go - pkg/engine/entropy.go - pkg/engine/filter.go @@ -15,7 +15,7 @@ files_modified: - pkg/engine/sources/file.go - pkg/engine/scanner_test.go autonomous: true -requirements: [CORE-01, CORE-04, CORE-05, CORE-06, CORE-07] +requirements: [CORE-01, CORE-04, CORE-05, CORE-06] must_haves: truths: @@ -26,8 +26,8 @@ must_haves: - "Full scan pipeline: scan testdata/samples/no_keys.txt → zero findings" - "Worker pool uses ants v2 with configurable worker count" artifacts: - - path: "pkg/engine/chunk.go" - provides: "Chunk struct (Data []byte, Source string, Offset int64)" + - path: "pkg/types/chunk.go" + provides: "Chunk struct (Data []byte, Source string, Offset int64) — shared by engine and sources packages" exports: ["Chunk"] - path: "pkg/engine/finding.go" provides: "Finding struct (provider, key value, masked, confidence, source, line)" @@ -40,12 +40,12 @@ must_haves: exports: ["KeywordFilter"] - 
path: "pkg/engine/detector.go" provides: "Detector stage — applies provider regexps and entropy check to chunks" - exports: ["Detector"] + exports: ["Detect"] - path: "pkg/engine/engine.go" provides: "Engine struct with Scan(ctx, src, cfg) <-chan Finding" exports: ["Engine", "NewEngine", "ScanConfig"] - path: "pkg/engine/sources/source.go" - provides: "Source interface with Chunks(ctx, chan<- Chunk) error" + provides: "Source interface with Chunks(ctx, chan<- types.Chunk) error" exports: ["Source"] - path: "pkg/engine/sources/file.go" provides: "FileSource implementing Source for single-file scanning" @@ -67,13 +67,19 @@ must_haves: to: "github.com/panjf2000/ants/v2" via: "ants.NewPool for detector workers" pattern: "ants\\.NewPool" + - from: "pkg/engine/sources/source.go" + to: "pkg/types/chunk.go" + via: "Source interface uses types.Chunk — avoids circular import with pkg/engine" + pattern: "types\\.Chunk" --- Build the three-stage scanning engine pipeline: Aho-Corasick keyword pre-filter, regex + entropy detector workers using ants goroutine pool, and a FileSource adapter. Wire them together in an Engine that emits Findings on a channel. Purpose: The scan engine is the core differentiator. Plans 02 and 03 provide its dependencies (Registry for patterns + keywords, storage types for Finding). The CLI (Plan 05) calls Engine.Scan() to implement `keyhunter scan`. -Output: pkg/engine/{chunk,finding,entropy,filter,detector,engine}.go and sources/{source,file}.go. scanner_test.go stubs filled. +Output: pkg/types/chunk.go, pkg/engine/{finding,entropy,filter,detector,engine}.go and sources/{source,file}.go. scanner_test.go stubs filled. + +NOTE on CORE-07 (mmap large file reading): FileSource uses os.ReadFile() in Phase 1, which is sufficient for the test fixtures. mmap-based reading for files > 10MB is deferred to Phase 4 (Input Sources) where it belongs architecturally alongside all other source adapter work. 
@@ -86,6 +92,16 @@ Output: pkg/engine/{chunk,finding,entropy,filter,detector,engine}.go and sources @.planning/phases/01-foundation/01-02-SUMMARY.md + +The sources sub-package (pkg/engine/sources) needs the Chunk type. +If Chunk were defined in pkg/engine, then sources would import engine, and engine imports +sources (for the Source interface) — a circular import. Go will refuse to compile. + +Resolution: Define Chunk in pkg/types (a shared, import-free package): + pkg/types/chunk.go — defines types.Chunk + pkg/engine/sources — imports pkg/types (no circular dep) + pkg/engine — imports pkg/types and pkg/engine/sources (no circular dep) + package providers @@ -107,9 +123,9 @@ func (r *Registry) List() []Provider func (r *Registry) AC() ahocorasick.AhoCorasick // pre-built Aho-Corasick -chunksChan chan Chunk (buffer: 1000) -detectableChan chan Chunk (buffer: 500) -resultsChan chan Finding (buffer: 100) +chunksChan chan types.Chunk (buffer: 1000) +detectableChan chan types.Chunk (buffer: 500) +resultsChan chan Finding (buffer: 100) Stage 1: Source.Chunks() → chunksChan (goroutine, closes chan on done) Stage 2: KeywordFilter(chunksChan) → detectableChan (goroutine, AC.FindAll) @@ -124,7 +140,7 @@ type ScanConfig struct { type Source interface { - Chunks(ctx context.Context, out chan<- Chunk) error + Chunks(ctx context.Context, out chan<- types.Chunk) error } @@ -151,8 +167,8 @@ import "github.com/panjf2000/ants/v2" - Task 1: Core types and Shannon entropy function - pkg/engine/chunk.go, pkg/engine/finding.go, pkg/engine/entropy.go + Task 1: Shared types package, Finding, and Shannon entropy function + pkg/types/chunk.go, pkg/engine/finding.go, pkg/engine/entropy.go - /home/salva/Documents/apikey/.planning/phases/01-foundation/01-RESEARCH.md (CORE-04 row: Shannon entropy, ~10-line stdlib function, threshold 3.5 bits/char) - /home/salva/Documents/apikey/pkg/storage/findings.go (Finding and MaskKey defined there — engine.Finding is a separate type for the pipeline) @@ 
-166,11 +182,13 @@ import "github.com/panjf2000/ants/v2" - Test 6: MaskKey("abc") → "****" (too short to mask) -Create **pkg/engine/chunk.go**: +Create **pkg/types/chunk.go** — the shared type that breaks the circular import: ```go -package engine +package types // Chunk is a segment of file content passed through the scanning pipeline. +// Defined in pkg/types (not pkg/engine) so that pkg/engine/sources can use it +// without creating a circular import with pkg/engine. type Chunk struct { Data []byte // raw bytes Source string // file path, URL, or description @@ -236,18 +254,18 @@ func Shannon(s string) float64 { ``` - cd /home/salva/Documents/apikey && go build ./pkg/engine/... && echo "BUILD OK" + cd /home/salva/Documents/apikey && go build ./pkg/types/... && go build ./pkg/engine/... && echo "BUILD OK" + - `go build ./pkg/types/...` exits 0 - `go build ./pkg/engine/...` exits 0 - - pkg/engine/chunk.go exports Chunk with fields Data, Source, Offset + - pkg/types/chunk.go exports Chunk with fields Data, Source, Offset - pkg/engine/finding.go exports Finding and MaskKey - pkg/engine/entropy.go exports Shannon using math.Log2 - `grep -q 'math\.Log2' pkg/engine/entropy.go` exits 0 - - Shannon("aaaaaaa") == 0.0 (manually verifiable from code) - MaskKey("sk-proj-abc1234") produces "sk-proj-...1234" - Chunk, Finding, MaskKey, and Shannon exist and compile. Shannon uses stdlib math only — no external library. + pkg/types/Chunk exists (no imports, no circular dependency risk), Finding, MaskKey, and Shannon exist and compile. 
@@ -262,7 +280,8 @@ func Shannon(s string) float64 { - /home/salva/Documents/apikey/.planning/phases/01-foundation/01-RESEARCH.md (Pattern 2: Three-Stage Scanning Pipeline — exact channel-based code example) - - /home/salva/Documents/apikey/pkg/engine/chunk.go + - /home/salva/Documents/apikey/pkg/types/chunk.go + - /home/salva/Documents/apikey/pkg/engine/chunk.go (if exists — use pkg/types/chunk.go instead) - /home/salva/Documents/apikey/pkg/engine/finding.go - /home/salva/Documents/apikey/pkg/engine/entropy.go - /home/salva/Documents/apikey/pkg/providers/registry.go (Registry.AC() and Registry.List() signatures) @@ -283,13 +302,15 @@ package sources import ( "context" - "github.com/salvacybersec/keyhunter/pkg/engine" + "github.com/salvacybersec/keyhunter/pkg/types" ) // Source is the interface all input adapters must implement. // Chunks writes content segments to the out channel until the source is exhausted or ctx is cancelled. +// NOTE: Source is defined in the sources sub-package (not pkg/engine) and uses pkg/types.Chunk +// to avoid a circular import: engine → sources → engine. type Source interface { - Chunks(ctx context.Context, out chan<- engine.Chunk) error + Chunks(ctx context.Context, out chan<- types.Chunk) error } ``` @@ -301,7 +322,7 @@ import ( "context" "os" - "github.com/salvacybersec/keyhunter/pkg/engine" + "github.com/salvacybersec/keyhunter/pkg/types" ) const defaultChunkSize = 4096 @@ -319,7 +340,9 @@ func NewFileSource(path string) *FileSource { } // Chunks reads the file in overlapping segments and sends each chunk to out. -func (f *FileSource) Chunks(ctx context.Context, out chan<- engine.Chunk) error { +// Uses os.ReadFile for simplicity in Phase 1. mmap for files > 10MB is implemented +// in Phase 4 (Input Sources) alongside all other source adapter enhancements. 
+func (f *FileSource) Chunks(ctx context.Context, out chan<- types.Chunk) error { data, err := os.ReadFile(f.Path) if err != nil { return err @@ -333,7 +356,7 @@ func (f *FileSource) Chunks(ctx context.Context, out chan<- engine.Chunk) error select { case <-ctx.Done(): return ctx.Err() - case out <- engine.Chunk{Data: data, Source: f.Path, Offset: 0}: + case out <- types.Chunk{Data: data, Source: f.Path, Offset: 0}: } return nil } @@ -344,7 +367,7 @@ func (f *FileSource) Chunks(ctx context.Context, out chan<- engine.Chunk) error if end > len(data) { end = len(data) } - chunk := engine.Chunk{ + chunk := types.Chunk{ Data: data[start:end], Source: f.Path, Offset: offset, @@ -369,12 +392,13 @@ package engine import ( ahocorasick "github.com/petar-dambovaliev/aho-corasick" + "github.com/salvacybersec/keyhunter/pkg/types" ) // KeywordFilter filters a stream of chunks using an Aho-Corasick automaton. // Only chunks that contain at least one provider keyword are sent to out. // This is Stage 2 of the pipeline (runs after Source, before Detector). -func KeywordFilter(ac ahocorasick.AhoCorasick, in <-chan Chunk, out chan<- Chunk) { +func KeywordFilter(ac ahocorasick.AhoCorasick, in <-chan types.Chunk, out chan<- types.Chunk) { for chunk := range in { if len(ac.FindAll(string(chunk.Data))) > 0 { out <- chunk @@ -393,11 +417,12 @@ import ( "time" "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/types" ) -// Detector applies provider regex patterns and optional entropy checks to a chunk. +// Detect applies provider regex patterns and optional entropy checks to a chunk. // It returns all findings from the chunk. 
-func Detect(chunk Chunk, providerList []providers.Provider) []Finding { +func Detect(chunk types.Chunk, providerList []providers.Provider) []Finding { var findings []Finding content := string(chunk.Data) @@ -452,8 +477,9 @@ import ( "time" "github.com/panjf2000/ants/v2" - "github.com/salvacybersec/keyhunter/pkg/providers" "github.com/salvacybersec/keyhunter/pkg/engine/sources" + "github.com/salvacybersec/keyhunter/pkg/providers" + "github.com/salvacybersec/keyhunter/pkg/types" ) // ScanConfig controls scan execution parameters. @@ -482,9 +508,10 @@ func (e *Engine) Scan(ctx context.Context, src sources.Source, cfg ScanConfig) ( workers = runtime.NumCPU() * 8 } - chunksChan := make(chan Chunk, 1000) - detectableChan := make(chan Chunk, 500) - resultsChan := make(chan Finding, 100) + // Declare channels on separate lines to ensure correct Go syntax. + chunksChan := make(chan types.Chunk, 1000) + detectableChan := make(chan types.Chunk, 500) + resultsChan := make(chan Finding, 100) // Stage 1: source → chunksChan go func() { @@ -517,7 +544,7 @@ func (e *Engine) Scan(ctx context.Context, src sources.Source, cfg ScanConfig) ( }() for chunk := range detectableChan { - c := chunk // capture + c := chunk // capture loop variable wg.Add(1) _ = pool.Submit(func() { defer wg.Done() @@ -645,6 +672,7 @@ func TestScannerPipelineMultipleKeys(t *testing.T) { - `go test ./pkg/engine/... 
-v -count=1` exits 0 with all tests PASS (no SKIP) + - `go build ./...` exits 0 with no circular import errors - TestShannonEntropy passes — 0.0 for "aaaaaaa", >= 3.5 for real key pattern - TestKeywordPreFilter passes — AC matches sk-proj-, empty for "hello world" - TestScannerPipelineOpenAI passes — 1 finding with ProviderName=="openai" @@ -652,19 +680,20 @@ func TestScannerPipelineMultipleKeys(t *testing.T) { - TestScannerPipelineMultipleKeys passes — >= 2 findings with both provider names - `grep -q 'ants\.NewPool' pkg/engine/engine.go` exits 0 - `grep -q 'KeywordFilter' pkg/engine/engine.go` exits 0 - - `go build ./...` still exits 0 + - pkg/types/chunk.go exists and pkg/engine/sources imports pkg/types (not pkg/engine) - Three-stage scanning pipeline works end-to-end: FileSource → KeywordFilter (AC) → Detect (regex + entropy) → Finding channel. All engine tests pass. + Three-stage scanning pipeline works end-to-end: FileSource → KeywordFilter (AC) → Detect (regex + entropy) → Finding channel. Circular import resolved via pkg/types. All engine tests pass. After both tasks: -- `go test ./pkg/engine/... -v -count=1` exits 0 with 6 tests PASS -- `go build ./...` exits 0 +- `go build ./...` exits 0 with zero circular import errors +- `go test ./pkg/engine/... 
-v -count=1` exits 0 with all tests PASS - `grep -q 'ants\.NewPool' pkg/engine/engine.go` exits 0 - `grep -q 'math\.Log2' pkg/engine/entropy.go` exits 0 +- `grep -rq 'pkg/types' pkg/engine/sources/source.go` exits 0 (sources imports types, not engine) - Scanning testdata/samples/openai_key.txt returns 1 finding with provider "openai" - Scanning testdata/samples/no_keys.txt returns 0 findings @@ -673,7 +702,8 @@ After both tasks: - Three-stage pipeline: AC pre-filter → regex + entropy detector → results channel (CORE-01, CORE-06) - Shannon entropy function using stdlib math (CORE-04) - ants v2 goroutine pool with configurable worker count (CORE-05) -- FileSource adapter reading files in overlapping chunks (CORE-07 partial — full mmap in Phase 4) +- FileSource adapter reading files in overlapping chunks using os.ReadFile (mmap deferred to Phase 4) +- pkg/types/Chunk breaks the engine↔sources circular import - All engine tests pass against real testdata fixtures diff --git a/.planning/phases/01-foundation/01-05-PLAN.md b/.planning/phases/01-foundation/01-05-PLAN.md index fb95433..bcea4d2 100644 --- a/.planning/phases/01-foundation/01-05-PLAN.md +++ b/.planning/phases/01-foundation/01-05-PLAN.md @@ -9,9 +9,10 @@ files_modified: - cmd/scan.go - cmd/providers.go - cmd/config.go + - cmd/stubs.go - pkg/config/config.go - pkg/output/table.go -autonomous: false +autonomous: true requirements: [CLI-01, CLI-02, CLI-03, CLI-04, CLI-05] must_haves: @@ -21,14 +22,19 @@ must_haves: - "`keyhunter providers info openai` prints OpenAI provider details" - "`keyhunter config init` creates ~/.keyhunter.yaml without error" - "`keyhunter config set workers 16` persists the value to ~/.keyhunter.yaml" - - "`keyhunter --help` shows all top-level commands: scan, providers, config" + - "`keyhunter --help` shows all top-level commands: scan, verify, import, recon, keys, serve, dorks, hook, schedule, providers, config" + - "Findings are stored with a per-installation salt loaded from the 
settings table — not a hardcoded salt" + - "Raw sqlite3 query on the database file does NOT return plaintext key values" artifacts: - path: "cmd/root.go" provides: "Cobra root command with PersistentPreRunE config loading" contains: "cobra.Command" - path: "cmd/scan.go" - provides: "scan command wiring Engine + FileSource + output table" + provides: "scan command wiring Engine + FileSource + output table + salt from settings" exports: ["scanCmd"] + - path: "cmd/stubs.go" + provides: "stub commands for verify, import, recon, keys, serve, dorks, hook, schedule" + exports: ["verifyCmd", "importCmd", "reconCmd", "keysCmd", "serveCmd", "dorksCmd", "hookCmd", "scheduleCmd"] - path: "cmd/providers.go" provides: "providers list/info/stats subcommands using Registry" exports: ["providersCmd"] @@ -50,6 +56,10 @@ must_haves: to: "pkg/storage/db.go" via: "storage.Open() called, SaveFinding for each result" pattern: "storage\\.Open" + - from: "cmd/scan.go" + to: "pkg/storage/crypto.go" + via: "loadOrCreateEncKey() reads (or creates) the salt in the settings table via db.GetSetting/db.SetSetting, then calls storage.DeriveKey" + pattern: "DeriveKey|GetSetting|SetSetting" - from: "cmd/root.go" to: "github.com/spf13/viper" via: "viper.SetConfigFile in PersistentPreRunE" @@ -61,10 +71,10 @@ must_haves: --- -Wire all subsystems together through the Cobra CLI: scan command (engine + storage + output), providers list/info/stats commands, and config init/set/get commands. This is the integration layer — all business logic lives in pkg/, cmd/ only wires. +Wire all subsystems together through the Cobra CLI: scan command (engine + storage + output), providers list/info/stats commands, config init/set/get commands, and 8 stub commands for future phases. This is the integration layer — all business logic lives in pkg/, cmd/ only wires. Purpose: Satisfies all Phase 1 CLI requirements and delivers the first working `keyhunter scan` command that completes the end-to-end success criteria. 
-Output: cmd/{root,scan,providers,config}.go, pkg/config/config.go, pkg/output/table.go. +Output: cmd/{root,scan,providers,config,stubs}.go, pkg/config/config.go, pkg/output/table.go. @@ -106,6 +116,10 @@ func (db *DB) SaveFinding(f Finding, encKey []byte) (int64, error) func DeriveKey(passphrase []byte, salt []byte) []byte func NewSalt() ([]byte, error) + +func (db *DB) GetSetting(key string) (string, bool, error) +func (db *DB) SetSetting(key string, value string) error + package providers func NewRegistry() (*Registry, error) @@ -127,17 +141,24 @@ Passphrase: (prompt if not in env KEYHUNTER_PASSPHRASE — Phase 1: use empty s Columns: PROVIDER | MASKED KEY | CONFIDENCE | SOURCE | LINE Colors: use lipgloss.NewStyle().Foreground() for confidence: high=green, medium=yellow, low=red + + +On first scan, call storage.NewSalt(), hex-encode it, store in settings table with key "encryption.salt". +On subsequent scans, read the salt from the settings table. +This ensures all users have a unique per-installation salt instead of a shared hardcoded salt. +The helper function loadOrCreateEncKey(db *storage.DB, passphrase string) ([]byte, error) handles both cases and returns the key derived from the passphrase and that salt. 
- Task 1: Config package, output table, and root command - pkg/config/config.go, pkg/output/table.go, cmd/root.go + Task 1: Config package, output table, root command, and settings helpers + pkg/config/config.go, pkg/output/table.go, cmd/root.go, pkg/storage/settings.go - /home/salva/Documents/apikey/.planning/phases/01-foundation/01-RESEARCH.md (CLI-01, CLI-02, CLI-03 rows, Standard Stack: cobra v1.10.2 + viper v1.21.0) - /home/salva/Documents/apikey/pkg/engine/finding.go (Finding struct fields for output) + - /home/salva/Documents/apikey/pkg/storage/db.go (DB struct, to add GetSetting/SetSetting) Create **pkg/config/config.go**: @@ -172,6 +193,43 @@ func Load() Config { } ``` +Create **pkg/storage/settings.go** — adds GetSetting/SetSetting to the storage package: +```go +package storage + +import ( + "database/sql" + "fmt" +) + +// GetSetting retrieves a value from the settings table. +// Returns (value, true, nil) if found, ("", false, nil) if not found, ("", false, err) on error. +func (db *DB) GetSetting(key string) (string, bool, error) { + var value string + err := db.sql.QueryRow("SELECT value FROM settings WHERE key = ?", key).Scan(&value) + if err == sql.ErrNoRows { + return "", false, nil + } + if err != nil { + return "", false, fmt.Errorf("getting setting %q: %w", key, err) + } + return value, true, nil +} + +// SetSetting inserts or updates a key-value pair in the settings table. 
+func (db *DB) SetSetting(key, value string) error { + _, err := db.sql.Exec( + `INSERT INTO settings (key, value, updated_at) VALUES (?, ?, CURRENT_TIMESTAMP) + ON CONFLICT(key) DO UPDATE SET value = excluded.value, updated_at = CURRENT_TIMESTAMP`, + key, value, + ) + if err != nil { + return fmt.Errorf("setting %q: %w", key, err) + } + return nil +} +``` + Create **pkg/output/table.go**: ```go package output @@ -281,6 +339,15 @@ func init() { rootCmd.AddCommand(scanCmd) rootCmd.AddCommand(providersCmd) rootCmd.AddCommand(configCmd) + // Stub commands for future phases (per CLI-01 requirement of 11 commands) + rootCmd.AddCommand(verifyCmd) + rootCmd.AddCommand(importCmd) + rootCmd.AddCommand(reconCmd) + rootCmd.AddCommand(keysCmd) + rootCmd.AddCommand(serveCmd) + rootCmd.AddCommand(dorksCmd) + rootCmd.AddCommand(hookCmd) + rootCmd.AddCommand(scheduleCmd) } func initConfig() { @@ -316,28 +383,30 @@ func mustHomeDir() string { ``` - cd /home/salva/Documents/apikey && go build ./... && ./keyhunter --help 2>&1 | grep -E "scan|providers|config" && echo "HELP OK" + cd /home/salva/Documents/apikey && go build ./... && ./keyhunter --help 2>&1 | grep -E "scan|providers|config|verify|recon|keys|serve|dorks|hook|schedule" && echo "HELP OK" - `go build ./...` exits 0 - - `./keyhunter --help` shows "scan", "providers", and "config" in command list + - `./keyhunter --help` shows scan, providers, config, verify, import, recon, keys, serve, dorks, hook, schedule in command list - pkg/config/config.go exports Config and Load - pkg/output/table.go exports PrintFindings - - cmd/root.go declares rootCmd, Execute(), scanCmd, providersCmd, configCmd referenced + - pkg/storage/settings.go exports GetSetting and SetSetting + - cmd/root.go declares rootCmd, Execute(), and adds all 11 subcommands - `grep -q 'viper\.SetConfigFile\|viper\.SetConfigName' cmd/root.go` exits 0 - lipgloss used for header and confidence coloring - Root command, config package, and output table exist. 
`keyhunter --help` shows the three top-level commands. + Root command registers all 11 CLI commands. Config package, output table, and settings helpers exist. `keyhunter --help` shows all commands. - Task 2: scan, providers, and config subcommands - cmd/scan.go, cmd/providers.go, cmd/config.go + Task 2: scan, providers, config subcommands, and stub commands + cmd/scan.go, cmd/providers.go, cmd/config.go, cmd/stubs.go - /home/salva/Documents/apikey/.planning/phases/01-foundation/01-RESEARCH.md (CLI-04, CLI-05 rows, Pattern 2 pipeline usage) - /home/salva/Documents/apikey/cmd/root.go (rootCmd, viper setup) - /home/salva/Documents/apikey/pkg/engine/engine.go (Engine.Scan, ScanConfig) - /home/salva/Documents/apikey/pkg/storage/db.go (Open, SaveFinding) + - /home/salva/Documents/apikey/pkg/storage/settings.go (GetSetting, SetSetting) - /home/salva/Documents/apikey/pkg/providers/registry.go (NewRegistry, List, Get, Stats) @@ -347,6 +416,8 @@ package cmd import ( "context" + "encoding/hex" + "encoding/json" "fmt" "os" "path/filepath" @@ -422,9 +493,13 @@ var scanCmd = &cobra.Command{ } defer db.Close() - // Derive encryption key (Phase 1: empty passphrase with fixed dev salt) - salt := []byte("keyhunter-dev-s0") // Phase 1 placeholder — Phase 6 replaces with proper salt storage - encKey := storage.DeriveKey([]byte(cfg.Passphrase), salt) + // Derive encryption key using a per-installation salt stored in settings table. + // On first run, NewSalt() generates a random salt and stores it. + // On subsequent runs, the same salt is loaded — ensuring consistent encryption. 
+ encKey, err := loadOrCreateEncKey(db, cfg.Passphrase) + if err != nil { + return fmt.Errorf("preparing encryption key: %w", err) + } // Run scan ch, err := eng.Scan(context.Background(), src, scanCfg) @@ -453,8 +528,29 @@ var scanCmd = &cobra.Command{ // Output switch flagOutput { case "json": - // Phase 6 — basic JSON for now - fmt.Printf("[] # JSON output: Phase 6\n") + // Return valid empty JSON array when no findings; full JSON in Phase 6. + enc := json.NewEncoder(os.Stdout) + enc.SetIndent("", " ") + type jsonFinding struct { + Provider string `json:"provider"` + KeyMasked string `json:"key_masked"` + Confidence string `json:"confidence"` + Source string `json:"source"` + Line int `json:"line"` + } + out := make([]jsonFinding, 0, len(findings)) + for _, f := range findings { + out = append(out, jsonFinding{ + Provider: f.ProviderName, + KeyMasked: f.KeyMasked, + Confidence: f.Confidence, + Source: f.Source, + Line: f.LineNumber, + }) + } + if err := enc.Encode(out); err != nil { + return fmt.Errorf("encoding JSON output: %w", err) + } default: output.PrintFindings(findings, flagUnmask) } @@ -467,11 +563,39 @@ var scanCmd = &cobra.Command{ }, } +// loadOrCreateEncKey loads the per-installation salt from the settings table. +// On first run it generates a new random salt with storage.NewSalt() and persists it. +// The salt is hex-encoded in the settings table under key "encryption.salt". +func loadOrCreateEncKey(db *storage.DB, passphrase string) ([]byte, error) { + const saltKey = "encryption.salt" + saltHex, found, err := db.GetSetting(saltKey) + if err != nil { + return nil, fmt.Errorf("reading salt from settings: %w", err) + } + var salt []byte + if !found { + // First run: generate and persist a new random salt. 
+ salt, err = storage.NewSalt() + if err != nil { + return nil, fmt.Errorf("generating salt: %w", err) + } + if err := db.SetSetting(saltKey, hex.EncodeToString(salt)); err != nil { + return nil, fmt.Errorf("storing salt: %w", err) + } + } else { + salt, err = hex.DecodeString(saltHex) + if err != nil { + return nil, fmt.Errorf("decoding stored salt: %w", err) + } + } + return storage.DeriveKey([]byte(passphrase), salt), nil +} + func init() { scanCmd.Flags().IntVar(&flagWorkers, "workers", 0, "number of worker goroutines (default: CPU*8)") scanCmd.Flags().BoolVar(&flagVerify, "verify", false, "actively verify found keys (opt-in, Phase 5)") scanCmd.Flags().BoolVar(&flagUnmask, "unmask", false, "show full key values (default: masked)") - scanCmd.Flags().StringVar(&flagOutput, "output", "table", "output format: table, json (more in Phase 6)") + scanCmd.Flags().StringVar(&flagOutput, "output", "table", "output format: table, json (full JSON output in Phase 6)") scanCmd.Flags().StringSliceVar(&flagExclude, "exclude", nil, "glob patterns to exclude (e.g. *.min.js)") viper.BindPFlag("scan.workers", scanCmd.Flags().Lookup("workers")) } @@ -658,6 +782,77 @@ func init() { configCmd.AddCommand(configSetCmd) configCmd.AddCommand(configGetCmd) } +``` + +Create **cmd/stubs.go** — stub commands for the 8 commands whose phases are not yet implemented. +These satisfy CLI-01 (11 commands) and print a clear "not implemented" message so users +know the command exists but is pending a future phase. + +```go +package cmd + +import ( + "fmt" + + "github.com/spf13/cobra" +) + +// notImplemented returns a RunE function that prints a "not yet implemented" message. +// Each stub command is registered in root.go and satisfies CLI-01's 11-command requirement.
+func notImplemented(name, phase string) func(cmd *cobra.Command, args []string) error { + return func(cmd *cobra.Command, args []string) error { + fmt.Printf("%s: not implemented in this phase (coming in %s)\n", name, phase) + return nil + } +} + +var verifyCmd = &cobra.Command{ + Use: "verify", + Short: "Actively verify found API keys (Phase 5)", + RunE: notImplemented("verify", "Phase 5"), +} + +var importCmd = &cobra.Command{ + Use: "import", + Short: "Import findings from TruffleHog or Gitleaks output (Phase 7)", + RunE: notImplemented("import", "Phase 7"), +} + +var reconCmd = &cobra.Command{ + Use: "recon", + Short: "Run OSINT recon across internet sources (Phase 9+)", + RunE: notImplemented("recon", "Phase 9"), +} + +var keysCmd = &cobra.Command{ + Use: "keys", + Short: "Manage stored keys (list, export, delete) (Phase 6)", + RunE: notImplemented("keys", "Phase 6"), +} + +var serveCmd = &cobra.Command{ + Use: "serve", + Short: "Start the web dashboard (Phase 18)", + RunE: notImplemented("serve", "Phase 18"), +} + +var dorksCmd = &cobra.Command{ + Use: "dorks", + Short: "Manage and run dork queries (Phase 8)", + RunE: notImplemented("dorks", "Phase 8"), +} + +var hookCmd = &cobra.Command{ + Use: "hook", + Short: "Install or manage git pre-commit hooks (Phase 7)", + RunE: notImplemented("hook", "Phase 7"), +} + +var scheduleCmd = &cobra.Command{ + Use: "schedule", + Short: "Manage scheduled recurring scans (Phase 17)", + RunE: notImplemented("schedule", "Phase 17"), +} ``` @@ -665,26 +860,32 @@ func init() { - `go build -o keyhunter .` exits 0 - - `./keyhunter --help` shows scan, providers, config commands + - `./keyhunter --help` shows all 11 commands: scan, verify, import, recon, keys, serve, dorks, hook, schedule, providers, config - `./keyhunter providers list` prints table with >= 3 rows including "openai" - `./keyhunter providers info openai` prints Name, Tier, Keywords, Patterns, Verify URL - `./keyhunter providers stats` prints "Total providers: 3" or 
more - `./keyhunter config init` creates or updates ~/.keyhunter.yaml - `./keyhunter config set scan.workers 16` exits 0 + - `./keyhunter verify` prints "not implemented in this phase" + - `./keyhunter recon` prints "not implemented in this phase" - `./keyhunter scan testdata/samples/openai_key.txt` exits 1 (keys found) and prints a table row with "openai" - `./keyhunter scan testdata/samples/no_keys.txt` exits 0 and prints "No API keys found." + - `./keyhunter scan --output json testdata/samples/no_keys.txt` exits 0 and prints `[]` (valid JSON) + - Second run of `./keyhunter scan testdata/samples/openai_key.txt` uses the SAME salt (loaded from settings table) - `grep -q 'viper\.BindPFlag' cmd/scan.go` exits 0 + - `grep -q 'loadOrCreateEncKey' cmd/scan.go` exits 0 - Full CLI works: scan finds and persists keys, providers list/info/stats work, config init/set/get work. Phase 1 success criteria all met. + Full CLI works: scan finds and persists keys with per-installation salt, providers list/info/stats work, config init/set/get work, 8 stub commands registered and respond. Phase 1 success criteria all met. Complete Phase 1 implementation: - Provider registry with 3 YAML definitions, Aho-Corasick automaton, schema validation -- Storage layer with AES-256-GCM encryption, Argon2id key derivation, SQLite WAL mode +- Storage layer with AES-256-GCM encryption, Argon2id key derivation, SQLite WAL mode, per-installation salt - Three-stage scan engine: keyword pre-filter → regex + entropy detector → finding channel - CLI: keyhunter scan, providers list/info/stats, config init/set/get +- 8 stub commands for future phases (verify, import, recon, keys, serve, dorks, hook, schedule) Run these commands from the project root and confirm each expected output: @@ -698,23 +899,29 @@ Run these commands from the project root and confirm each expected output: 3. `./keyhunter scan testdata/samples/no_keys.txt` Expected: Exit code 0, "No API keys found." printed -4. 
`./keyhunter providers list` +4. `./keyhunter scan --output json testdata/samples/no_keys.txt` + Expected: Exit code 0, valid JSON printed: `[]` + +5. `./keyhunter providers list` Expected: Table with openai, anthropic, huggingface rows -5. `./keyhunter providers info openai` +6. `./keyhunter providers info openai` Expected: Name, Tier 1, Keywords including "sk-proj-", Pattern regex shown -6. `./keyhunter config init` +7. `./keyhunter config init` Expected: "Config initialized: ~/.keyhunter.yaml" and the file exists -7. `./keyhunter config set scan.workers 16 && ./keyhunter config get scan.workers` +8. `./keyhunter config set scan.workers 16 && ./keyhunter config get scan.workers` Expected: "Set scan.workers = 16" then "16" -8. Build the binary with production flags: - `CGO_ENABLED=0 go build -ldflags="-s -w" -o keyhunter-prod .` - Expected: Builds without error, binary produced +9. `./keyhunter verify` + Expected: "verify: not implemented in this phase (coming in Phase 5)" + +10. Build the binary with production flags: + `CGO_ENABLED=0 go build -ldflags="-s -w" -o keyhunter-prod .` + Expected: Builds without error, binary produced - Type "approved" if all 8 checks pass, or describe which check failed and what output you saw. + Type "approved" if all 10 checks pass, or describe which check failed and what output you saw. @@ -724,20 +931,24 @@ Full Phase 1 integration check: - `go test ./... -count=1` exits 0 - `./keyhunter scan testdata/samples/openai_key.txt` exits 1 with findings table - `./keyhunter scan testdata/samples/no_keys.txt` exits 0 with "No API keys found." 
+- `./keyhunter scan --output json testdata/samples/no_keys.txt` prints valid JSON `[]` - `./keyhunter providers list` shows 3+ providers - `./keyhunter config init` creates ~/.keyhunter.yaml +- `./keyhunter verify` prints "not implemented in this phase" - `CGO_ENABLED=0 go build -ldflags="-s -w" -o keyhunter-prod .` exits 0 -- Cobra CLI with scan, providers, config commands (CLI-01) +- Cobra CLI with all 11 commands: scan, verify, import, recon, keys, serve, dorks, hook, schedule, providers, config (CLI-01) - `keyhunter config init` creates ~/.keyhunter.yaml (CLI-02) - `keyhunter config set key value` persists (CLI-03) - `keyhunter providers list/info/stats` work (CLI-04) - scan flags: --workers, --verify, --unmask, --output, --exclude (CLI-05) +- Per-installation salt stored in settings table; no hardcoded salt in production code +- JSON output returns valid JSON (not a comment string) - All Phase 1 success criteria from ROADMAP.md satisfied: 1. `keyhunter scan ./somefile` runs three-stage pipeline and returns findings with provider names - 2. Findings persisted to SQLite with AES-256 encrypted key_value + 2. Findings persisted to SQLite with AES-256 encrypted key_value; raw db does not contain plaintext 3. `keyhunter config init` and `config set` work 4. `keyhunter providers list/info` return provider metadata from YAML 5. Provider YAML has format_version and last_verified, validated at load time