fix(01-foundation): address all checker blockers and warnings in phase plans
This commit is contained in:
@@ -5,7 +5,7 @@ type: execute
|
||||
wave: 2
|
||||
depends_on: [01-02]
|
||||
files_modified:
|
||||
- pkg/engine/chunk.go
|
||||
- pkg/types/chunk.go
|
||||
- pkg/engine/finding.go
|
||||
- pkg/engine/entropy.go
|
||||
- pkg/engine/filter.go
|
||||
@@ -15,7 +15,7 @@ files_modified:
|
||||
- pkg/engine/sources/file.go
|
||||
- pkg/engine/scanner_test.go
|
||||
autonomous: true
|
||||
requirements: [CORE-01, CORE-04, CORE-05, CORE-06, CORE-07]
|
||||
requirements: [CORE-01, CORE-04, CORE-05, CORE-06]
|
||||
|
||||
must_haves:
|
||||
truths:
|
||||
@@ -26,8 +26,8 @@ must_haves:
|
||||
- "Full scan pipeline: scan testdata/samples/no_keys.txt → zero findings"
|
||||
- "Worker pool uses ants v2 with configurable worker count"
|
||||
artifacts:
|
||||
- path: "pkg/engine/chunk.go"
|
||||
provides: "Chunk struct (Data []byte, Source string, Offset int64)"
|
||||
- path: "pkg/types/chunk.go"
|
||||
provides: "Chunk struct (Data []byte, Source string, Offset int64) — shared by engine and sources packages"
|
||||
exports: ["Chunk"]
|
||||
- path: "pkg/engine/finding.go"
|
||||
provides: "Finding struct (provider, key value, masked, confidence, source, line)"
|
||||
@@ -40,12 +40,12 @@ must_haves:
|
||||
exports: ["KeywordFilter"]
|
||||
- path: "pkg/engine/detector.go"
|
||||
provides: "Detector stage — applies provider regexps and entropy check to chunks"
|
||||
exports: ["Detector"]
|
||||
exports: ["Detect"]
|
||||
- path: "pkg/engine/engine.go"
|
||||
provides: "Engine struct with Scan(ctx, src, cfg) <-chan Finding"
|
||||
exports: ["Engine", "NewEngine", "ScanConfig"]
|
||||
- path: "pkg/engine/sources/source.go"
|
||||
provides: "Source interface with Chunks(ctx, chan<- Chunk) error"
|
||||
provides: "Source interface with Chunks(ctx, chan<- types.Chunk) error"
|
||||
exports: ["Source"]
|
||||
- path: "pkg/engine/sources/file.go"
|
||||
provides: "FileSource implementing Source for single-file scanning"
|
||||
@@ -67,13 +67,19 @@ must_haves:
|
||||
to: "github.com/panjf2000/ants/v2"
|
||||
via: "ants.NewPool for detector workers"
|
||||
pattern: "ants\\.NewPool"
|
||||
- from: "pkg/engine/sources/source.go"
|
||||
to: "pkg/types/chunk.go"
|
||||
via: "Source interface uses types.Chunk — avoids circular import with pkg/engine"
|
||||
pattern: "types\\.Chunk"
|
||||
---
|
||||
|
||||
<objective>
|
||||
Build the three-stage scanning engine pipeline: Aho-Corasick keyword pre-filter, regex + entropy detector workers using ants goroutine pool, and a FileSource adapter. Wire them together in an Engine that emits Findings on a channel.
|
||||
|
||||
Purpose: The scan engine is the core differentiator. Plans 02 and 03 provide its dependencies (Registry for patterns + keywords, storage types for Finding). The CLI (Plan 05) calls Engine.Scan() to implement `keyhunter scan`.
|
||||
Output: pkg/engine/{chunk,finding,entropy,filter,detector,engine}.go and sources/{source,file}.go. scanner_test.go stubs filled.
|
||||
Output: pkg/types/chunk.go, pkg/engine/{finding,entropy,filter,detector,engine}.go and sources/{source,file}.go. scanner_test.go stubs filled.
|
||||
|
||||
NOTE on CORE-07 (mmap large file reading): FileSource uses os.ReadFile() in Phase 1, which is sufficient for the test fixtures. mmap-based reading for files > 10MB is deferred to Phase 4 (Input Sources) where it belongs architecturally alongside all other source adapter work.
|
||||
</objective>
|
||||
|
||||
<execution_context>
|
||||
@@ -86,6 +92,16 @@ Output: pkg/engine/{chunk,finding,entropy,filter,detector,engine}.go and sources
|
||||
@.planning/phases/01-foundation/01-02-SUMMARY.md
|
||||
|
||||
<interfaces>
|
||||
<!-- IMPORTANT: Circular import prevention -->
|
||||
The sources sub-package (pkg/engine/sources) needs the Chunk type.
|
||||
If Chunk were defined in pkg/engine, then sources would import engine, and engine imports
|
||||
sources (for the Source interface) — a circular import. Go will refuse to compile.
|
||||
|
||||
Resolution: Define Chunk in pkg/types (a shared, import-free package):
|
||||
pkg/types/chunk.go — defines types.Chunk
|
||||
pkg/engine/sources — imports pkg/types (no circular dep)
|
||||
pkg/engine — imports pkg/types and pkg/engine/sources (no circular dep)
|
||||
|
||||
<!-- Provider Registry types (from Plan 02) -->
|
||||
package providers
|
||||
|
||||
@@ -107,9 +123,9 @@ func (r *Registry) List() []Provider
|
||||
func (r *Registry) AC() ahocorasick.AhoCorasick // pre-built Aho-Corasick
|
||||
|
||||
<!-- Three-stage pipeline pattern from RESEARCH.md Pattern 2 -->
|
||||
chunksChan chan Chunk (buffer: 1000)
|
||||
detectableChan chan Chunk (buffer: 500)
|
||||
resultsChan chan Finding (buffer: 100)
|
||||
chunksChan chan types.Chunk (buffer: 1000)
|
||||
detectableChan chan types.Chunk (buffer: 500)
|
||||
resultsChan chan Finding (buffer: 100)
|
||||
|
||||
Stage 1: Source.Chunks() → chunksChan (goroutine, closes chan on done)
|
||||
Stage 2: KeywordFilter(chunksChan) → detectableChan (goroutine, AC.FindAll)
|
||||
@@ -124,7 +140,7 @@ type ScanConfig struct {
|
||||
|
||||
<!-- Source interface -->
|
||||
type Source interface {
|
||||
Chunks(ctx context.Context, out chan<- Chunk) error
|
||||
Chunks(ctx context.Context, out chan<- types.Chunk) error
|
||||
}
|
||||
|
||||
<!-- FileSource -->
|
||||
@@ -151,8 +167,8 @@ import "github.com/panjf2000/ants/v2"
|
||||
<tasks>
|
||||
|
||||
<task type="auto" tdd="true">
|
||||
<name>Task 1: Core types and Shannon entropy function</name>
|
||||
<files>pkg/engine/chunk.go, pkg/engine/finding.go, pkg/engine/entropy.go</files>
|
||||
<name>Task 1: Shared types package, Finding, and Shannon entropy function</name>
|
||||
<files>pkg/types/chunk.go, pkg/engine/finding.go, pkg/engine/entropy.go</files>
|
||||
<read_first>
|
||||
- /home/salva/Documents/apikey/.planning/phases/01-foundation/01-RESEARCH.md (CORE-04 row: Shannon entropy, ~10-line stdlib function, threshold 3.5 bits/char)
|
||||
- /home/salva/Documents/apikey/pkg/storage/findings.go (Finding and MaskKey defined there — engine.Finding is a separate type for the pipeline)
|
||||
@@ -166,11 +182,13 @@ import "github.com/panjf2000/ants/v2"
|
||||
- Test 6: MaskKey("abc") → "****" (too short to mask)
|
||||
</behavior>
|
||||
<action>
|
||||
Create **pkg/engine/chunk.go**:
|
||||
Create **pkg/types/chunk.go** — the shared type that breaks the circular import:
|
||||
```go
|
||||
package engine
|
||||
package types
|
||||
|
||||
// Chunk is a segment of file content passed through the scanning pipeline.
|
||||
// Defined in pkg/types (not pkg/engine) so that pkg/engine/sources can use it
|
||||
// without creating a circular import with pkg/engine.
|
||||
type Chunk struct {
|
||||
Data []byte // raw bytes
|
||||
Source string // file path, URL, or description
|
||||
@@ -236,18 +254,18 @@ func Shannon(s string) float64 {
|
||||
```
|
||||
</action>
|
||||
<verify>
|
||||
<automated>cd /home/salva/Documents/apikey && go build ./pkg/engine/... && echo "BUILD OK"</automated>
|
||||
<automated>cd /home/salva/Documents/apikey && go build ./pkg/types/... && go build ./pkg/engine/... && echo "BUILD OK"</automated>
|
||||
</verify>
|
||||
<acceptance_criteria>
|
||||
- `go build ./pkg/types/...` exits 0
|
||||
- `go build ./pkg/engine/...` exits 0
|
||||
- pkg/engine/chunk.go exports Chunk with fields Data, Source, Offset
|
||||
- pkg/types/chunk.go exports Chunk with fields Data, Source, Offset
|
||||
- pkg/engine/finding.go exports Finding and MaskKey
|
||||
- pkg/engine/entropy.go exports Shannon using math.Log2
|
||||
- `grep -q 'math\.Log2' pkg/engine/entropy.go` exits 0
|
||||
- Shannon("aaaaaaa") == 0.0 (manually verifiable from code)
|
||||
- MaskKey("sk-proj-abc1234") produces "sk-proj-...1234"
|
||||
</acceptance_criteria>
|
||||
<done>Chunk, Finding, MaskKey, and Shannon exist and compile. Shannon uses stdlib math only — no external library.</done>
|
||||
<done>types.Chunk exists in pkg/types (no imports, so no circular dependency risk); Finding, MaskKey, and Shannon exist in pkg/engine and compile.</done>
|
||||
</task>
|
||||
|
||||
<task type="auto" tdd="true">
|
||||
@@ -262,7 +280,8 @@ func Shannon(s string) float64 {
|
||||
</files>
|
||||
<read_first>
|
||||
- /home/salva/Documents/apikey/.planning/phases/01-foundation/01-RESEARCH.md (Pattern 2: Three-Stage Scanning Pipeline — exact channel-based code example)
|
||||
- /home/salva/Documents/apikey/pkg/engine/chunk.go
|
||||
- /home/salva/Documents/apikey/pkg/types/chunk.go
|
||||
- /home/salva/Documents/apikey/pkg/engine/chunk.go (if it exists — use pkg/types/chunk.go instead)
|
||||
- /home/salva/Documents/apikey/pkg/engine/finding.go
|
||||
- /home/salva/Documents/apikey/pkg/engine/entropy.go
|
||||
- /home/salva/Documents/apikey/pkg/providers/registry.go (Registry.AC() and Registry.List() signatures)
|
||||
@@ -283,13 +302,15 @@ package sources
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/engine"
|
||||
"github.com/salvacybersec/keyhunter/pkg/types"
|
||||
)
|
||||
|
||||
// Source is the interface all input adapters must implement.
|
||||
// Chunks writes content segments to the out channel until the source is exhausted or ctx is cancelled.
|
||||
// NOTE: Source is defined in the sources sub-package (not pkg/engine) and uses pkg/types.Chunk
|
||||
// to avoid a circular import: engine → sources → engine.
|
||||
type Source interface {
|
||||
Chunks(ctx context.Context, out chan<- engine.Chunk) error
|
||||
Chunks(ctx context.Context, out chan<- types.Chunk) error
|
||||
}
|
||||
```
|
||||
|
||||
@@ -301,7 +322,7 @@ import (
|
||||
"context"
|
||||
"os"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/engine"
|
||||
"github.com/salvacybersec/keyhunter/pkg/types"
|
||||
)
|
||||
|
||||
const defaultChunkSize = 4096
|
||||
@@ -319,7 +340,9 @@ func NewFileSource(path string) *FileSource {
|
||||
}
|
||||
|
||||
// Chunks reads the file in overlapping segments and sends each chunk to out.
|
||||
func (f *FileSource) Chunks(ctx context.Context, out chan<- engine.Chunk) error {
|
||||
// Uses os.ReadFile for simplicity in Phase 1. mmap for files > 10MB will be implemented
|
||||
// in Phase 4 (Input Sources) alongside all other source adapter enhancements.
|
||||
func (f *FileSource) Chunks(ctx context.Context, out chan<- types.Chunk) error {
|
||||
data, err := os.ReadFile(f.Path)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -333,7 +356,7 @@ func (f *FileSource) Chunks(ctx context.Context, out chan<- engine.Chunk) error
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case out <- engine.Chunk{Data: data, Source: f.Path, Offset: 0}:
|
||||
case out <- types.Chunk{Data: data, Source: f.Path, Offset: 0}:
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -344,7 +367,7 @@ func (f *FileSource) Chunks(ctx context.Context, out chan<- engine.Chunk) error
|
||||
if end > len(data) {
|
||||
end = len(data)
|
||||
}
|
||||
chunk := engine.Chunk{
|
||||
chunk := types.Chunk{
|
||||
Data: data[start:end],
|
||||
Source: f.Path,
|
||||
Offset: offset,
|
||||
@@ -369,12 +392,13 @@ package engine
|
||||
|
||||
import (
|
||||
ahocorasick "github.com/petar-dambovaliev/aho-corasick"
|
||||
"github.com/salvacybersec/keyhunter/pkg/types"
|
||||
)
|
||||
|
||||
// KeywordFilter filters a stream of chunks using an Aho-Corasick automaton.
|
||||
// Only chunks that contain at least one provider keyword are sent to out.
|
||||
// This is Stage 2 of the pipeline (runs after Source, before Detector).
|
||||
func KeywordFilter(ac ahocorasick.AhoCorasick, in <-chan Chunk, out chan<- Chunk) {
|
||||
func KeywordFilter(ac ahocorasick.AhoCorasick, in <-chan types.Chunk, out chan<- types.Chunk) {
|
||||
for chunk := range in {
|
||||
if len(ac.FindAll(string(chunk.Data))) > 0 {
|
||||
out <- chunk
|
||||
@@ -393,11 +417,12 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/types"
|
||||
)
|
||||
|
||||
// Detector applies provider regex patterns and optional entropy checks to a chunk.
|
||||
// Detect applies provider regex patterns and optional entropy checks to a chunk.
|
||||
// It returns all findings from the chunk.
|
||||
func Detect(chunk Chunk, providerList []providers.Provider) []Finding {
|
||||
func Detect(chunk types.Chunk, providerList []providers.Provider) []Finding {
|
||||
var findings []Finding
|
||||
content := string(chunk.Data)
|
||||
|
||||
@@ -452,8 +477,9 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/panjf2000/ants/v2"
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/engine/sources"
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/types"
|
||||
)
|
||||
|
||||
// ScanConfig controls scan execution parameters.
|
||||
@@ -482,9 +508,10 @@ func (e *Engine) Scan(ctx context.Context, src sources.Source, cfg ScanConfig) (
|
||||
workers = runtime.NumCPU() * 8
|
||||
}
|
||||
|
||||
chunksChan := make(chan Chunk, 1000)
|
||||
detectableChan := make(chan Chunk, 500)
|
||||
resultsChan := make(chan Finding, 100)
|
||||
// Declare each channel on its own line: Go statements must be separated by newlines or semicolons.
|
||||
chunksChan := make(chan types.Chunk, 1000)
|
||||
detectableChan := make(chan types.Chunk, 500)
|
||||
resultsChan := make(chan Finding, 100)
|
||||
|
||||
// Stage 1: source → chunksChan
|
||||
go func() {
|
||||
@@ -517,7 +544,7 @@ func (e *Engine) Scan(ctx context.Context, src sources.Source, cfg ScanConfig) (
|
||||
}()
|
||||
|
||||
for chunk := range detectableChan {
|
||||
c := chunk // capture
|
||||
c := chunk // capture loop variable
|
||||
wg.Add(1)
|
||||
_ = pool.Submit(func() {
|
||||
defer wg.Done()
|
||||
@@ -645,6 +672,7 @@ func TestScannerPipelineMultipleKeys(t *testing.T) {
|
||||
</verify>
|
||||
<acceptance_criteria>
|
||||
- `go test ./pkg/engine/... -v -count=1` exits 0 with all tests PASS (no SKIP)
|
||||
- `go build ./...` exits 0 with no circular import errors
|
||||
- TestShannonEntropy passes — 0.0 for "aaaaaaa", >= 3.5 for real key pattern
|
||||
- TestKeywordPreFilter passes — AC matches sk-proj-, empty for "hello world"
|
||||
- TestScannerPipelineOpenAI passes — 1 finding with ProviderName=="openai"
|
||||
@@ -652,19 +680,20 @@ func TestScannerPipelineMultipleKeys(t *testing.T) {
|
||||
- TestScannerPipelineMultipleKeys passes — >= 2 findings with both provider names
|
||||
- `grep -q 'ants\.NewPool' pkg/engine/engine.go` exits 0
|
||||
- `grep -q 'KeywordFilter' pkg/engine/engine.go` exits 0
|
||||
- `go build ./...` still exits 0
|
||||
- pkg/types/chunk.go exists and pkg/engine/sources imports pkg/types (not pkg/engine)
|
||||
</acceptance_criteria>
|
||||
<done>Three-stage scanning pipeline works end-to-end: FileSource → KeywordFilter (AC) → Detect (regex + entropy) → Finding channel. All engine tests pass.</done>
|
||||
<done>Three-stage scanning pipeline works end-to-end: FileSource → KeywordFilter (AC) → Detect (regex + entropy) → Finding channel. Circular import resolved via pkg/types. All engine tests pass.</done>
|
||||
</task>
|
||||
|
||||
</tasks>
|
||||
|
||||
<verification>
|
||||
After both tasks:
|
||||
- `go test ./pkg/engine/... -v -count=1` exits 0 with 6 tests PASS
|
||||
- `go build ./...` exits 0
|
||||
- `go build ./...` exits 0 with zero circular import errors
|
||||
- `go test ./pkg/engine/... -v -count=1` exits 0 with all tests PASS
|
||||
- `grep -q 'ants\.NewPool' pkg/engine/engine.go` exits 0
|
||||
- `grep -q 'math\.Log2' pkg/engine/entropy.go` exits 0
|
||||
- `grep -rq 'pkg/types' pkg/engine/sources/source.go` exits 0 (sources imports types, not engine)
|
||||
- Scanning testdata/samples/openai_key.txt returns 1 finding with provider "openai"
|
||||
- Scanning testdata/samples/no_keys.txt returns 0 findings
|
||||
</verification>
|
||||
@@ -673,7 +702,8 @@ After both tasks:
|
||||
- Three-stage pipeline: AC pre-filter → regex + entropy detector → results channel (CORE-01, CORE-06)
|
||||
- Shannon entropy function using stdlib math (CORE-04)
|
||||
- ants v2 goroutine pool with configurable worker count (CORE-05)
|
||||
- FileSource adapter reading files in overlapping chunks (CORE-07 partial — full mmap in Phase 4)
|
||||
- FileSource adapter reading files in overlapping chunks using os.ReadFile (mmap deferred to Phase 4)
|
||||
- pkg/types/Chunk breaks the engine↔sources circular import
|
||||
- All engine tests pass against real testdata fixtures
|
||||
</success_criteria>
|
||||
|
||||
|
||||
Reference in New Issue
Block a user