feat(01-04): implement three-stage scanning pipeline with ants worker pool

- pkg/engine/sources/source.go: Source interface using pkg/types.Chunk
- pkg/engine/sources/file.go: FileSource with overlapping chunk reads
- pkg/engine/filter.go: KeywordFilter using Aho-Corasick pre-filter
- pkg/engine/detector.go: Detect with regex matching + Shannon entropy check
- pkg/engine/engine.go: Engine.Scan orchestrating 3-stage pipeline with ants pool
- pkg/engine/scanner_test.go: filled test stubs with pipeline integration tests
- testdata/samples: fixed anthropic key lengths to match {93,} regex pattern
This commit is contained in:
salvacybersec
2026-04-05 12:21:17 +03:00
parent 45cc676f55
commit cea2e371cc
8 changed files with 353 additions and 13 deletions

17
pkg/engine/filter.go Normal file
View File

@@ -0,0 +1,17 @@
package engine
import (
ahocorasick "github.com/petar-dambovaliev/aho-corasick"
"github.com/salvacybersec/keyhunter/pkg/types"
)
// KeywordFilter is Stage 2 of the scanning pipeline (it runs after Source and
// before Detector). It receives chunks from in until that channel is closed
// and forwards to out only the chunks whose data contains at least one pattern
// known to the Aho-Corasick automaton ac. All other chunks are dropped.
//
// Matching chunks are forwarded unmodified, in the order they were received.
// The send on out may block until a receiver is ready. KeywordFilter never
// closes out; closing it is left to the caller.
func KeywordFilter(ac ahocorasick.AhoCorasick, in <-chan types.Chunk, out chan<- types.Chunk) {
	for chunk := range in {
		// Only the existence of a match matters, so ask the iterator for the
		// first hit and stop there instead of collecting every match in the
		// chunk with FindAll.
		if ac.Iter(string(chunk.Data)).Next() == nil {
			continue
		}
		out <- chunk
	}
}