feat(01-04): implement three-stage scanning pipeline with ants worker pool
- pkg/engine/sources/source.go: Source interface using pkg/types.Chunk
- pkg/engine/sources/file.go: FileSource with overlapping chunk reads
- pkg/engine/filter.go: KeywordFilter using Aho-Corasick pre-filter
- pkg/engine/detector.go: Detect with regex matching + Shannon entropy check
- pkg/engine/engine.go: Engine.Scan orchestrating 3-stage pipeline with ants pool
- pkg/engine/scanner_test.go: filled test stubs with pipeline integration tests
- testdata/samples: fixed Anthropic key lengths to match the {93,} regex pattern
This commit is contained in:
55
pkg/engine/detector.go
Normal file
55
pkg/engine/detector.go
Normal file
@@ -0,0 +1,55 @@
|
||||
package engine
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/providers"
|
||||
"github.com/salvacybersec/keyhunter/pkg/types"
|
||||
)
|
||||
|
||||
// Detect applies provider regex patterns and optional entropy checks to a chunk.
|
||||
// It returns all findings from the chunk.
|
||||
func Detect(chunk types.Chunk, providerList []providers.Provider) []Finding {
|
||||
var findings []Finding
|
||||
content := string(chunk.Data)
|
||||
|
||||
for _, p := range providerList {
|
||||
for _, pat := range p.Patterns {
|
||||
re, err := regexp.Compile(pat.Regex)
|
||||
if err != nil {
|
||||
continue // invalid regex -- skip silently
|
||||
}
|
||||
matches := re.FindAllString(content, -1)
|
||||
for _, match := range matches {
|
||||
// Apply entropy check if threshold is set
|
||||
if pat.EntropyMin > 0 && Shannon(match) < pat.EntropyMin {
|
||||
continue // too low entropy -- likely a placeholder
|
||||
}
|
||||
line := lineNumber(content, match)
|
||||
findings = append(findings, Finding{
|
||||
ProviderName: p.Name,
|
||||
KeyValue: match,
|
||||
KeyMasked: MaskKey(match),
|
||||
Confidence: pat.Confidence,
|
||||
Source: chunk.Source,
|
||||
SourceType: "file",
|
||||
LineNumber: line,
|
||||
Offset: chunk.Offset,
|
||||
DetectedAt: time.Now(),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
return findings
|
||||
}
|
||||
|
||||
// lineNumber reports the 1-based line on which the first occurrence of match
// begins inside content. It returns 0 when match does not occur in content.
func lineNumber(content, match string) int {
	if pos := strings.Index(content, match); pos >= 0 {
		// Every newline before the match start pushes it one line further down.
		preceding := content[:pos]
		return 1 + strings.Count(preceding, "\n")
	}
	return 0
}
|
||||
Reference in New Issue
Block a user