- pkg/engine/sources/source.go: Source interface using pkg/types.Chunk
- pkg/engine/sources/file.go: FileSource with overlapping chunk reads
- pkg/engine/filter.go: KeywordFilter using Aho-Corasick pre-filter
- pkg/engine/detector.go: Detect with regex matching + Shannon entropy check
- pkg/engine/engine.go: Engine.Scan orchestrating 3-stage pipeline with ants pool
- pkg/engine/scanner_test.go: filled test stubs with pipeline integration tests
- testdata/samples: fixed anthropic key lengths to match {93,} regex pattern
69 lines
1.6 KiB
Go
69 lines
1.6 KiB
Go
package sources
|
|
|
|
import (
|
|
"context"
|
|
"os"
|
|
|
|
"github.com/salvacybersec/keyhunter/pkg/types"
|
|
)
|
|
|
|
const (
	// defaultChunkSize is the chunk length, in bytes, used when a FileSource
	// does not specify a positive ChunkSize.
	defaultChunkSize = 4096

	// chunkOverlap is the number of bytes shared by consecutive chunks so that
	// a key straddling a chunk boundary still appears whole in one of them.
	chunkOverlap = 256
)
|
|
|
|
// FileSource is a chunk source backed by a single file; its Chunks method
// emits the file contents as overlapping chunks.
type FileSource struct {
	// Path is the location of the file to read.
	Path string

	// ChunkSize is the maximum length of an emitted chunk in bytes. Chunks
	// falls back to the package default when it is zero or negative.
	ChunkSize int
}
|
|
|
|
// NewFileSource creates a FileSource for the given path with the default chunk size.
|
|
func NewFileSource(path string) *FileSource {
|
|
return &FileSource{Path: path, ChunkSize: defaultChunkSize}
|
|
}
|
|
|
|
// Chunks reads the file in overlapping segments and sends each chunk to out.
|
|
// Uses os.ReadFile for simplicity in Phase 1. mmap for files > 10MB is implemented
|
|
// in Phase 4 (Input Sources) alongside all other source adapter enhancements.
|
|
func (f *FileSource) Chunks(ctx context.Context, out chan<- types.Chunk) error {
|
|
data, err := os.ReadFile(f.Path)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
size := f.ChunkSize
|
|
if size <= 0 {
|
|
size = defaultChunkSize
|
|
}
|
|
if len(data) <= size {
|
|
// File fits in one chunk
|
|
select {
|
|
case <-ctx.Done():
|
|
return ctx.Err()
|
|
case out <- types.Chunk{Data: data, Source: f.Path, Offset: 0}:
|
|
}
|
|
return nil
|
|
}
|
|
// Emit overlapping chunks
|
|
var offset int64
|
|
for start := 0; start < len(data); start += size - chunkOverlap {
|
|
end := start + size
|
|
if end > len(data) {
|
|
end = len(data)
|
|
}
|
|
chunk := types.Chunk{
|
|
Data: data[start:end],
|
|
Source: f.Path,
|
|
Offset: offset,
|
|
}
|
|
select {
|
|
case <-ctx.Done():
|
|
return ctx.Err()
|
|
case out <- chunk:
|
|
}
|
|
offset += int64(end - start)
|
|
if end == len(data) {
|
|
break
|
|
}
|
|
}
|
|
return nil
|
|
}
|