feat(04-02): implement DirSource with recursive walk, glob exclusion, and mmap
- Add DirSource with filepath.WalkDir recursive traversal
- Default exclusions for .git, node_modules, vendor, *.min.js, *.map
- Binary file detection via NUL byte sniff (first 512 bytes)
- mmap reads for files >= 10MB via golang.org/x/exp/mmap
- Deterministic sorted emission order for reproducible tests
- Refactor FileSource to share emitChunks/isBinary helpers and mmap large files
This commit is contained in:
@@ -4,6 +4,8 @@ import (
|
||||
"context"
|
||||
"os"
|
||||
|
||||
"golang.org/x/exp/mmap"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/types"
|
||||
)
|
||||
|
||||
@@ -11,6 +13,7 @@ const defaultChunkSize = 4096
|
||||
const chunkOverlap = 256 // overlap between chunks to avoid splitting keys at boundaries
|
||||
|
||||
// FileSource reads a single file and emits overlapping chunks.
|
||||
// For files >= MmapThreshold it opens the file with golang.org/x/exp/mmap and copies the contents into a buffer.
|
||||
type FileSource struct {
|
||||
Path string
|
||||
ChunkSize int
|
||||
@@ -22,47 +25,36 @@ func NewFileSource(path string) *FileSource {
|
||||
}
|
||||
|
||||
// Chunks reads the file in overlapping segments and sends each chunk to out.
|
||||
// Uses os.ReadFile for simplicity in Phase 1. mmap for files > 10MB is implemented
|
||||
// in Phase 4 (Input Sources) alongside all other source adapter enhancements.
|
||||
// Uses mmap for files >= MmapThreshold (10MB) and os.ReadFile for smaller files.
|
||||
// Binary files (NUL byte in the first 512 bytes) are skipped.
|
||||
func (f *FileSource) Chunks(ctx context.Context, out chan<- types.Chunk) error {
|
||||
data, err := os.ReadFile(f.Path)
|
||||
fi, err := os.Stat(f.Path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
size := f.ChunkSize
|
||||
if size <= 0 {
|
||||
size = defaultChunkSize
|
||||
}
|
||||
if len(data) <= size {
|
||||
// File fits in one chunk
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case out <- types.Chunk{Data: data, Source: f.Path, Offset: 0}:
|
||||
}
|
||||
size := fi.Size()
|
||||
if size == 0 {
|
||||
return nil
|
||||
}
|
||||
// Emit overlapping chunks
|
||||
var offset int64
|
||||
for start := 0; start < len(data); start += size - chunkOverlap {
|
||||
end := start + size
|
||||
if end > len(data) {
|
||||
end = len(data)
|
||||
var data []byte
|
||||
if size >= MmapThreshold {
|
||||
ra, err := mmap.Open(f.Path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
chunk := types.Chunk{
|
||||
Data: data[start:end],
|
||||
Source: f.Path,
|
||||
Offset: offset,
|
||||
defer ra.Close()
|
||||
data = make([]byte, ra.Len())
|
||||
if _, err := ra.ReadAt(data, 0); err != nil {
|
||||
return err
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case out <- chunk:
|
||||
}
|
||||
offset += int64(end - start)
|
||||
if end == len(data) {
|
||||
break
|
||||
} else {
|
||||
data, err = os.ReadFile(f.Path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
if isBinary(data) {
|
||||
return nil
|
||||
}
|
||||
return emitChunks(ctx, data, f.Path, f.ChunkSize, out)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user