- Add DirSource with filepath.WalkDir recursive traversal - Default exclusions for .git, node_modules, vendor, *.min.js, *.map - Binary file detection via NUL byte sniff (first 512 bytes) - mmap reads for files >= 10MB via golang.org/x/exp/mmap - Deterministic sorted emission order for reproducible tests - Refactor FileSource to share emitChunks/isBinary helpers and mmap large files
61 lines
1.4 KiB
Go
61 lines
1.4 KiB
Go
package sources
|
|
|
|
import (
|
|
"context"
|
|
"os"
|
|
|
|
"golang.org/x/exp/mmap"
|
|
|
|
"github.com/salvacybersec/keyhunter/pkg/types"
|
|
)
|
|
|
|
// Chunking parameters shared by the sources in this package.
const (
	// defaultChunkSize is the chunk size NewFileSource assigns to a new
	// FileSource.
	defaultChunkSize = 4096

	// chunkOverlap is the overlap between consecutive chunks, so that a key
	// straddling a chunk boundary is not split.
	chunkOverlap = 256
)
|
|
|
|
// FileSource is a chunk source backed by a single file on disk.
//
// Its Chunks method loads the file and emits it in chunks. Files whose size
// is at least MmapThreshold are read through golang.org/x/exp/mmap; smaller
// files are read with os.ReadFile.
type FileSource struct {
	// Path is the location of the file to read. It is also passed to
	// emitChunks alongside the data.
	Path string

	// ChunkSize is the chunk size handed to emitChunks. NewFileSource sets it
	// to defaultChunkSize.
	ChunkSize int
}
|
|
|
|
// NewFileSource creates a FileSource for the given path with the default chunk size.
|
|
func NewFileSource(path string) *FileSource {
|
|
return &FileSource{Path: path, ChunkSize: defaultChunkSize}
|
|
}
|
|
|
|
// Chunks reads the file in overlapping segments and sends each chunk to out.
|
|
// Uses mmap for files >= MmapThreshold (10MB) and os.ReadFile for smaller files.
|
|
// Binary files (NUL byte in the first 512 bytes) are skipped.
|
|
func (f *FileSource) Chunks(ctx context.Context, out chan<- types.Chunk) error {
|
|
fi, err := os.Stat(f.Path)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
size := fi.Size()
|
|
if size == 0 {
|
|
return nil
|
|
}
|
|
var data []byte
|
|
if size >= MmapThreshold {
|
|
ra, err := mmap.Open(f.Path)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer ra.Close()
|
|
data = make([]byte, ra.Len())
|
|
if _, err := ra.ReadAt(data, 0); err != nil {
|
|
return err
|
|
}
|
|
} else {
|
|
data, err = os.ReadFile(f.Path)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
if isBinary(data) {
|
|
return nil
|
|
}
|
|
return emitChunks(ctx, data, f.Path, f.ChunkSize, out)
|
|
}
|