From 6f834c9c06d4ae1b5377a0ebc7edf6d4bee0b4f4 Mon Sep 17 00:00:00 2001
From: salvacybersec
Date: Sun, 5 Apr 2026 15:18:10 +0300
Subject: [PATCH] feat(04-02): implement DirSource with recursive walk, glob exclusion, and mmap

- Add DirSource with filepath.WalkDir recursive traversal
- Default exclusions for .git, node_modules, vendor, *.min.js, *.map
- Binary file detection via NUL byte sniff (first 512 bytes)
- mmap reads for files >= 10MB via golang.org/x/exp/mmap
- Deterministic sorted emission order for reproducible tests
- Refactor FileSource to share emitChunks/isBinary helpers and mmap large files
---
 pkg/engine/sources/dir.go  | 218 +++++++++++++++++++++++++++++++++++++
 pkg/engine/sources/file.go |  58 +++++-----
 2 files changed, 243 insertions(+), 33 deletions(-)
 create mode 100644 pkg/engine/sources/dir.go

diff --git a/pkg/engine/sources/dir.go b/pkg/engine/sources/dir.go
new file mode 100644
index 0000000..c9e844c
--- /dev/null
+++ b/pkg/engine/sources/dir.go
@@ -0,0 +1,218 @@
+package sources
+
+import (
+	"bytes"
+	"context"
+	"errors"
+	"fmt"
+	"io/fs"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+
+	"golang.org/x/exp/mmap"
+
+	"github.com/salvacybersec/keyhunter/pkg/types"
+)
+
+// MmapThreshold is the file size in bytes at or above which DirSource/FileSource read via mmap.
+const MmapThreshold int64 = 10 * 1024 * 1024 // 10 MB
+
+// BinarySniffSize is the number of leading bytes inspected for a NUL byte
+// to classify a file as binary and skip it.
+const BinarySniffSize = 512
+
+// DefaultExcludes are the patterns NewDirSource always applies (NewDirSourceRaw omits them):
+// "name/**" skips any entry called name and its subtree; the rest are globs on basename or relative path.
+var DefaultExcludes = []string{
+	".git/**",
+	"node_modules/**",
+	"vendor/**",
+	"*.min.js",
+	"*.map",
+}
+
+// DirSource walks a directory recursively and emits Chunks for every
+// non-excluded, non-binary file it finds. Files of at least MmapThreshold
+// bytes are read via mmap; smaller files use os.ReadFile.
+type DirSource struct {
+	// Root is the directory to scan. Chunks rejects an empty value and any
+	// path that is not a directory.
+	Root string
+	// Excludes lists glob patterns tested against each entry's basename AND
+	// its slash-separated path relative to Root.
+	Excludes []string
+	// ChunkSize is handed to emitChunks for every file that is read.
+	ChunkSize int
+}
+
+// NewDirSource creates a DirSource whose exclude list is DefaultExcludes
+// followed by extraExcludes. The list is built in a fresh slice, so the
+// returned source does not alias the package-level DefaultExcludes.
+func NewDirSource(root string, extraExcludes ...string) *DirSource {
+	merged := make([]string, 0, len(DefaultExcludes)+len(extraExcludes))
+	merged = append(merged, DefaultExcludes...)
+	merged = append(merged, extraExcludes...)
+	return &DirSource{Root: root, Excludes: merged, ChunkSize: defaultChunkSize}
+}
+
+// NewDirSourceRaw creates a DirSource with ONLY the caller-supplied excludes
+// (no defaults). Useful for tests and advanced users. The slice is stored
+// as given, not copied.
+func NewDirSourceRaw(root string, excludes []string) *DirSource {
+	return &DirSource{Root: root, Excludes: excludes, ChunkSize: defaultChunkSize}
+}
+
+// Chunks implements Source. It validates d.Root, walks it to collect every
+// non-excluded file path, sorts the paths, and then reads each file through
+// emitFile, which pushes the resulting chunks onto out.
+//
+// Cancellation of ctx is honoured both while walking the tree and between
+// files; in either case the context's error is returned unwrapped. A walk
+// error aborts the scan. Any other per-file error is skipped so that one
+// unreadable file does not end the scan.
+func (d *DirSource) Chunks(ctx context.Context, out chan<- types.Chunk) error {
+	if d.Root == "" {
+		return errors.New("DirSource: Root is empty")
+	}
+	info, err := os.Stat(d.Root)
+	if err != nil {
+		return fmt.Errorf("DirSource: stat root: %w", err)
+	}
+	if !info.IsDir() {
+		return fmt.Errorf("DirSource: root %q is not a directory", d.Root)
+	}
+
+	// Collect paths first for deterministic ordering across runs.
+	var paths []string
+	err = filepath.WalkDir(d.Root, func(path string, de fs.DirEntry, werr error) error {
+		if werr != nil {
+			return werr
+		}
+		// Large trees can take a long time to walk; stop as soon as the
+		// caller gives up instead of only noticing once emission starts.
+		if cerr := ctx.Err(); cerr != nil {
+			return cerr
+		}
+		if de.IsDir() && path == d.Root {
+			return nil
+		}
+		rel, rerr := filepath.Rel(d.Root, path)
+		if rerr != nil {
+			// Should not happen for paths produced by walking d.Root; fall
+			// back to the walked path rather than matching against "".
+			rel = path
+		}
+		excluded := d.isExcluded(rel, de.Name())
+		switch {
+		case de.IsDir() && excluded:
+			// Prune the whole subtree.
+			return filepath.SkipDir
+		case de.IsDir(), excluded:
+			return nil
+		}
+		paths = append(paths, path)
+		return nil
+	})
+	if err != nil {
+		if cerr := ctx.Err(); cerr != nil {
+			return cerr
+		}
+		return fmt.Errorf("DirSource: walk: %w", err)
+	}
+	sort.Strings(paths)
+
+	for _, p := range paths {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+		if err := d.emitFile(ctx, p, out); err != nil {
+			// Per-file errors are non-fatal: continue walking, but respect ctx.
+			if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
+				return err
+			}
+			// Swallow per-file errors; the engine logs elsewhere.
+			continue
+		}
+	}
+	return nil
+}
+
+// isExcluded returns true if either the relative path or the basename matches
+// any configured glob pattern. Both rel and the patterns are normalised to
+// forward slashes before matching. A malformed pattern makes filepath.Match
+// return an error, which is deliberately treated as "no match".
+func (d *DirSource) isExcluded(rel, base string) bool {
+	rel = filepath.ToSlash(rel)
+	for _, pat := range d.Excludes {
+		pat = filepath.ToSlash(pat)
+		// Match against basename.
+		if ok, _ := filepath.Match(pat, base); ok {
+			return true
+		}
+		// Match against full relative path.
+		if ok, _ := filepath.Match(pat, rel); ok {
+			return true
+		}
+		// `dir/**` style — naive prefix match: the entry is the named
+		// directory itself, lives beneath it, or has that basename at any depth.
+		if strings.HasSuffix(pat, "/**") {
+			prefix := strings.TrimSuffix(pat, "/**")
+			if rel == prefix || strings.HasPrefix(rel, prefix+"/") || base == prefix {
+				return true
+			}
+		}
+	}
+	return false
+}
+
+// emitFile reads a single file and pushes its chunks onto out.
+// Empty files and files classified as binary by isBinary are skipped
+// without error. Files smaller than MmapThreshold are read with os.ReadFile.
+// Larger files are opened with mmap.Open; only their first BinarySniffSize
+// bytes are inspected before the full content is copied out, so a large
+// binary file never costs a full-size allocation.
+func (d *DirSource) emitFile(ctx context.Context, path string, out chan<- types.Chunk) error {
+	fi, err := os.Stat(path)
+	if err != nil {
+		return err
+	}
+	size := fi.Size()
+	if size == 0 {
+		return nil
+	}
+
+	if size < MmapThreshold {
+		data, err := os.ReadFile(path)
+		if err != nil {
+			return err
+		}
+		if isBinary(data) {
+			return nil
+		}
+		return emitChunks(ctx, data, path, d.ChunkSize, out)
+	}
+
+	ra, err := mmap.Open(path)
+	if err != nil {
+		return fmt.Errorf("mmap open %s: %w", path, err)
+	}
+	defer ra.Close()
+
+	total := ra.Len()
+	if total == 0 {
+		// The file was truncated between Stat and Open; treat it like any
+		// other empty file.
+		return nil
+	}
+
+	// Sniff the head of the mapping first so binaries are rejected before
+	// the whole file is copied onto the heap.
+	sniffLen := BinarySniffSize
+	if total < sniffLen {
+		sniffLen = total
+	}
+	head := make([]byte, sniffLen)
+	if _, err := ra.ReadAt(head, 0); err != nil {
+		return fmt.Errorf("mmap read %s: %w", path, err)
+	}
+	if isBinary(head) {
+		return nil
+	}
+
+	// The chunks sent on out must stay valid after ra is closed, so the
+	// content is copied out of the mapping rather than sliced from it.
+	data := make([]byte, total)
+	if _, err := ra.ReadAt(data, 0); err != nil {
+		return fmt.Errorf("mmap read %s: %w", path, err)
+	}
+	return emitChunks(ctx, data, path, d.ChunkSize, out)
+}
+
+// isBinary reports whether the leading BinarySniffSize bytes contain a NUL byte.
+// Shorter inputs are inspected in full; empty input is never binary.
+func isBinary(data []byte) bool {
+	n := len(data)
+	if n > BinarySniffSize {
+		n = BinarySniffSize
+	}
+	return bytes.IndexByte(data[:n], 0x00) >= 0
+}
+
+// emitChunks is the shared overlapping-chunk emitter used by FileSource and DirSource.
+// A chunkSize <= 0 selects defaultChunkSize. Data that fits in one chunk is
+// sent as a single Chunk with Offset 0. Otherwise successive chunks of up to
+// chunkSize bytes are sent, each starting chunkOverlap bytes before the end
+// of the previous one, and Offset is the byte position in data at which the
+// chunk starts. Returns ctx.Err() if ctx is done before a send completes.
+func emitChunks(ctx context.Context, data []byte, source string, chunkSize int, out chan<- types.Chunk) error {
+	if chunkSize <= 0 {
+		chunkSize = defaultChunkSize
+	}
+	if len(data) <= chunkSize {
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case out <- types.Chunk{Data: data, Source: source, Offset: 0}:
+		}
+		return nil
+	}
+
+	// stride is how far each chunk's start advances. A chunkSize that does
+	// not exceed chunkOverlap would make it zero or negative (an endless
+	// loop or a negative slice index), so such sizes are emitted
+	// back-to-back without overlap.
+	stride := chunkSize - chunkOverlap
+	if stride <= 0 {
+		stride = chunkSize
+	}
+
+	for start := 0; start < len(data); start += stride {
+		end := start + chunkSize
+		if end > len(data) {
+			end = len(data)
+		}
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		// Offset is the chunk's real start position; summing chunk lengths
+		// would drift by chunkOverlap for every chunk after the first.
+		case out <- types.Chunk{Data: data[start:end], Source: source, Offset: int64(start)}:
+		}
+		if end == len(data) {
+			return nil
+		}
+	}
+	return nil
+}
diff --git a/pkg/engine/sources/file.go b/pkg/engine/sources/file.go
index 2bd3498..af1ae29 100644
--- a/pkg/engine/sources/file.go
+++ b/pkg/engine/sources/file.go
@@ -4,6 +4,8 @@ import (
 	"context"
 	"os"

+	"golang.org/x/exp/mmap"
+
 	"github.com/salvacybersec/keyhunter/pkg/types"
 )

@@ -11,6 +13,7 @@ const defaultChunkSize = 4096
 const chunkOverlap = 256 // overlap between chunks to avoid splitting keys at boundaries

 // FileSource reads a single file and emits overlapping chunks.
+// Files of at least MmapThreshold bytes are opened with golang.org/x/exp/mmap and copied into memory.
 type FileSource struct {
 	Path      string
 	ChunkSize int
@@ -22,47 +25,36 @@ func NewFileSource(path string) *FileSource {
 }

 // Chunks reads the file in overlapping segments and sends each chunk to out.
-// Uses os.ReadFile for simplicity in Phase 1. mmap for files > 10MB is implemented
-// in Phase 4 (Input Sources) alongside all other source adapter enhancements.
+// Uses mmap for files >= MmapThreshold (10MB) and os.ReadFile for smaller files.
+// Binary files (NUL byte in the first 512 bytes) are skipped.
func (f *FileSource) Chunks(ctx context.Context, out chan<- types.Chunk) error {
-	data, err := os.ReadFile(f.Path)
+	// Stat first: the size decides whether the file is skipped (empty),
+	// read through mmap, or read with os.ReadFile.
+	fi, err := os.Stat(f.Path)
 	if err != nil {
 		return err
 	}
-	size := f.ChunkSize
-	if size <= 0 {
-		size = defaultChunkSize
-	}
-	if len(data) <= size {
-		// File fits in one chunk
-		select {
-		case <-ctx.Done():
-			return ctx.Err()
-		case out <- types.Chunk{Data: data, Source: f.Path, Offset: 0}:
-		}
+	size := fi.Size()
+	if size == 0 {
 		return nil
 	}
-	// Emit overlapping chunks
-	var offset int64
-	for start := 0; start < len(data); start += size - chunkOverlap {
-		end := start + size
-		if end > len(data) {
-			end = len(data)
+	// At or above MmapThreshold the file is opened with mmap.Open and its
+	// whole content is copied into a newly allocated slice via ReadAt.
+	var data []byte
+	if size >= MmapThreshold {
+		ra, err := mmap.Open(f.Path)
+		if err != nil {
+			return err
 		}
-		chunk := types.Chunk{
-			Data:   data[start:end],
-			Source: f.Path,
-			Offset: offset,
+		defer ra.Close()
+		data = make([]byte, ra.Len())
+		if _, err := ra.ReadAt(data, 0); err != nil {
+			return err
 		}
-		select {
-		case <-ctx.Done():
-			return ctx.Err()
-		case out <- chunk:
-		}
-		offset += int64(end - start)
-		if end == len(data) {
-			break
+	} else {
+		data, err = os.ReadFile(f.Path)
+		if err != nil {
+			return err
 		}
 	}
-	return nil
+	// A file that isBinary flags is skipped without error; everything else
+	// is split and sent by the emitChunks helper that DirSource also uses.
+	if isBinary(data) {
+		return nil
+	}
+	return emitChunks(ctx, data, f.Path, f.ChunkSize, out)
 }