feat(04-02): implement DirSource with recursive walk, glob exclusion, and mmap
- Add DirSource with filepath.WalkDir recursive traversal - Default exclusions for .git, node_modules, vendor, *.min.js, *.map - Binary file detection via NUL byte sniff (first 512 bytes) - mmap reads for files >= 10MB via golang.org/x/exp/mmap - Deterministic sorted emission order for reproducible tests - Refactor FileSource to share emitChunks/isBinary helpers and mmap large files
This commit is contained in:
218
pkg/engine/sources/dir.go
Normal file
218
pkg/engine/sources/dir.go
Normal file
@@ -0,0 +1,218 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/exp/mmap"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/types"
|
||||
)
|
||||
|
||||
// MmapThreshold is the file size above which DirSource/FileSource use memory-mapped reads.
const MmapThreshold int64 = 10 * 1024 * 1024 // 10 MB

// BinarySniffSize is the number of leading bytes inspected for a NUL byte
// to classify a file as binary and skip it.
const BinarySniffSize = 512

// DefaultExcludes are glob patterns excluded from directory scans unless
// the caller passes an empty slice explicitly via NewDirSourceRaw.
// Each pattern is matched against both the basename and the slash-separated
// root-relative path; "dir/**" patterns also prune the whole subtree.
var DefaultExcludes = []string{
	".git/**",
	"node_modules/**",
	"vendor/**",
	"*.min.js",
	"*.map",
}
|
||||
|
||||
// DirSource walks a directory recursively and emits Chunks for every
// non-excluded, non-binary file it finds. Files larger than MmapThreshold
// are read via mmap; smaller files use os.ReadFile.
type DirSource struct {
	Root      string   // directory to walk; must exist and be a directory
	Excludes  []string // glob patterns applied to path basename AND full relative path
	ChunkSize int      // chunk size in bytes; values <= 0 fall back to defaultChunkSize
}
|
||||
|
||||
// NewDirSource creates a DirSource with the default exclusions merged
|
||||
// with the caller-supplied extras.
|
||||
func NewDirSource(root string, extraExcludes ...string) *DirSource {
|
||||
merged := make([]string, 0, len(DefaultExcludes)+len(extraExcludes))
|
||||
merged = append(merged, DefaultExcludes...)
|
||||
merged = append(merged, extraExcludes...)
|
||||
return &DirSource{Root: root, Excludes: merged, ChunkSize: defaultChunkSize}
|
||||
}
|
||||
|
||||
// NewDirSourceRaw creates a DirSource with ONLY the caller-supplied excludes
|
||||
// (no defaults). Useful for tests and advanced users.
|
||||
func NewDirSourceRaw(root string, excludes []string) *DirSource {
|
||||
return &DirSource{Root: root, Excludes: excludes, ChunkSize: defaultChunkSize}
|
||||
}
|
||||
|
||||
// Chunks implements Source. It walks d.Root, filters excluded and binary
|
||||
// files, reads each remaining file (via mmap above MmapThreshold), and
|
||||
// emits overlapping chunks through out.
|
||||
func (d *DirSource) Chunks(ctx context.Context, out chan<- types.Chunk) error {
|
||||
if d.Root == "" {
|
||||
return errors.New("DirSource: Root is empty")
|
||||
}
|
||||
info, err := os.Stat(d.Root)
|
||||
if err != nil {
|
||||
return fmt.Errorf("DirSource: stat root: %w", err)
|
||||
}
|
||||
if !info.IsDir() {
|
||||
return fmt.Errorf("DirSource: root %q is not a directory", d.Root)
|
||||
}
|
||||
|
||||
// Collect paths first for deterministic ordering across runs.
|
||||
var paths []string
|
||||
err = filepath.WalkDir(d.Root, func(path string, de fs.DirEntry, werr error) error {
|
||||
if werr != nil {
|
||||
return werr
|
||||
}
|
||||
if de.IsDir() {
|
||||
if path == d.Root {
|
||||
return nil
|
||||
}
|
||||
rel, _ := filepath.Rel(d.Root, path)
|
||||
if d.isExcluded(rel, de.Name()) {
|
||||
return filepath.SkipDir
|
||||
}
|
||||
return nil
|
||||
}
|
||||
rel, _ := filepath.Rel(d.Root, path)
|
||||
if d.isExcluded(rel, de.Name()) {
|
||||
return nil
|
||||
}
|
||||
paths = append(paths, path)
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("DirSource: walk: %w", err)
|
||||
}
|
||||
sort.Strings(paths)
|
||||
|
||||
for _, p := range paths {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := d.emitFile(ctx, p, out); err != nil {
|
||||
// Per-file errors are non-fatal: continue walking, but respect ctx.
|
||||
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
|
||||
return err
|
||||
}
|
||||
// Swallow per-file errors; the engine logs elsewhere.
|
||||
continue
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// isExcluded returns true if either the relative path or the basename matches
|
||||
// any configured glob pattern.
|
||||
func (d *DirSource) isExcluded(rel, base string) bool {
|
||||
rel = filepath.ToSlash(rel)
|
||||
for _, pat := range d.Excludes {
|
||||
pat = filepath.ToSlash(pat)
|
||||
// Match against basename.
|
||||
if ok, _ := filepath.Match(pat, base); ok {
|
||||
return true
|
||||
}
|
||||
// Match against full relative path.
|
||||
if ok, _ := filepath.Match(pat, rel); ok {
|
||||
return true
|
||||
}
|
||||
// `dir/**` style — naive prefix match against the leading segment.
|
||||
if strings.HasSuffix(pat, "/**") {
|
||||
prefix := strings.TrimSuffix(pat, "/**")
|
||||
if rel == prefix || strings.HasPrefix(rel, prefix+"/") || base == prefix {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// emitFile reads a single file and pushes its chunks onto out.
|
||||
func (d *DirSource) emitFile(ctx context.Context, path string, out chan<- types.Chunk) error {
|
||||
fi, err := os.Stat(path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
size := fi.Size()
|
||||
if size == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
var data []byte
|
||||
if size >= MmapThreshold {
|
||||
ra, err := mmap.Open(path)
|
||||
if err != nil {
|
||||
return fmt.Errorf("mmap open %s: %w", path, err)
|
||||
}
|
||||
defer ra.Close()
|
||||
data = make([]byte, ra.Len())
|
||||
if _, err := ra.ReadAt(data, 0); err != nil {
|
||||
return fmt.Errorf("mmap read %s: %w", path, err)
|
||||
}
|
||||
} else {
|
||||
data, err = os.ReadFile(path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
if isBinary(data) {
|
||||
return nil
|
||||
}
|
||||
return emitChunks(ctx, data, path, d.ChunkSize, out)
|
||||
}
|
||||
|
||||
// isBinary reports whether the leading BinarySniffSize bytes contain a NUL byte.
|
||||
func isBinary(data []byte) bool {
|
||||
n := len(data)
|
||||
if n > BinarySniffSize {
|
||||
n = BinarySniffSize
|
||||
}
|
||||
return bytes.IndexByte(data[:n], 0x00) >= 0
|
||||
}
|
||||
|
||||
// emitChunks is the shared overlapping-chunk emitter used by FileSource and DirSource.
|
||||
func emitChunks(ctx context.Context, data []byte, source string, chunkSize int, out chan<- types.Chunk) error {
|
||||
if chunkSize <= 0 {
|
||||
chunkSize = defaultChunkSize
|
||||
}
|
||||
if len(data) <= chunkSize {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case out <- types.Chunk{Data: data, Source: source, Offset: 0}:
|
||||
}
|
||||
return nil
|
||||
}
|
||||
var offset int64
|
||||
for start := 0; start < len(data); start += chunkSize - chunkOverlap {
|
||||
end := start + chunkSize
|
||||
if end > len(data) {
|
||||
end = len(data)
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case out <- types.Chunk{Data: data[start:end], Source: source, Offset: offset}:
|
||||
}
|
||||
offset += int64(end - start)
|
||||
if end == len(data) {
|
||||
break
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -4,6 +4,8 @@ import (
|
||||
"context"
|
||||
"os"
|
||||
|
||||
"golang.org/x/exp/mmap"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/types"
|
||||
)
|
||||
|
||||
@@ -11,6 +13,7 @@ const defaultChunkSize = 4096
|
||||
const chunkOverlap = 256 // overlap between chunks to avoid splitting keys at boundaries
|
||||
|
||||
// FileSource reads a single file and emits overlapping chunks.
|
||||
// For files >= MmapThreshold it uses golang.org/x/exp/mmap for zero-copy reads.
|
||||
type FileSource struct {
|
||||
Path string
|
||||
ChunkSize int
|
||||
@@ -22,47 +25,36 @@ func NewFileSource(path string) *FileSource {
|
||||
}
|
||||
|
||||
// Chunks reads the file in overlapping segments and sends each chunk to out.
|
||||
// Uses os.ReadFile for simplicity in Phase 1. mmap for files > 10MB is implemented
|
||||
// in Phase 4 (Input Sources) alongside all other source adapter enhancements.
|
||||
// Uses mmap for files >= MmapThreshold (10MB) and os.ReadFile for smaller files.
|
||||
// Binary files (NUL byte in the first 512 bytes) are skipped.
|
||||
func (f *FileSource) Chunks(ctx context.Context, out chan<- types.Chunk) error {
|
||||
data, err := os.ReadFile(f.Path)
|
||||
fi, err := os.Stat(f.Path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
size := f.ChunkSize
|
||||
if size <= 0 {
|
||||
size = defaultChunkSize
|
||||
}
|
||||
if len(data) <= size {
|
||||
// File fits in one chunk
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case out <- types.Chunk{Data: data, Source: f.Path, Offset: 0}:
|
||||
}
|
||||
size := fi.Size()
|
||||
if size == 0 {
|
||||
return nil
|
||||
}
|
||||
// Emit overlapping chunks
|
||||
var offset int64
|
||||
for start := 0; start < len(data); start += size - chunkOverlap {
|
||||
end := start + size
|
||||
if end > len(data) {
|
||||
end = len(data)
|
||||
var data []byte
|
||||
if size >= MmapThreshold {
|
||||
ra, err := mmap.Open(f.Path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
chunk := types.Chunk{
|
||||
Data: data[start:end],
|
||||
Source: f.Path,
|
||||
Offset: offset,
|
||||
defer ra.Close()
|
||||
data = make([]byte, ra.Len())
|
||||
if _, err := ra.ReadAt(data, 0); err != nil {
|
||||
return err
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case out <- chunk:
|
||||
}
|
||||
offset += int64(end - start)
|
||||
if end == len(data) {
|
||||
break
|
||||
} else {
|
||||
data, err = os.ReadFile(f.Path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
if isBinary(data) {
|
||||
return nil
|
||||
}
|
||||
return emitChunks(ctx, data, f.Path, f.ChunkSize, out)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user