Files
salvacybersec 6f834c9c06 feat(04-02): implement DirSource with recursive walk, glob exclusion, and mmap
- Add DirSource with filepath.WalkDir recursive traversal
- Default exclusions for .git, node_modules, vendor, *.min.js, *.map
- Binary file detection via NUL byte sniff (first 512 bytes)
- mmap reads for files >= 10MB via golang.org/x/exp/mmap
- Deterministic sorted emission order for reproducible tests
- Refactor FileSource to share emitChunks/isBinary helpers and mmap large files
2026-04-05 15:18:10 +03:00

219 lines
5.7 KiB
Go

package sources
import (
"bytes"
"context"
"errors"
"fmt"
"io/fs"
"os"
"path/filepath"
"sort"
"strings"
"golang.org/x/exp/mmap"
"github.com/salvacybersec/keyhunter/pkg/types"
)
// Size limits that steer how file contents are read and classified.
const (
	// MmapThreshold is the file size, in bytes, at or above which
	// DirSource/FileSource switch to memory-mapped reads.
	MmapThreshold int64 = 10 << 20 // 10 MiB

	// BinarySniffSize is how many leading bytes are inspected for a NUL
	// byte when deciding whether a file is binary and should be skipped.
	BinarySniffSize = 512
)
// DefaultExcludes are the glob patterns that NewDirSource always applies,
// ahead of any caller-supplied extras. NewDirSourceRaw does not add them, so
// callers that want no default exclusions should construct the source with it.
var DefaultExcludes = []string{
// Directory trees: isExcluded treats a trailing "/**" as "the named
// directory and everything beneath it", and also matches the name against
// the basename of the entry being tested.
".git/**",
"node_modules/**",
"vendor/**",
// File globs: matched against both the basename and the slash-separated
// path relative to the scan root.
"*.min.js",
"*.map",
}
// DirSource walks a directory recursively and emits Chunks for every
// non-excluded, non-binary file it finds. Files whose size is at or above
// MmapThreshold are read via mmap; smaller files use os.ReadFile.
type DirSource struct {
// Root is the directory to scan. Chunks rejects an empty Root and any
// path that is not a directory.
Root string
Excludes []string // glob patterns applied to path basename AND full relative path
// ChunkSize is the chunk length handed to emitChunks, which substitutes
// defaultChunkSize when the value is zero or negative.
ChunkSize int
}
// NewDirSource builds a DirSource rooted at root whose exclusion list is
// DefaultExcludes followed by extraExcludes. The list is a fresh slice, so
// later changes to it never touch DefaultExcludes or the caller's arguments.
// ChunkSize is initialised to defaultChunkSize.
func NewDirSource(root string, extraExcludes ...string) *DirSource {
	total := len(DefaultExcludes) + len(extraExcludes)
	patterns := make([]string, len(DefaultExcludes), total)
	copy(patterns, DefaultExcludes)
	patterns = append(patterns, extraExcludes...)

	return &DirSource{
		Root:      root,
		Excludes:  patterns,
		ChunkSize: defaultChunkSize,
	}
}
// NewDirSourceRaw builds a DirSource rooted at root that uses exactly the
// given excludes and none of the defaults. The slice is stored as passed
// (not copied). ChunkSize is initialised to defaultChunkSize. Intended for
// tests and advanced users.
func NewDirSourceRaw(root string, excludes []string) *DirSource {
	src := new(DirSource)
	src.Root = root
	src.Excludes = excludes
	src.ChunkSize = defaultChunkSize
	return src
}
// Chunks implements Source. It validates d.Root, walks it recursively while
// pruning excluded directories and skipping excluded files, then emits the
// chunks of every remaining file through out in sorted path order.
//
// It returns an error when Root is empty, cannot be stat'ed, is not a
// directory, when the walk itself reports an error, or when ctx is cancelled
// or times out. Any other error from reading an individual file is dropped
// and the scan moves on to the next file.
func (d *DirSource) Chunks(ctx context.Context, out chan<- types.Chunk) error {
	if d.Root == "" {
		return errors.New("DirSource: Root is empty")
	}
	rootInfo, err := os.Stat(d.Root)
	switch {
	case err != nil:
		return fmt.Errorf("DirSource: stat root: %w", err)
	case !rootInfo.IsDir():
		return fmt.Errorf("DirSource: root %q is not a directory", d.Root)
	}

	// Gather every candidate first so emission order is reproducible.
	var files []string
	visit := func(p string, entry fs.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		isDir := entry.IsDir()
		if isDir && p == d.Root {
			// The root itself is never subject to exclusion.
			return nil
		}
		rel, _ := filepath.Rel(d.Root, p)
		excluded := d.isExcluded(rel, entry.Name())
		switch {
		case isDir && excluded:
			// Prune the whole subtree.
			return filepath.SkipDir
		case isDir || excluded:
			return nil
		}
		files = append(files, p)
		return nil
	}
	if walkErr := filepath.WalkDir(d.Root, visit); walkErr != nil {
		return fmt.Errorf("DirSource: walk: %w", walkErr)
	}
	sort.Strings(files)

	for _, file := range files {
		if ctxErr := ctx.Err(); ctxErr != nil {
			return ctxErr
		}
		fileErr := d.emitFile(ctx, file, out)
		// Only cancellation/deadline errors stop the scan; every other
		// per-file failure is swallowed (the engine logs elsewhere).
		if errors.Is(fileErr, context.Canceled) || errors.Is(fileErr, context.DeadlineExceeded) {
			return fileErr
		}
	}
	return nil
}
// isExcluded reports whether an entry should be skipped. rel is the entry's
// path relative to the scan root and base is its final path element. Both
// rel and each pattern are converted to forward slashes before comparison.
//
// A pattern excludes the entry when it glob-matches base, glob-matches rel,
// or has the form "dir/**" and rel is dir, rel lies under dir/, or base
// equals dir. Malformed patterns never match.
func (d *DirSource) isExcluded(rel, base string) bool {
	relSlash := filepath.ToSlash(rel)
	for _, raw := range d.Excludes {
		pattern := filepath.ToSlash(raw)

		// Plain glob: try the basename first, then the full relative path.
		for _, candidate := range [...]string{base, relSlash} {
			if hit, _ := filepath.Match(pattern, candidate); hit {
				return true
			}
		}

		// `dir/**` style — naive prefix match against the leading segment.
		if !strings.HasSuffix(pattern, "/**") {
			continue
		}
		dir := strings.TrimSuffix(pattern, "/**")
		switch {
		case relSlash == dir, base == dir, strings.HasPrefix(relSlash, dir+"/"):
			return true
		}
	}
	return false
}
// emitFile reads the file at path and pushes its chunks onto out.
//
// Empty files and files classified as binary by isBinary produce no chunks
// and no error. Files smaller than MmapThreshold are read with os.ReadFile;
// larger ones are opened via mmap. For the mmap path the leading
// BinarySniffSize bytes are inspected before the full contents are copied,
// so a large binary file is rejected without allocating a buffer of its
// whole size.
//
// It returns any stat/open/read error, or the error from emitChunks (which
// includes ctx cancellation).
func (d *DirSource) emitFile(ctx context.Context, path string, out chan<- types.Chunk) error {
	fi, err := os.Stat(path)
	if err != nil {
		return err
	}
	size := fi.Size()
	if size == 0 {
		return nil
	}

	if size < MmapThreshold {
		data, err := os.ReadFile(path)
		if err != nil {
			return err
		}
		if isBinary(data) {
			return nil
		}
		return emitChunks(ctx, data, path, d.ChunkSize, out)
	}

	ra, err := mmap.Open(path)
	if err != nil {
		return fmt.Errorf("mmap open %s: %w", path, err)
	}
	defer ra.Close()

	// Use the mapped length rather than the stat size: it is what ReadAt
	// can actually serve if the file changed between Stat and Open.
	total := ra.Len()

	// Sniff the head first so binaries are dropped before the big copy.
	headLen := total
	if headLen > BinarySniffSize {
		headLen = BinarySniffSize
	}
	head := make([]byte, headLen)
	if _, err := ra.ReadAt(head, 0); err != nil {
		return fmt.Errorf("mmap read %s: %w", path, err)
	}
	if isBinary(head) {
		return nil
	}

	// The chunks outlive the mapping (it is closed on return), so the
	// contents are copied into an ordinary slice before being emitted.
	data := make([]byte, total)
	if _, err := ra.ReadAt(data, 0); err != nil {
		return fmt.Errorf("mmap read %s: %w", path, err)
	}
	return emitChunks(ctx, data, path, d.ChunkSize, out)
}
// isBinary reports whether data looks binary, defined as containing a NUL
// byte within its first BinarySniffSize bytes. NUL bytes past that window are
// ignored, and empty input is never binary.
func isBinary(data []byte) bool {
	head := data
	if len(head) > BinarySniffSize {
		head = head[:BinarySniffSize]
	}
	return bytes.IndexByte(head, 0x00) != -1
}
// emitChunks is the shared overlapping-chunk emitter used by FileSource and
// DirSource. It slices data into windows of at most chunkSize bytes and sends
// each one on out tagged with source.
//
// A chunkSize of zero or less falls back to defaultChunkSize. Data that fits
// in a single chunk is sent as one chunk at offset 0. Otherwise consecutive
// windows start chunkSize-chunkOverlap bytes apart, so each window repeats
// the tail of the previous one. Chunk.Offset is the byte position in data at
// which that chunk begins.
//
// It returns ctx.Err() if ctx is done before a chunk can be delivered, and
// nil once every chunk has been sent.
func emitChunks(ctx context.Context, data []byte, source string, chunkSize int, out chan<- types.Chunk) error {
	if chunkSize <= 0 {
		chunkSize = defaultChunkSize
	}

	// send delivers one chunk, giving up as soon as ctx is done.
	send := func(c types.Chunk) error {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case out <- c:
			return nil
		}
	}

	if len(data) <= chunkSize {
		return send(types.Chunk{Data: data, Source: source, Offset: 0})
	}

	// A chunk size no larger than the overlap would give a zero or negative
	// stride and the loop would never advance (or would index before the
	// start of data); fall back to back-to-back chunks in that case.
	step := chunkSize - chunkOverlap
	if step <= 0 {
		step = chunkSize
	}

	for start := 0; start < len(data); start += step {
		end := start + chunkSize
		if end > len(data) {
			end = len(data)
		}
		// Offset is the window's real start position. Summing chunk lengths
		// would overshoot by the overlap on every chunk after the first.
		chunk := types.Chunk{Data: data[start:end], Source: source, Offset: int64(start)}
		if err := send(chunk); err != nil {
			return err
		}
		if end == len(data) {
			// The final window reached the end; a further step would only
			// re-emit bytes that were already covered.
			break
		}
	}
	return nil
}