- Add DirSource with filepath.WalkDir recursive traversal - Default exclusions for .git, node_modules, vendor, *.min.js, *.map - Binary file detection via NUL byte sniff (first 512 bytes) - mmap reads for files >= 10MB via golang.org/x/exp/mmap - Deterministic sorted emission order for reproducible tests - Refactor FileSource to share emitChunks/isBinary helpers and mmap large files
219 lines
5.7 KiB
Go
package sources
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"io/fs"
|
|
"os"
|
|
"path/filepath"
|
|
"sort"
|
|
"strings"
|
|
|
|
"golang.org/x/exp/mmap"
|
|
|
|
"github.com/salvacybersec/keyhunter/pkg/types"
|
|
)
|
|
|
|
// MmapThreshold is the file size, in bytes, at or above which
// DirSource/FileSource switch from os.ReadFile to memory-mapped reads.
const MmapThreshold int64 = 10 << 20 // 10 MB

// BinarySniffSize is how many leading bytes are scanned for a NUL byte
// when deciding whether a file is binary (binary files are skipped).
const BinarySniffSize = 512

// DefaultExcludes lists the glob patterns that directory scans skip by
// default. NewDirSourceRaw bypasses these entirely; NewDirSource merges
// caller extras on top of them.
var DefaultExcludes = []string{
	".git/**",
	"node_modules/**",
	"vendor/**",
	"*.min.js",
	"*.map",
}
|
|
|
|
// DirSource walks a directory recursively and emits Chunks for every
// non-excluded, non-binary file it finds. Files larger than MmapThreshold
// are read via mmap; smaller files use os.ReadFile.
//
// The zero value is not usable: Chunks returns an error when Root is empty.
// Construct via NewDirSource (default exclusions merged in) or
// NewDirSourceRaw (caller-supplied exclusions only).
type DirSource struct {
	// Root is the directory to walk. It must exist and be a directory.
	Root string
	// Excludes holds glob patterns applied to both the path basename AND
	// the slash-separated path relative to Root (see isExcluded).
	Excludes []string
	// ChunkSize is the emitted chunk length in bytes; non-positive values
	// fall back to defaultChunkSize inside emitChunks.
	ChunkSize int
}
|
|
|
|
// NewDirSource creates a DirSource with the default exclusions merged
|
|
// with the caller-supplied extras.
|
|
func NewDirSource(root string, extraExcludes ...string) *DirSource {
|
|
merged := make([]string, 0, len(DefaultExcludes)+len(extraExcludes))
|
|
merged = append(merged, DefaultExcludes...)
|
|
merged = append(merged, extraExcludes...)
|
|
return &DirSource{Root: root, Excludes: merged, ChunkSize: defaultChunkSize}
|
|
}
|
|
|
|
// NewDirSourceRaw creates a DirSource with ONLY the caller-supplied excludes
|
|
// (no defaults). Useful for tests and advanced users.
|
|
func NewDirSourceRaw(root string, excludes []string) *DirSource {
|
|
return &DirSource{Root: root, Excludes: excludes, ChunkSize: defaultChunkSize}
|
|
}
|
|
|
|
// Chunks implements Source. It walks d.Root, filters excluded and binary
|
|
// files, reads each remaining file (via mmap above MmapThreshold), and
|
|
// emits overlapping chunks through out.
|
|
func (d *DirSource) Chunks(ctx context.Context, out chan<- types.Chunk) error {
|
|
if d.Root == "" {
|
|
return errors.New("DirSource: Root is empty")
|
|
}
|
|
info, err := os.Stat(d.Root)
|
|
if err != nil {
|
|
return fmt.Errorf("DirSource: stat root: %w", err)
|
|
}
|
|
if !info.IsDir() {
|
|
return fmt.Errorf("DirSource: root %q is not a directory", d.Root)
|
|
}
|
|
|
|
// Collect paths first for deterministic ordering across runs.
|
|
var paths []string
|
|
err = filepath.WalkDir(d.Root, func(path string, de fs.DirEntry, werr error) error {
|
|
if werr != nil {
|
|
return werr
|
|
}
|
|
if de.IsDir() {
|
|
if path == d.Root {
|
|
return nil
|
|
}
|
|
rel, _ := filepath.Rel(d.Root, path)
|
|
if d.isExcluded(rel, de.Name()) {
|
|
return filepath.SkipDir
|
|
}
|
|
return nil
|
|
}
|
|
rel, _ := filepath.Rel(d.Root, path)
|
|
if d.isExcluded(rel, de.Name()) {
|
|
return nil
|
|
}
|
|
paths = append(paths, path)
|
|
return nil
|
|
})
|
|
if err != nil {
|
|
return fmt.Errorf("DirSource: walk: %w", err)
|
|
}
|
|
sort.Strings(paths)
|
|
|
|
for _, p := range paths {
|
|
if err := ctx.Err(); err != nil {
|
|
return err
|
|
}
|
|
if err := d.emitFile(ctx, p, out); err != nil {
|
|
// Per-file errors are non-fatal: continue walking, but respect ctx.
|
|
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
|
|
return err
|
|
}
|
|
// Swallow per-file errors; the engine logs elsewhere.
|
|
continue
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// isExcluded returns true if either the relative path or the basename matches
|
|
// any configured glob pattern.
|
|
func (d *DirSource) isExcluded(rel, base string) bool {
|
|
rel = filepath.ToSlash(rel)
|
|
for _, pat := range d.Excludes {
|
|
pat = filepath.ToSlash(pat)
|
|
// Match against basename.
|
|
if ok, _ := filepath.Match(pat, base); ok {
|
|
return true
|
|
}
|
|
// Match against full relative path.
|
|
if ok, _ := filepath.Match(pat, rel); ok {
|
|
return true
|
|
}
|
|
// `dir/**` style — naive prefix match against the leading segment.
|
|
if strings.HasSuffix(pat, "/**") {
|
|
prefix := strings.TrimSuffix(pat, "/**")
|
|
if rel == prefix || strings.HasPrefix(rel, prefix+"/") || base == prefix {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// emitFile reads a single file and pushes its chunks onto out.
|
|
func (d *DirSource) emitFile(ctx context.Context, path string, out chan<- types.Chunk) error {
|
|
fi, err := os.Stat(path)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
size := fi.Size()
|
|
if size == 0 {
|
|
return nil
|
|
}
|
|
|
|
var data []byte
|
|
if size >= MmapThreshold {
|
|
ra, err := mmap.Open(path)
|
|
if err != nil {
|
|
return fmt.Errorf("mmap open %s: %w", path, err)
|
|
}
|
|
defer ra.Close()
|
|
data = make([]byte, ra.Len())
|
|
if _, err := ra.ReadAt(data, 0); err != nil {
|
|
return fmt.Errorf("mmap read %s: %w", path, err)
|
|
}
|
|
} else {
|
|
data, err = os.ReadFile(path)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
if isBinary(data) {
|
|
return nil
|
|
}
|
|
return emitChunks(ctx, data, path, d.ChunkSize, out)
|
|
}
|
|
|
|
// isBinary reports whether the leading BinarySniffSize bytes contain a NUL byte.
|
|
func isBinary(data []byte) bool {
|
|
n := len(data)
|
|
if n > BinarySniffSize {
|
|
n = BinarySniffSize
|
|
}
|
|
return bytes.IndexByte(data[:n], 0x00) >= 0
|
|
}
|
|
|
|
// emitChunks is the shared overlapping-chunk emitter used by FileSource and DirSource.
|
|
func emitChunks(ctx context.Context, data []byte, source string, chunkSize int, out chan<- types.Chunk) error {
|
|
if chunkSize <= 0 {
|
|
chunkSize = defaultChunkSize
|
|
}
|
|
if len(data) <= chunkSize {
|
|
select {
|
|
case <-ctx.Done():
|
|
return ctx.Err()
|
|
case out <- types.Chunk{Data: data, Source: source, Offset: 0}:
|
|
}
|
|
return nil
|
|
}
|
|
var offset int64
|
|
for start := 0; start < len(data); start += chunkSize - chunkOverlap {
|
|
end := start + chunkSize
|
|
if end > len(data) {
|
|
end = len(data)
|
|
}
|
|
select {
|
|
case <-ctx.Done():
|
|
return ctx.Err()
|
|
case out <- types.Chunk{Data: data[start:end], Source: source, Offset: offset}:
|
|
}
|
|
offset += int64(end - start)
|
|
if end == len(data) {
|
|
break
|
|
}
|
|
}
|
|
return nil
|
|
}
|