package sources import ( "bytes" "context" "errors" "fmt" "io/fs" "os" "path/filepath" "sort" "strings" "golang.org/x/exp/mmap" "github.com/salvacybersec/keyhunter/pkg/types" ) // MmapThreshold is the file size above which DirSource/FileSource use memory-mapped reads. const MmapThreshold int64 = 10 * 1024 * 1024 // 10 MB // BinarySniffSize is the number of leading bytes inspected for a NUL byte // to classify a file as binary and skip it. const BinarySniffSize = 512 // DefaultExcludes are glob patterns excluded from directory scans unless // the caller passes an empty slice explicitly via NewDirSourceRaw. var DefaultExcludes = []string{ ".git/**", "node_modules/**", "vendor/**", "*.min.js", "*.map", } // DirSource walks a directory recursively and emits Chunks for every // non-excluded, non-binary file it finds. Files larger than MmapThreshold // are read via mmap; smaller files use os.ReadFile. type DirSource struct { Root string Excludes []string // glob patterns applied to path basename AND full relative path ChunkSize int } // NewDirSource creates a DirSource with the default exclusions merged // with the caller-supplied extras. func NewDirSource(root string, extraExcludes ...string) *DirSource { merged := make([]string, 0, len(DefaultExcludes)+len(extraExcludes)) merged = append(merged, DefaultExcludes...) merged = append(merged, extraExcludes...) return &DirSource{Root: root, Excludes: merged, ChunkSize: defaultChunkSize} } // NewDirSourceRaw creates a DirSource with ONLY the caller-supplied excludes // (no defaults). Useful for tests and advanced users. func NewDirSourceRaw(root string, excludes []string) *DirSource { return &DirSource{Root: root, Excludes: excludes, ChunkSize: defaultChunkSize} } // Chunks implements Source. It walks d.Root, filters excluded and binary // files, reads each remaining file (via mmap above MmapThreshold), and // emits overlapping chunks through out. func (d *DirSource) Chunks(ctx context.Context, out chan<- types.Chunk) error { if d.Root == "" { return errors.New("DirSource: Root is empty") } info, err := os.Stat(d.Root) if err != nil { return fmt.Errorf("DirSource: stat root: %w", err) } if !info.IsDir() { return fmt.Errorf("DirSource: root %q is not a directory", d.Root) } // Collect paths first for deterministic ordering across runs. var paths []string err = filepath.WalkDir(d.Root, func(path string, de fs.DirEntry, werr error) error { if werr != nil { return werr } if de.IsDir() { if path == d.Root { return nil } rel, _ := filepath.Rel(d.Root, path) if d.isExcluded(rel, de.Name()) { return filepath.SkipDir } return nil } rel, _ := filepath.Rel(d.Root, path) if d.isExcluded(rel, de.Name()) { return nil } paths = append(paths, path) return nil }) if err != nil { return fmt.Errorf("DirSource: walk: %w", err) } sort.Strings(paths) for _, p := range paths { if err := ctx.Err(); err != nil { return err } if err := d.emitFile(ctx, p, out); err != nil { // Per-file errors are non-fatal: continue walking, but respect ctx. if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { return err } // Swallow per-file errors; the engine logs elsewhere. continue } } return nil } // isExcluded returns true if either the relative path or the basename matches // any configured glob pattern. func (d *DirSource) isExcluded(rel, base string) bool { rel = filepath.ToSlash(rel) for _, pat := range d.Excludes { pat = filepath.ToSlash(pat) // Match against basename. if ok, _ := filepath.Match(pat, base); ok { return true } // Match against full relative path. if ok, _ := filepath.Match(pat, rel); ok { return true } // `dir/**` style — naive prefix match against the leading segment. if strings.HasSuffix(pat, "/**") { prefix := strings.TrimSuffix(pat, "/**") if rel == prefix || strings.HasPrefix(rel, prefix+"/") || base == prefix { return true } } } return false } // emitFile reads a single file and pushes its chunks onto out. func (d *DirSource) emitFile(ctx context.Context, path string, out chan<- types.Chunk) error { fi, err := os.Stat(path) if err != nil { return err } size := fi.Size() if size == 0 { return nil } var data []byte if size >= MmapThreshold { ra, err := mmap.Open(path) if err != nil { return fmt.Errorf("mmap open %s: %w", path, err) } defer ra.Close() data = make([]byte, ra.Len()) if _, err := ra.ReadAt(data, 0); err != nil { return fmt.Errorf("mmap read %s: %w", path, err) } } else { data, err = os.ReadFile(path) if err != nil { return err } } if isBinary(data) { return nil } return emitChunks(ctx, data, path, d.ChunkSize, out) } // isBinary reports whether the leading BinarySniffSize bytes contain a NUL byte. func isBinary(data []byte) bool { n := len(data) if n > BinarySniffSize { n = BinarySniffSize } return bytes.IndexByte(data[:n], 0x00) >= 0 } // emitChunks is the shared overlapping-chunk emitter used by FileSource and DirSource. func emitChunks(ctx context.Context, data []byte, source string, chunkSize int, out chan<- types.Chunk) error { if chunkSize <= 0 { chunkSize = defaultChunkSize } if len(data) <= chunkSize { select { case <-ctx.Done(): return ctx.Err() case out <- types.Chunk{Data: data, Source: source, Offset: 0}: } return nil } var offset int64 for start := 0; start < len(data); start += chunkSize - chunkOverlap { end := start + chunkSize if end > len(data) { end = len(data) } select { case <-ctx.Done(): return ctx.Err() case out <- types.Chunk{Data: data[start:end], Source: source, Offset: offset}: } offset += int64(end - start) if end == len(data) { break } } return nil }