package sources

import (
	"context"
	"os"

	"github.com/salvacybersec/keyhunter/pkg/types"
	"golang.org/x/exp/mmap"
)

const (
	// defaultChunkSize is the chunk size NewFileSource assigns to a FileSource.
	defaultChunkSize = 4096
	// chunkOverlap is the overlap between chunks, used to avoid splitting
	// keys at chunk boundaries.
	chunkOverlap = 256
)

// FileSource reads a single file and emits overlapping chunks.
//
// Files whose size is at least MmapThreshold are opened through
// golang.org/x/exp/mmap and then copied into a byte slice; smaller files are
// read with os.ReadFile. Note that the mmap path is not zero-copy: the
// chunking helper takes a []byte, so the mapped content is copied once.
type FileSource struct {
	// Path is the file to read.
	Path string
	// ChunkSize is passed unchanged to the chunking helper.
	ChunkSize int
}

// NewFileSource creates a FileSource for the given path with the default
// chunk size.
func NewFileSource(path string) *FileSource {
	return &FileSource{Path: path, ChunkSize: defaultChunkSize}
}

// Chunks loads the file at f.Path and hands its content to emitChunks, which
// sends the resulting chunks to out.
//
// Behavior:
//   - An empty file (per os.Stat) produces no chunks and returns nil.
//   - Files of at least MmapThreshold bytes are read through mmap; smaller
//     files are read with os.ReadFile.
//   - Content that isBinary reports as binary is skipped and nil is returned.
//     On the mmap path the head of the file is checked before the full copy,
//     so a large binary file is rejected without allocating a buffer for it.
//
// Errors from os.Stat, mmap.Open, the mmap reads, os.ReadFile and emitChunks
// are returned unmodified.
func (f *FileSource) Chunks(ctx context.Context, out chan<- types.Chunk) error {
	fi, err := os.Stat(f.Path)
	if err != nil {
		return err
	}
	size := fi.Size()
	if size == 0 {
		return nil
	}

	var data []byte
	if size >= MmapThreshold {
		// sniffLen matches the window the binary check is documented to
		// inspect (NUL byte within the first 512 bytes).
		const sniffLen = 512

		ra, err := mmap.Open(f.Path)
		if err != nil {
			return err
		}
		defer ra.Close()

		total := ra.Len()
		headLen := sniffLen
		if total < headLen {
			headLen = total
		}

		// Fast path: inspect only the head so that a large binary file is
		// skipped before its full size is allocated and copied.
		head := make([]byte, headLen)
		if _, err := ra.ReadAt(head, 0); err != nil {
			return err
		}
		if isBinary(head) {
			return nil
		}

		data = make([]byte, total)
		if _, err := ra.ReadAt(data, 0); err != nil {
			return err
		}
	} else {
		data, err = os.ReadFile(f.Path)
		if err != nil {
			return err
		}
	}

	// Kept for both paths so the skip decision is never weaker than before,
	// whatever window isBinary actually inspects.
	if isBinary(data) {
		return nil
	}
	return emitChunks(ctx, data, f.Path, f.ChunkSize, out)
}