// Package sources provides input sources that stream scan chunks.
package sources

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"io"
	"time"

	"github.com/go-git/go-git/v5"
	"github.com/go-git/go-git/v5/plumbing"
	"github.com/go-git/go-git/v5/plumbing/object"

	"github.com/salvacybersec/keyhunter/pkg/types"
)

// gitBinarySniffSize is the byte window scanned for null bytes when deciding
// whether a blob looks binary. Local to this file until plan 04-02 introduces
// a package-wide constant.
const gitBinarySniffSize = 512

// GitSource scans the full history of a local git repository: every commit
// on every branch and tag, deduplicating blob scans by OID.
type GitSource struct {
	// RepoPath is the path to the local git repo (working tree or bare).
	RepoPath string
	// Since, if non-zero, excludes commits older than this timestamp
	// (using commit author date).
	Since time.Time
	// ChunkSize is the overlap-chunker size; zero uses defaultChunkSize.
	ChunkSize int
}

// NewGitSource creates a GitSource for the given repo path with the
// default chunk size. Since is left zero (no age cutoff).
func NewGitSource(repoPath string) *GitSource {
	return &GitSource{RepoPath: repoPath, ChunkSize: defaultChunkSize}
}

// Chunks walks every commit reachable from every branch, tag, and the
// stash ref (if present), streaming each unique blob's content through
// the shared chunk emitter. It returns nil for an empty repository and
// a non-nil error only for an unopenable repo, unreadable refs, or
// context cancellation/deadline.
func (g *GitSource) Chunks(ctx context.Context, out chan<- types.Chunk) error {
	if g.RepoPath == "" {
		return errors.New("GitSource: RepoPath is empty")
	}
	repo, err := git.PlainOpen(g.RepoPath)
	if err != nil {
		return fmt.Errorf("GitSource: open %q: %w", g.RepoPath, err)
	}
	// Collect commit hashes to walk from every ref under refs/heads,
	// refs/tags, refs/remotes, and refs/stash.
	seedCommits, err := collectSeedCommits(repo)
	if err != nil {
		return fmt.Errorf("GitSource: collect refs: %w", err)
	}
	if len(seedCommits) == 0 {
		return nil // empty repo is not an error
	}
	// Dedup sets shared across all seeds: each commit is visited once and
	// each blob OID is scanned once, regardless of how many refs reach it.
	seenCommits := make(map[plumbing.Hash]struct{})
	seenBlobs := make(map[plumbing.Hash]struct{})
	for _, seed := range seedCommits {
		if err := ctx.Err(); err != nil {
			return err
		}
		// All: false — we walk only from this seed; the union over all
		// seeds plus the seenCommits set covers every reachable commit.
		iter, err := repo.Log(&git.LogOptions{From: seed, All: false})
		if err != nil {
			continue
		}
		err = iter.ForEach(func(c *object.Commit) error {
			if ctxErr := ctx.Err(); ctxErr != nil {
				return ctxErr
			}
			if _, ok := seenCommits[c.Hash]; ok {
				return nil
			}
			seenCommits[c.Hash] = struct{}{}
			// Age filter skips this commit but keeps walking: ancestors
			// reached through other paths may still be within range.
			if !g.Since.IsZero() && c.Author.When.Before(g.Since) {
				return nil
			}
			return g.emitCommitBlobs(ctx, c, seenBlobs, out)
		})
		iter.Close()
		if err != nil {
			// Context errors must propagate so the caller stops promptly.
			if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
				return err
			}
			// Swallow per-seed iterator errors; continue with other refs.
		}
	}
	return nil
}

// collectSeedCommits gathers commit hashes from all local branches, tags,
// remote-tracking branches, and the stash ref — the union of which reaches
// every commit worth scanning. Symbolic refs (e.g. HEAD) are skipped; an
// annotated tag is resolved to its target commit when possible.
func collectSeedCommits(repo *git.Repository) ([]plumbing.Hash, error) {
	var seeds []plumbing.Hash
	refs, err := repo.References()
	if err != nil {
		return nil, err
	}
	err = refs.ForEach(func(ref *plumbing.Reference) error {
		// Only care about direct refs — skip symbolic refs (HEAD).
		if ref.Type() != plumbing.HashReference {
			return nil
		}
		name := ref.Name()
		if !(name.IsBranch() || name.IsTag() || name == plumbing.ReferenceName("refs/stash") || name.IsRemote()) {
			return nil
		}
		hash := ref.Hash()
		// For annotated tags the ref points at a tag object; resolve to
		// the tagged commit if possible. On failure the raw hash is kept
		// (a lightweight tag already points directly at a commit).
		if name.IsTag() {
			if tag, err := repo.TagObject(hash); err == nil {
				if c, err := tag.Commit(); err == nil {
					hash = c.Hash
				}
			}
		}
		seeds = append(seeds, hash)
		return nil
	})
	return seeds, err
}

// emitCommitBlobs walks the tree of a commit and emits every blob whose
// OID has not already been scanned. Unreadable trees/blobs, empty blobs,
// and binary-looking blobs are skipped silently; only context errors and
// send failures propagate.
func (g *GitSource) emitCommitBlobs(ctx context.Context, c *object.Commit, seenBlobs map[plumbing.Hash]struct{}, out chan<- types.Chunk) error {
	tree, err := c.Tree()
	if err != nil {
		return nil // skip unreadable tree
	}
	// Abbreviated commit SHA used in the chunk's Source label.
	shortSHA := c.Hash.String()[:7]
	return tree.Files().ForEach(func(f *object.File) error {
		if err := ctx.Err(); err != nil {
			return err
		}
		// Dedup by blob OID: identical content across commits/paths is
		// scanned only once for the whole walk.
		if _, ok := seenBlobs[f.Hash]; ok {
			return nil
		}
		seenBlobs[f.Hash] = struct{}{}
		// Skip obviously-binary blobs via go-git's helper, then via our sniff.
		if isBin, _ := f.IsBinary(); isBin {
			return nil
		}
		reader, err := f.Reader()
		if err != nil {
			return nil
		}
		data, err := io.ReadAll(reader)
		_ = reader.Close() // best-effort close; read error below decides
		if err != nil {
			return nil
		}
		if len(data) == 0 {
			return nil
		}
		// Second binary check: a null byte in the first gitBinarySniffSize
		// bytes marks the blob as binary.
		sniff := data
		if len(sniff) > gitBinarySniffSize {
			sniff = sniff[:gitBinarySniffSize]
		}
		if bytes.IndexByte(sniff, 0x00) >= 0 {
			return nil
		}
		source := fmt.Sprintf("git:%s:%s", shortSHA, f.Name)
		return emitGitChunks(ctx, data, source, g.ChunkSize, out)
	})
}

// emitGitChunks sends overlapping chunks of data to out. Mirrors the
// chunking logic used by FileSource so that git blobs are scanned with the
// same boundary-overlap guarantees. Will be replaced by the shared
// emitChunks helper once plan 04-02 lands.
func emitGitChunks(ctx context.Context, data []byte, source string, chunkSize int, out chan<- types.Chunk) error { size := chunkSize if size <= 0 { size = defaultChunkSize } if len(data) <= size { select { case <-ctx.Done(): return ctx.Err() case out <- types.Chunk{Data: data, Source: source, Offset: 0}: } return nil } var offset int64 for start := 0; start < len(data); start += size - chunkOverlap { end := start + size if end > len(data) { end = len(data) } chunk := types.Chunk{ Data: data[start:end], Source: source, Offset: offset, } select { case <-ctx.Done(): return ctx.Err() case out <- chunk: } offset += int64(end - start) if end == len(data) { break } } return nil }