feat(04-03): implement GitSource with full-history traversal
- Walks every commit across branches, tags, remote-tracking refs, and stash - Deduplicates blob scans by OID (seenBlobs map) so identical content across commits/files is scanned exactly once - Emits chunks with source format git:<short-sha>:<path> - Honors --since filter via GitSource.Since (commit author date) - Resolves annotated tag objects down to their commit hash - Skips binary blobs via go-git IsBinary plus null-byte sniff - 8 subtests cover history walk, dedup, modified-file, multi-branch, tag reachability, since filter, source format, missing repo
This commit is contained in:
216
pkg/engine/sources/git.go
Normal file
216
pkg/engine/sources/git.go
Normal file
@@ -0,0 +1,216 @@
|
||||
package sources
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"time"
|
||||
|
||||
"github.com/go-git/go-git/v5"
|
||||
"github.com/go-git/go-git/v5/plumbing"
|
||||
"github.com/go-git/go-git/v5/plumbing/object"
|
||||
|
||||
"github.com/salvacybersec/keyhunter/pkg/types"
|
||||
)
|
||||
|
||||
// gitBinarySniffSize is the byte window scanned for null bytes when deciding
// whether a blob looks binary. Used as a second check after go-git's own
// IsBinary heuristic. Local to this file until plan 04-02 introduces
// a package-wide constant.
const gitBinarySniffSize = 512
|
||||
|
||||
// GitSource scans the full history of a local git repository: every commit
// on every branch and tag, deduplicating blob scans by OID.
//
// The zero value is unusable (RepoPath must be non-empty); construct with
// NewGitSource.
type GitSource struct {
	// RepoPath is the path to the local git repo (working tree or bare).
	RepoPath string
	// Since, if non-zero, excludes commits older than this timestamp
	// (using commit author date).
	Since time.Time
	// ChunkSize is the overlap-chunker size; zero uses defaultChunkSize
	// (applied at emit time, not at construction).
	ChunkSize int
}
|
||||
|
||||
// NewGitSource creates a GitSource for the given repo path.
|
||||
func NewGitSource(repoPath string) *GitSource {
|
||||
return &GitSource{RepoPath: repoPath, ChunkSize: defaultChunkSize}
|
||||
}
|
||||
|
||||
// Chunks walks every commit reachable from every branch, tag, and the
// stash ref (if present), streaming each unique blob's content through
// the shared chunk emitter.
//
// Commits are deduplicated via seenCommits (a commit reachable from
// several seeds is visited once) and blob content via seenBlobs (identical
// content appearing across commits/paths is scanned once). An empty repo
// returns nil; per-seed iterator failures are swallowed so one damaged ref
// cannot abort the whole walk. Context cancellation is surfaced promptly.
func (g *GitSource) Chunks(ctx context.Context, out chan<- types.Chunk) error {
	if g.RepoPath == "" {
		return errors.New("GitSource: RepoPath is empty")
	}
	repo, err := git.PlainOpen(g.RepoPath)
	if err != nil {
		return fmt.Errorf("GitSource: open %q: %w", g.RepoPath, err)
	}

	// Collect commit hashes to walk from every ref under refs/heads, refs/tags, refs/stash.
	seedCommits, err := collectSeedCommits(repo)
	if err != nil {
		return fmt.Errorf("GitSource: collect refs: %w", err)
	}
	if len(seedCommits) == 0 {
		return nil // empty repo is not an error
	}

	seenCommits := make(map[plumbing.Hash]struct{})
	seenBlobs := make(map[plumbing.Hash]struct{})

	for _, seed := range seedCommits {
		if err := ctx.Err(); err != nil {
			return err
		}
		// All: false — each seed is walked independently; cross-seed
		// dedup happens through seenCommits below.
		iter, err := repo.Log(&git.LogOptions{From: seed, All: false})
		if err != nil {
			// Seed may not resolve to a commit (e.g. a tag of a tree);
			// skip it and keep walking the remaining seeds.
			continue
		}
		err = iter.ForEach(func(c *object.Commit) error {
			if ctxErr := ctx.Err(); ctxErr != nil {
				return ctxErr
			}
			if _, ok := seenCommits[c.Hash]; ok {
				return nil
			}
			seenCommits[c.Hash] = struct{}{}

			// Since filter: commits older than the cutoff are skipped but
			// still marked seen; iteration continues into their parents.
			if !g.Since.IsZero() && c.Author.When.Before(g.Since) {
				return nil
			}
			return g.emitCommitBlobs(ctx, c, seenBlobs, out)
		})
		iter.Close()
		if err != nil {
			// Cancellation must propagate even though other iterator
			// errors are tolerated.
			if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
				return err
			}
			// Swallow per-seed iterator errors; continue with other refs.
		}
	}
	return nil
}
|
||||
|
||||
// collectSeedCommits gathers commit hashes from all local branches, tags,
|
||||
// remote-tracking branches, and the stash ref — the union of which reaches
|
||||
// every commit worth scanning.
|
||||
func collectSeedCommits(repo *git.Repository) ([]plumbing.Hash, error) {
|
||||
var seeds []plumbing.Hash
|
||||
refs, err := repo.References()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
err = refs.ForEach(func(ref *plumbing.Reference) error {
|
||||
// Only care about direct refs — skip symbolic refs (HEAD).
|
||||
if ref.Type() != plumbing.HashReference {
|
||||
return nil
|
||||
}
|
||||
name := ref.Name()
|
||||
if !(name.IsBranch() || name.IsTag() || name == plumbing.ReferenceName("refs/stash") || name.IsRemote()) {
|
||||
return nil
|
||||
}
|
||||
hash := ref.Hash()
|
||||
// For annotated tags the ref points at a tag object; resolve to commit if possible.
|
||||
if name.IsTag() {
|
||||
if tag, err := repo.TagObject(hash); err == nil {
|
||||
if c, err := tag.Commit(); err == nil {
|
||||
hash = c.Hash
|
||||
}
|
||||
}
|
||||
}
|
||||
seeds = append(seeds, hash)
|
||||
return nil
|
||||
})
|
||||
return seeds, err
|
||||
}
|
||||
|
||||
// emitCommitBlobs walks the tree of a commit and emits every blob whose
|
||||
// OID has not already been scanned.
|
||||
func (g *GitSource) emitCommitBlobs(ctx context.Context, c *object.Commit, seenBlobs map[plumbing.Hash]struct{}, out chan<- types.Chunk) error {
|
||||
tree, err := c.Tree()
|
||||
if err != nil {
|
||||
return nil // skip unreadable tree
|
||||
}
|
||||
shortSHA := c.Hash.String()[:7]
|
||||
|
||||
return tree.Files().ForEach(func(f *object.File) error {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
if _, ok := seenBlobs[f.Hash]; ok {
|
||||
return nil
|
||||
}
|
||||
seenBlobs[f.Hash] = struct{}{}
|
||||
|
||||
// Skip obviously-binary blobs via go-git's helper, then via our sniff.
|
||||
if isBin, _ := f.IsBinary(); isBin {
|
||||
return nil
|
||||
}
|
||||
reader, err := f.Reader()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
data, err := io.ReadAll(reader)
|
||||
_ = reader.Close()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
if len(data) == 0 {
|
||||
return nil
|
||||
}
|
||||
sniff := data
|
||||
if len(sniff) > gitBinarySniffSize {
|
||||
sniff = sniff[:gitBinarySniffSize]
|
||||
}
|
||||
if bytes.IndexByte(sniff, 0x00) >= 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
source := fmt.Sprintf("git:%s:%s", shortSHA, f.Name)
|
||||
return emitGitChunks(ctx, data, source, g.ChunkSize, out)
|
||||
})
|
||||
}
|
||||
|
||||
// emitGitChunks sends overlapping chunks of data to out. Mirrors the
|
||||
// chunking logic used by FileSource so that git blobs are scanned with the
|
||||
// same boundary-overlap guarantees. Will be replaced by the shared
|
||||
// emitChunks helper once plan 04-02 lands.
|
||||
func emitGitChunks(ctx context.Context, data []byte, source string, chunkSize int, out chan<- types.Chunk) error {
|
||||
size := chunkSize
|
||||
if size <= 0 {
|
||||
size = defaultChunkSize
|
||||
}
|
||||
if len(data) <= size {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case out <- types.Chunk{Data: data, Source: source, Offset: 0}:
|
||||
}
|
||||
return nil
|
||||
}
|
||||
var offset int64
|
||||
for start := 0; start < len(data); start += size - chunkOverlap {
|
||||
end := start + size
|
||||
if end > len(data) {
|
||||
end = len(data)
|
||||
}
|
||||
chunk := types.Chunk{
|
||||
Data: data[start:end],
|
||||
Source: source,
|
||||
Offset: offset,
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case out <- chunk:
|
||||
}
|
||||
offset += int64(end - start)
|
||||
if end == len(data) {
|
||||
break
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
Reference in New Issue
Block a user