14 KiB
phase, plan, type, wave, depends_on, files_modified, autonomous, requirements, must_haves
| phase | plan | type | wave | depends_on | files_modified | autonomous | requirements | must_haves | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 04-input-sources | 03 | execute | 1 |
|
|
true |
|
|
Purpose: Leaked keys often exist only in git history — deleted from HEAD but still reachable via old commits. A one-shot HEAD scan misses them. This source walks the full commit graph using go-git/v5 with blob-level deduplication so a 10k-commit repo with 200k historical files scans in minutes, not hours.
Output: pkg/engine/sources/git.go and git_test.go. Wired into CLI in plan 04-05.
<execution_context> @$HOME/.claude/get-shit-done/workflows/execute-plan.md @$HOME/.claude/get-shit-done/templates/summary.md </execution_context>
@.planning/PROJECT.md @.planning/phases/04-input-sources/04-CONTEXT.md @pkg/engine/sources/source.go @pkg/types/chunk.go

Source interface:

```go
type Source interface {
	Chunks(ctx context.Context, out chan<- types.Chunk) error
}
```

Chunk struct:
// Chunk is the unit a Source sends on its output channel.
type Chunk struct {
// Data holds the bytes carried by this chunk.
Data []byte
// Source labels where the bytes came from; for the git source it
// will be "git:<shortSHA>:<path>".
Source string // will be "git:<shortSHA>:<path>"
// Offset is presumably the position of Data within the original
// content — TODO confirm against pkg/types/chunk.go.
Offset int64
}
Relevant go-git/v5 APIs (from https://pkg.go.dev/github.com/go-git/go-git/v5):
import "github.com/go-git/go-git/v5"
import "github.com/go-git/go-git/v5/plumbing"
import "github.com/go-git/go-git/v5/plumbing/object"
repo, err := git.PlainOpen(path) // opens local repo
refs, err := repo.References() // iterator over refs
refs.ForEach(func(*plumbing.Reference) error { }) // walk refs
commit, err := repo.CommitObject(hash) // resolve commit
iter, err := repo.Log(&git.LogOptions{From: hash, All: false})
iter.ForEach(func(*object.Commit) error { }) // walk commits
tree, err := commit.Tree()
tree.Files().ForEach(func(*object.File) error { }) // walk blobs
file.Contents() // returns (string, error)
file.IsBinary() // (bool, error)
file.Hash // plumbing.Hash (blob OID)
emitChunks helper from 04-02 plan (pkg/engine/sources/dir.go) — reuse:
func emitChunks(ctx context.Context, data []byte, source string, chunkSize int, out chan<- types.Chunk) error
package sources
import (
"bytes"
"context"
"errors"
"fmt"
"io"
"time"
"github.com/go-git/go-git/v5"
"github.com/go-git/go-git/v5/plumbing"
"github.com/go-git/go-git/v5/plumbing/object"
"github.com/salvacybersec/keyhunter/pkg/types"
)
// GitSource scans the full history of a local git repository: every commit
// reachable from the refs gathered by collectSeedCommits (branches, tags,
// remote-tracking refs and refs/stash), deduplicating blob scans by OID.
type GitSource struct {
// RepoPath is the path to the local git repo (working tree or bare).
// Chunks returns an error when it is empty or cannot be opened.
RepoPath string
// Since, if non-zero, excludes commits older than this timestamp
// (using commit author date). Excluded commits are skipped, not treated
// as a stopping point: the walk continues past them.
Since time.Time
// ChunkSize is the overlap-chunker size; zero uses defaultChunkSize.
// NOTE(review): Chunks forwards this value to emitChunks unchanged, so the
// zero-value fallback has to live in emitChunks — confirm.
ChunkSize int
}
// NewGitSource returns a GitSource rooted at repoPath with ChunkSize preset
// to defaultChunkSize. Since is left at its zero value, so no commits are
// filtered out by date.
func NewGitSource(repoPath string) *GitSource {
	src := new(GitSource)
	src.RepoPath = repoPath
	src.ChunkSize = defaultChunkSize
	return src
}
// Chunks walks every commit reachable from every branch, tag, remote-tracking
// ref, and the stash ref (if present), streaming each unique blob's content
// through the shared emitChunks helper.
//
// It returns an error when RepoPath is empty, the repository cannot be
// opened, its refs cannot be listed, or ctx is cancelled or times out.
// Failures confined to one ref's history are skipped on a best-effort basis
// so a single damaged ref does not abort the whole scan. A repository that
// yields no seed refs produces no chunks and a nil error.
func (g *GitSource) Chunks(ctx context.Context, out chan<- types.Chunk) error {
	if g.RepoPath == "" {
		return errors.New("GitSource: RepoPath is empty")
	}
	repo, err := git.PlainOpen(g.RepoPath)
	if err != nil {
		return fmt.Errorf("GitSource: open %q: %w", g.RepoPath, err)
	}

	// Collect the hashes to walk from: branches, tags, remote-tracking refs
	// and refs/stash.
	seedCommits, err := collectSeedCommits(repo)
	if err != nil {
		return fmt.Errorf("GitSource: collect refs: %w", err)
	}
	if len(seedCommits) == 0 {
		return nil // empty repo is not an error
	}

	seenCommits := make(map[plumbing.Hash]struct{})
	seenBlobs := make(map[plumbing.Hash]struct{})

	// walkAborted records that some earlier walk stopped before reaching the
	// end of its history. While it is false, a commit in seenCommits implies
	// all of its ancestors are in seenCommits too, so a seed that is already
	// seen cannot contribute anything new and its walk can be skipped. This
	// avoids re-traversing shared history once per ref. After an aborted
	// walk that guarantee no longer holds, so every remaining seed is walked
	// in full.
	walkAborted := false

	for _, seed := range seedCommits {
		if err := ctx.Err(); err != nil {
			return err
		}
		if _, done := seenCommits[seed]; done && !walkAborted {
			continue
		}

		iter, err := repo.Log(&git.LogOptions{From: seed, All: false})
		if err != nil {
			// Seed does not resolve to a walkable commit; try the next ref.
			continue
		}
		err = iter.ForEach(func(c *object.Commit) error {
			if ctxErr := ctx.Err(); ctxErr != nil {
				return ctxErr
			}
			if _, ok := seenCommits[c.Hash]; ok {
				return nil
			}
			seenCommits[c.Hash] = struct{}{}
			// Too-old commits are skipped but the walk continues: author
			// dates are not guaranteed to decrease along the history.
			if !g.Since.IsZero() && c.Author.When.Before(g.Since) {
				return nil
			}
			return g.emitCommitBlobs(ctx, c, seenBlobs, out)
		})
		iter.Close()
		if err != nil {
			if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
				return err
			}
			// Swallow per-seed iterator errors; continue with other refs.
			walkAborted = true
		}
	}
	return nil
}
// collectSeedCommits gathers the starting hashes for the history walk from
// every branch, tag, remote-tracking ref, and refs/stash.
//
// Annotated tags are peeled to the commit they point at; when peeling fails
// the ref's own hash is kept and left for the caller to reject. Symbolic refs
// (for example refs/remotes/origin/HEAD) are skipped: they carry no hash of
// their own, and the ref they point at is picked up on its own when it
// matches the filter. Each hash is returned once, in the order first
// encountered. On error no seeds are returned.
func collectSeedCommits(repo *git.Repository) ([]plumbing.Hash, error) {
	refs, err := repo.References()
	if err != nil {
		return nil, err
	}

	var seeds []plumbing.Hash
	added := make(map[plumbing.Hash]struct{})

	err = refs.ForEach(func(ref *plumbing.Reference) error {
		// Only hash refs have a usable Hash(); a symbolic ref would
		// contribute the zero hash, which repo.Log treats as "start from
		// HEAD" rather than as the ref that was actually listed.
		if ref.Type() != plumbing.HashReference {
			return nil
		}
		name := ref.Name()
		if !(name.IsBranch() || name.IsTag() || name == plumbing.ReferenceName("refs/stash") || name.IsRemote()) {
			return nil
		}

		hash := ref.Hash()
		// For annotated tags the ref points at a tag object; resolve to commit if possible.
		if name.IsTag() {
			if tag, err := repo.TagObject(hash); err == nil {
				if c, err := tag.Commit(); err == nil {
					hash = c.Hash
				}
			}
		}

		// Several refs commonly point at the same commit; one seed is enough.
		if _, dup := added[hash]; dup {
			return nil
		}
		added[hash] = struct{}{}
		seeds = append(seeds, hash)
		return nil
	})
	if err != nil {
		return nil, err
	}
	return seeds, nil
}
// emitCommitBlobs iterates over the files in commit c's tree and forwards
// each blob not yet recorded in seenBlobs to emitChunks, labelled
// "git:<7-char commit hash>:<path>".
//
// A blob is recorded in seenBlobs before it is read, so it is never retried.
// Unreadable trees, unreadable blobs, empty blobs and blobs judged binary
// (by go-git's check or by a NUL byte in the first BinarySniffSize bytes) are
// skipped silently. The returned error is whatever the tree iteration
// reports, which includes ctx errors and errors from emitChunks.
func (g *GitSource) emitCommitBlobs(ctx context.Context, c *object.Commit, seenBlobs map[plumbing.Hash]struct{}, out chan<- types.Chunk) error {
	root, treeErr := c.Tree()
	if treeErr != nil {
		return nil // skip unreadable tree
	}
	prefix := c.Hash.String()[:7]

	visit := func(blob *object.File) error {
		if cancelled := ctx.Err(); cancelled != nil {
			return cancelled
		}

		if _, dup := seenBlobs[blob.Hash]; dup {
			return nil
		}
		seenBlobs[blob.Hash] = struct{}{}

		// First filter: go-git's own binary detection. A detection error is
		// treated the same as "not binary".
		isBinary, _ := blob.IsBinary()
		if isBinary {
			return nil
		}

		rc, openErr := blob.Reader()
		if openErr != nil {
			return nil
		}
		defer rc.Close()

		content, readErr := io.ReadAll(rc)
		if readErr != nil {
			return nil
		}
		if len(content) == 0 {
			return nil
		}

		// Second filter: our own NUL-byte sniff over the leading bytes.
		head := content[:minInt(len(content), BinarySniffSize)]
		if bytes.IndexByte(head, 0x00) != -1 {
			return nil
		}

		return emitChunks(ctx, content, fmt.Sprintf("git:%s:%s", prefix, blob.Name), g.ChunkSize, out)
	}

	return root.Files().ForEach(visit)
}
// minInt returns the smaller of a and b.
func minInt(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
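Create pkg/engine/sources/git_test.go. The tests build throwaway repositories on disk under t.TempDir() with go-git's PlainInit, so no external git binary or pre-made fixture is needed: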
package sources
import (
"context"
"os"
"path/filepath"
"regexp"
"testing"
"time"
"github.com/go-git/go-git/v5"
"github.com/go-git/go-git/v5/plumbing/object"
"github.com/stretchr/testify/require"
"github.com/salvacybersec/keyhunter/pkg/types"
)
// initRepo creates a fresh non-bare repository in a per-test temporary
// directory and returns the directory together with the repository handle.
// The test fails immediately if initialisation fails.
func initRepo(t *testing.T) (string, *git.Repository) {
	t.Helper()

	root := t.TempDir()
	r, initErr := git.PlainInit(root, false)
	require.NoError(t, initErr)

	return root, r
}
// commitFile writes content to name (relative to dir, creating parent
// directories as needed), stages it, and records a commit with the message
// "add <name>" authored by a fixed test identity at the current time.
// Any failure aborts the test.
func commitFile(t *testing.T, dir string, repo *git.Repository, name, content string) {
	t.Helper()

	target := filepath.Join(dir, name)
	require.NoError(t, os.MkdirAll(filepath.Dir(target), 0o755))
	require.NoError(t, os.WriteFile(target, []byte(content), 0o644))

	worktree, err := repo.Worktree()
	require.NoError(t, err)

	_, err = worktree.Add(name)
	require.NoError(t, err)

	author := &object.Signature{Name: "test", Email: "t@x", When: time.Now()}
	_, err = worktree.Commit("add "+name, &git.CommitOptions{Author: author})
	require.NoError(t, err)
}
// drainGit runs src.Chunks in a goroutine under a 30-second timeout, collects
// everything it sends until the channel is closed, and returns the collected
// chunks. The test fails if Chunks returned an error.
func drainGit(t *testing.T, src Source) []types.Chunk {
	t.Helper()

	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	stream := make(chan types.Chunk, 1024)
	result := make(chan error, 1)
	go func() {
		// Report the result first, then close the stream so the collector
		// loop below terminates.
		result <- src.Chunks(ctx, stream)
		close(stream)
	}()

	var collected []types.Chunk
	for {
		chunk, open := <-stream
		if !open {
			break
		}
		collected = append(collected, chunk)
	}

	require.NoError(t, <-result)
	return collected
}
// TestGitSource_HistoryWalk commits three distinct files in three commits and
// checks that at least three chunks come back, each labelled in the
// "git:<7 hex chars>:<path>" form.
func TestGitSource_HistoryWalk(t *testing.T) {
	dir, repo := initRepo(t)

	files := []struct{ name, body string }{
		{"a.txt", "contents alpha"},
		{"b.txt", "contents bravo"},
		{"c.txt", "contents charlie"},
	}
	for _, f := range files {
		commitFile(t, dir, repo, f.name, f.body)
	}

	got := drainGit(t, NewGitSource(dir))
	require.GreaterOrEqual(t, len(got), 3)

	pattern := regexp.MustCompile(`^git:[0-9a-f]{7}:.+$`)
	for i := range got {
		require.Regexp(t, pattern, got[i].Source)
	}
}
// TestGitSource_BlobDeduplication verifies that a blob is emitted once no
// matter how many paths or commits contain it.
//
// a.txt and b.txt have identical content and therefore share one blob OID,
// and every file is present in the tree of each later commit. Without OID
// deduplication the same content would be emitted several times, so the test
// counts emissions per content instead of collapsing them into a set (which
// would hide duplicates). The fixture contents are short enough that each
// blob is expected to arrive as a single chunk.
func TestGitSource_BlobDeduplication(t *testing.T) {
	dir, repo := initRepo(t)
	commitFile(t, dir, repo, "a.txt", "same exact content everywhere")
	commitFile(t, dir, repo, "b.txt", "same exact content everywhere") // identical blob -> same OID
	commitFile(t, dir, repo, "c.txt", "different content here")

	chunks := drainGit(t, NewGitSource(dir))

	emitted := make(map[string]int)
	for _, c := range chunks {
		emitted[string(c.Data)]++
	}

	// Expect 2 unique blobs scanned, not 3 files.
	require.Len(t, emitted, 2, "duplicate blobs must be deduped by OID")
	for body, n := range emitted {
		require.Equalf(t, 1, n, "blob %q must be emitted exactly once", body)
	}
}
// TestGitSource_ModifiedFileKeepsBothVersions overwrites a file in a second
// commit and checks that both the old and the new content are emitted.
func TestGitSource_ModifiedFileKeepsBothVersions(t *testing.T) {
	dir, repo := initRepo(t)
	for _, body := range []string{"version one", "version two"} {
		// Rewriting the same path yields a new blob for each version.
		commitFile(t, dir, repo, "a.txt", body)
	}

	seen := map[string]struct{}{}
	for _, ch := range drainGit(t, NewGitSource(dir)) {
		seen[string(ch.Data)] = struct{}{}
	}

	_, hasOld := seen["version one"]
	_, hasNew := seen["version two"]
	require.True(t, hasOld, "old version must still be scanned")
	require.True(t, hasNew, "new version must be scanned")
}
// TestGitSource_SinceFilterExcludesAll sets Since one hour into the future so
// the only commit is older than the cutoff, and expects no chunks at all.
func TestGitSource_SinceFilterExcludesAll(t *testing.T) {
	dir, repo := initRepo(t)
	commitFile(t, dir, repo, "a.txt", "alpha")

	source := NewGitSource(dir)
	cutoff := time.Now().Add(1 * time.Hour)
	source.Since = cutoff

	require.Empty(t, drainGit(t, source))
}
func TestGitSource_MissingRepo(t *testing.T) {
src := NewGitSource(filepath.Join(t.TempDir(), "not-a-repo"))
ctx := context.Background()
out := make(chan types.Chunk, 1)
err := src.Chunks(ctx, out)
require.Error(t, err)
}
Do NOT touch any file outside pkg/engine/sources/git.go and pkg/engine/sources/git_test.go. CLI wire-up happens in plan 04-05.
go test ./pkg/engine/sources/... -run TestGitSource -race -count=1 -timeout=60s
<acceptance_criteria>
- go build ./pkg/engine/sources/... exits 0
- go test ./pkg/engine/sources/... -run TestGitSource -race -count=1 passes all subtests
- grep -n "git.PlainOpen" pkg/engine/sources/git.go returns a hit
- grep -n "seenBlobs" pkg/engine/sources/git.go returns a hit (dedup map)
- grep -n "fmt.Sprintf(\"git:%s:%s\"" pkg/engine/sources/git.go returns a hit
- grep -n "g.Since" pkg/engine/sources/git.go returns a hit
</acceptance_criteria>
GitSource walks all branches/tags, emits each unique blob once, honors Since filter, formats source as git:<short-sha>:<path>, and tests cover dedup/history/since/missing-repo.
<success_criteria>
A caller can sources.NewGitSource("./myrepo") and receive chunks for every historical blob across all refs, with deterministic dedup and source attribution in git:<sha>:<path> form.
</success_criteria>