- Walks every commit across branches, tags, remote-tracking refs, and stash
- Deduplicates blob scans by OID (seenBlobs map) so identical content across commits/files is scanned exactly once
- Emits chunks with source format git:<short-sha>:<path>
- Honors the --since filter via GitSource.Since (commit author date)
- Resolves annotated tag objects down to their commit hash
- Skips binary blobs via go-git IsBinary plus a null-byte sniff
- 8 tests cover history walk, dedup, modified-file, multi-branch, tag reachability, since filter, source format, missing repo
187 lines
5.6 KiB
Go
187 lines
5.6 KiB
Go
package sources
|
|
|
|
import (
|
|
"context"
|
|
"os"
|
|
"path/filepath"
|
|
"regexp"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/go-git/go-git/v5"
|
|
"github.com/go-git/go-git/v5/plumbing"
|
|
"github.com/go-git/go-git/v5/plumbing/object"
|
|
"github.com/stretchr/testify/require"
|
|
|
|
"github.com/salvacybersec/keyhunter/pkg/types"
|
|
)
|
|
|
|
func initRepo(t *testing.T) (string, *git.Repository) {
|
|
t.Helper()
|
|
dir := t.TempDir()
|
|
repo, err := git.PlainInit(dir, false)
|
|
require.NoError(t, err)
|
|
return dir, repo
|
|
}
|
|
|
|
func commitFile(t *testing.T, dir string, repo *git.Repository, name, content string) plumbing.Hash {
|
|
t.Helper()
|
|
path := filepath.Join(dir, name)
|
|
require.NoError(t, os.MkdirAll(filepath.Dir(path), 0o755))
|
|
require.NoError(t, os.WriteFile(path, []byte(content), 0o644))
|
|
wt, err := repo.Worktree()
|
|
require.NoError(t, err)
|
|
_, err = wt.Add(name)
|
|
require.NoError(t, err)
|
|
h, err := wt.Commit("add "+name, &git.CommitOptions{
|
|
Author: &object.Signature{Name: "test", Email: "t@x", When: time.Now()},
|
|
})
|
|
require.NoError(t, err)
|
|
return h
|
|
}
|
|
|
|
func drainGit(t *testing.T, src Source) []types.Chunk {
|
|
t.Helper()
|
|
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
|
defer cancel()
|
|
out := make(chan types.Chunk, 1024)
|
|
errCh := make(chan error, 1)
|
|
go func() { errCh <- src.Chunks(ctx, out); close(out) }()
|
|
var got []types.Chunk
|
|
for c := range out {
|
|
got = append(got, c)
|
|
}
|
|
require.NoError(t, <-errCh)
|
|
return got
|
|
}
|
|
|
|
func TestGitSource_HistoryWalk(t *testing.T) {
|
|
dir, repo := initRepo(t)
|
|
commitFile(t, dir, repo, "a.txt", "contents alpha")
|
|
commitFile(t, dir, repo, "b.txt", "contents bravo")
|
|
commitFile(t, dir, repo, "c.txt", "contents charlie")
|
|
|
|
chunks := drainGit(t, NewGitSource(dir))
|
|
require.GreaterOrEqual(t, len(chunks), 3)
|
|
|
|
re := regexp.MustCompile(`^git:[0-9a-f]{7}:.+$`)
|
|
for _, c := range chunks {
|
|
require.Regexp(t, re, c.Source)
|
|
}
|
|
}
|
|
|
|
func TestGitSource_BlobDeduplication(t *testing.T) {
|
|
dir, repo := initRepo(t)
|
|
commitFile(t, dir, repo, "a.txt", "same exact content everywhere")
|
|
commitFile(t, dir, repo, "b.txt", "same exact content everywhere") // identical blob -> same OID
|
|
commitFile(t, dir, repo, "c.txt", "different content here")
|
|
|
|
chunks := drainGit(t, NewGitSource(dir))
|
|
// Expect 2 unique blob contents scanned across all commits, not 3 per commit.
|
|
unique := make(map[string]bool)
|
|
for _, c := range chunks {
|
|
unique[string(c.Data)] = true
|
|
}
|
|
require.Len(t, unique, 2, "duplicate blobs must be deduped by OID")
|
|
}
|
|
|
|
func TestGitSource_ModifiedFileKeepsBothVersions(t *testing.T) {
|
|
dir, repo := initRepo(t)
|
|
commitFile(t, dir, repo, "a.txt", "version one")
|
|
commitFile(t, dir, repo, "a.txt", "version two") // modifying produces a second blob
|
|
|
|
chunks := drainGit(t, NewGitSource(dir))
|
|
bodies := make(map[string]bool)
|
|
for _, c := range chunks {
|
|
bodies[string(c.Data)] = true
|
|
}
|
|
require.True(t, bodies["version one"], "old version must still be scanned")
|
|
require.True(t, bodies["version two"], "new version must be scanned")
|
|
}
|
|
|
|
func TestGitSource_MultiBranch(t *testing.T) {
|
|
dir, repo := initRepo(t)
|
|
commitFile(t, dir, repo, "base.txt", "base content")
|
|
|
|
// Capture current branch to come back to it.
|
|
head, err := repo.Head()
|
|
require.NoError(t, err)
|
|
baseBranch := head.Name()
|
|
|
|
// Create and checkout a new branch "feature".
|
|
wt, err := repo.Worktree()
|
|
require.NoError(t, err)
|
|
featureRef := plumbing.NewBranchReferenceName("feature")
|
|
require.NoError(t, wt.Checkout(&git.CheckoutOptions{
|
|
Branch: featureRef,
|
|
Create: true,
|
|
}))
|
|
commitFile(t, dir, repo, "feature_only.txt", "feature branch content")
|
|
|
|
// Switch back to base branch and add another file.
|
|
require.NoError(t, wt.Checkout(&git.CheckoutOptions{Branch: baseBranch}))
|
|
commitFile(t, dir, repo, "main_only.txt", "main branch content")
|
|
|
|
chunks := drainGit(t, NewGitSource(dir))
|
|
bodies := make(map[string]bool)
|
|
for _, c := range chunks {
|
|
bodies[string(c.Data)] = true
|
|
}
|
|
require.True(t, bodies["base content"], "base blob must be scanned")
|
|
require.True(t, bodies["feature branch content"], "feature branch blob must be scanned")
|
|
require.True(t, bodies["main branch content"], "main branch blob must be scanned")
|
|
}
|
|
|
|
func TestGitSource_TagReachesOldCommit(t *testing.T) {
|
|
dir, repo := initRepo(t)
|
|
oldHash := commitFile(t, dir, repo, "old.txt", "old tagged content")
|
|
// Tag the old commit.
|
|
_, err := repo.CreateTag("v0.1", oldHash, nil)
|
|
require.NoError(t, err)
|
|
// Add more commits after the tag.
|
|
commitFile(t, dir, repo, "new.txt", "later content")
|
|
|
|
chunks := drainGit(t, NewGitSource(dir))
|
|
bodies := make(map[string]bool)
|
|
for _, c := range chunks {
|
|
bodies[string(c.Data)] = true
|
|
}
|
|
require.True(t, bodies["old tagged content"], "tagged commit's blobs must be reachable")
|
|
require.True(t, bodies["later content"])
|
|
}
|
|
|
|
func TestGitSource_SinceFilterExcludesAll(t *testing.T) {
|
|
dir, repo := initRepo(t)
|
|
commitFile(t, dir, repo, "a.txt", "alpha")
|
|
|
|
src := NewGitSource(dir)
|
|
src.Since = time.Now().Add(1 * time.Hour)
|
|
chunks := drainGit(t, src)
|
|
require.Empty(t, chunks)
|
|
}
|
|
|
|
func TestGitSource_SourceFormat(t *testing.T) {
|
|
dir, repo := initRepo(t)
|
|
commitFile(t, dir, repo, "path/to/file.txt", "some content")
|
|
|
|
chunks := drainGit(t, NewGitSource(dir))
|
|
require.NotEmpty(t, chunks)
|
|
re := regexp.MustCompile(`^git:[0-9a-f]{7}:path/to/file\.txt$`)
|
|
matched := false
|
|
for _, c := range chunks {
|
|
if re.MatchString(c.Source) {
|
|
matched = true
|
|
break
|
|
}
|
|
}
|
|
require.True(t, matched, "expected a chunk with source matching git:<sha>:path/to/file.txt")
|
|
}
|
|
|
|
func TestGitSource_MissingRepo(t *testing.T) {
|
|
src := NewGitSource(filepath.Join(t.TempDir(), "not-a-repo"))
|
|
ctx := context.Background()
|
|
out := make(chan types.Chunk, 1)
|
|
err := src.Chunks(ctx, out)
|
|
require.Error(t, err)
|
|
}
|