package sources

import (
	"context"
	"os"
	"path/filepath"
	"regexp"
	"testing"
	"time"

	"github.com/go-git/go-git/v5"
	"github.com/go-git/go-git/v5/plumbing"
	"github.com/go-git/go-git/v5/plumbing/object"
	"github.com/stretchr/testify/require"

	"github.com/salvacybersec/keyhunter/pkg/types"
)

// initRepo creates a non-bare git repository inside a fresh per-test temp
// directory and returns the directory path together with the repository
// handle. Any initialization error fails the test immediately.
func initRepo(t *testing.T) (string, *git.Repository) {
	t.Helper()
	dir := t.TempDir()
	repo, err := git.PlainInit(dir, false)
	require.NoError(t, err)
	return dir, repo
}

// commitFile writes content to name (relative to dir, creating parent
// directories as needed), stages it, and commits it on the currently
// checked-out branch with a fixed test author. It returns the hash of the new
// commit. Any error fails the test immediately.
func commitFile(t *testing.T, dir string, repo *git.Repository, name, content string) plumbing.Hash {
	t.Helper()
	path := filepath.Join(dir, name)
	require.NoError(t, os.MkdirAll(filepath.Dir(path), 0o755))
	require.NoError(t, os.WriteFile(path, []byte(content), 0o644))

	wt, err := repo.Worktree()
	require.NoError(t, err)
	_, err = wt.Add(name)
	require.NoError(t, err)

	h, err := wt.Commit("add "+name, &git.CommitOptions{
		Author: &object.Signature{Name: "test", Email: "t@x", When: time.Now()},
	})
	require.NoError(t, err)
	return h
}

// drainGit runs src.Chunks in a goroutine under a 30-second context timeout,
// collects every chunk sent on the output channel, and returns them. The test
// fails if Chunks returns an error.
func drainGit(t *testing.T, src Source) []types.Chunk {
	t.Helper()
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	out := make(chan types.Chunk, 1024)
	errCh := make(chan error, 1)
	go func() {
		errCh <- src.Chunks(ctx, out)
		// Closing after Chunks returns is what terminates the range loop below.
		close(out)
	}()

	var got []types.Chunk
	for c := range out {
		got = append(got, c)
	}
	require.NoError(t, <-errCh)
	return got
}

// gitBodyCounts tallies how many chunks carry each distinct payload, keyed by
// the chunk data converted to a string.
func gitBodyCounts(chunks []types.Chunk) map[string]int {
	counts := make(map[string]int, len(chunks))
	for _, c := range chunks {
		counts[string(c.Data)]++
	}
	return counts
}

// TestGitSource_HistoryWalk checks that scanning a three-commit history yields
// at least three chunks and that every chunk's Source has the form
// git:<7 hex chars>:<path>.
func TestGitSource_HistoryWalk(t *testing.T) {
	dir, repo := initRepo(t)
	commitFile(t, dir, repo, "a.txt", "contents alpha")
	commitFile(t, dir, repo, "b.txt", "contents bravo")
	commitFile(t, dir, repo, "c.txt", "contents charlie")

	chunks := drainGit(t, NewGitSource(dir))
	require.GreaterOrEqual(t, len(chunks), 3)

	re := regexp.MustCompile(`^git:[0-9a-f]{7}:.+$`)
	for _, c := range chunks {
		require.Regexp(t, re, c.Source)
	}
}

// TestGitSource_BlobDeduplication checks that a blob which appears under
// several paths and in several commits is emitted only once.
func TestGitSource_BlobDeduplication(t *testing.T) {
	dir, repo := initRepo(t)
	commitFile(t, dir, repo, "a.txt", "same exact content everywhere")
	commitFile(t, dir, repo, "b.txt", "same exact content everywhere") // identical blob -> same OID
	commitFile(t, dir, repo, "c.txt", "different content here")

	bodies := gitBodyCounts(drainGit(t, NewGitSource(dir)))

	// Expect 2 unique blob contents scanned across all commits, not 3 per commit.
	require.Len(t, bodies, 2, "duplicate blobs must be deduped by OID")

	// A set of bodies would hide repeated emissions of the same blob, so the
	// per-body count is asserted as well.
	// NOTE(review): assumes each of these short blobs is emitted as a single
	// chunk — confirm against the chunking logic in GitSource.
	for body, n := range bodies {
		require.Equal(t, 1, n, "blob %q must be emitted exactly once", body)
	}
}

// TestGitSource_ModifiedFileKeepsBothVersions checks that both the old and the
// new contents of a file modified across two commits are scanned.
func TestGitSource_ModifiedFileKeepsBothVersions(t *testing.T) {
	dir, repo := initRepo(t)
	commitFile(t, dir, repo, "a.txt", "version one")
	commitFile(t, dir, repo, "a.txt", "version two") // modifying produces a second blob

	bodies := gitBodyCounts(drainGit(t, NewGitSource(dir)))
	require.Contains(t, bodies, "version one", "old version must still be scanned")
	require.Contains(t, bodies, "version two", "new version must be scanned")
}

// TestGitSource_MultiBranch checks that blobs committed on two diverging
// branches, as well as their shared base commit, are all scanned.
func TestGitSource_MultiBranch(t *testing.T) {
	dir, repo := initRepo(t)
	commitFile(t, dir, repo, "base.txt", "base content")

	// Capture current branch to come back to it.
	head, err := repo.Head()
	require.NoError(t, err)
	baseBranch := head.Name()

	// Create and checkout a new branch "feature".
	wt, err := repo.Worktree()
	require.NoError(t, err)
	featureRef := plumbing.NewBranchReferenceName("feature")
	require.NoError(t, wt.Checkout(&git.CheckoutOptions{
		Branch: featureRef,
		Create: true,
	}))
	commitFile(t, dir, repo, "feature_only.txt", "feature branch content")

	// Switch back to base branch and add another file.
	require.NoError(t, wt.Checkout(&git.CheckoutOptions{Branch: baseBranch}))
	commitFile(t, dir, repo, "main_only.txt", "main branch content")

	bodies := gitBodyCounts(drainGit(t, NewGitSource(dir)))
	require.Contains(t, bodies, "base content", "base blob must be scanned")
	require.Contains(t, bodies, "feature branch content", "feature branch blob must be scanned")
	require.Contains(t, bodies, "main branch content", "main branch blob must be scanned")
}

// TestGitSource_TagReachesOldCommit checks that the blobs of a tagged commit,
// and of a commit made after the tag, are both scanned.
//
// NOTE(review): the tagged commit is also an ancestor of the branch head here,
// so this test does not isolate reachability through the tag alone.
func TestGitSource_TagReachesOldCommit(t *testing.T) {
	dir, repo := initRepo(t)
	oldHash := commitFile(t, dir, repo, "old.txt", "old tagged content")

	// Tag the old commit.
	_, err := repo.CreateTag("v0.1", oldHash, nil)
	require.NoError(t, err)

	// Add more commits after the tag.
	commitFile(t, dir, repo, "new.txt", "later content")

	bodies := gitBodyCounts(drainGit(t, NewGitSource(dir)))
	require.Contains(t, bodies, "old tagged content", "tagged commit's blobs must be reachable")
	require.Contains(t, bodies, "later content")
}

// TestGitSource_SinceFilterExcludesAll checks that setting Since to a time in
// the future results in no chunks being emitted.
func TestGitSource_SinceFilterExcludesAll(t *testing.T) {
	dir, repo := initRepo(t)
	commitFile(t, dir, repo, "a.txt", "alpha")

	src := NewGitSource(dir)
	src.Since = time.Now().Add(1 * time.Hour)

	chunks := drainGit(t, src)
	require.Empty(t, chunks)
}

// TestGitSource_SourceFormat checks that a file in a nested directory is
// reported with its slash-separated path after the short commit hash.
func TestGitSource_SourceFormat(t *testing.T) {
	dir, repo := initRepo(t)
	commitFile(t, dir, repo, "path/to/file.txt", "some content")

	chunks := drainGit(t, NewGitSource(dir))
	require.NotEmpty(t, chunks)

	re := regexp.MustCompile(`^git:[0-9a-f]{7}:path/to/file\.txt$`)
	matched := false
	for _, c := range chunks {
		if re.MatchString(c.Source) {
			matched = true
			break
		}
	}
	require.True(t, matched, "expected a chunk with source matching git:<short-sha>:path/to/file.txt")
}

// TestGitSource_MissingRepo checks that Chunks returns an error when the
// configured path does not exist.
func TestGitSource_MissingRepo(t *testing.T) {
	src := NewGitSource(filepath.Join(t.TempDir(), "not-a-repo"))
	ctx := context.Background()
	out := make(chan types.Chunk, 1)
	err := src.Chunks(ctx, out)
	require.Error(t, err)
}