From 6a3d5b0cb7a055a38f9206161d341d32e4195031 Mon Sep 17 00:00:00 2001
From: salvacybersec
Date: Sun, 5 Apr 2026 23:54:44 +0300
Subject: [PATCH] feat(07-03): dedup helper for imported findings

- FindingKey: stable SHA-256 over provider+masked+source+line
- Dedup: preserves first-seen order, returns drop count
- 8 unit tests covering stability, field sensitivity, order preservation
---
 pkg/importer/dedup.go      |  42 +++++++++++++++
 pkg/importer/dedup_test.go | 106 +++++++++++++++++++++++++++++++++++++
 2 files changed, 148 insertions(+)
 create mode 100644 pkg/importer/dedup.go
 create mode 100644 pkg/importer/dedup_test.go

diff --git a/pkg/importer/dedup.go b/pkg/importer/dedup.go
new file mode 100644
index 0000000..aa6488b
--- /dev/null
+++ b/pkg/importer/dedup.go
@@ -0,0 +1,42 @@
+package importer
+
+import (
+	"crypto/sha256"
+	"encoding/hex"
+	"fmt"
+
+	"github.com/salvacybersec/keyhunter/pkg/engine"
+)
+
+// FindingKey returns a stable identity for a finding: the hex-encoded
+// SHA-256 of provider name, masked key, source path, and line number,
+// NUL-separated. Import pipelines use it as the dedup identity so the same
+// underlying secret is not inserted twice when re-importing scanner output.
+//
+// Fields outside this tuple (DetectedAt, Confidence, VerifyStatus, ...)
+// intentionally do not contribute to the key: re-running the same import
+// at a later time must collapse onto the original finding.
+func FindingKey(f engine.Finding) string {
+	payload := fmt.Sprintf("%s\x00%s\x00%s\x00%d", f.ProviderName, f.KeyMasked, f.Source, f.LineNumber)
+	sum := sha256.Sum256([]byte(payload))
+	return hex.EncodeToString(sum[:])
+}
+
+// Dedup drops findings whose FindingKey matches an earlier element of in,
+// keeping first-seen order. It returns the deduplicated slice (non-nil even
+// for empty input) and the number of duplicates dropped.
+func Dedup(in []engine.Finding) ([]engine.Finding, int) {
+	seen := make(map[string]struct{}, len(in))
+	out := make([]engine.Finding, 0, len(in))
+	dropped := 0
+	for _, f := range in {
+		k := FindingKey(f)
+		if _, ok := seen[k]; ok {
+			dropped++
+			continue
+		}
+		seen[k] = struct{}{}
+		out = append(out, f)
+	}
+	return out, dropped
+}
diff --git a/pkg/importer/dedup_test.go b/pkg/importer/dedup_test.go
new file mode 100644
index 0000000..6f324d6
--- /dev/null
+++ b/pkg/importer/dedup_test.go
@@ -0,0 +1,106 @@
+package importer
+
+import (
+	"testing"
+	"time"
+
+	"github.com/salvacybersec/keyhunter/pkg/engine"
+)
+
+func mkFinding(provider, masked, source string, line int) engine.Finding {
+	return engine.Finding{
+		ProviderName: provider,
+		KeyMasked:    masked,
+		Source:       source,
+		LineNumber:   line,
+	}
+}
+
+func TestFindingKey_Stable(t *testing.T) {
+	f := mkFinding("openai", "sk-abc12...wxyz", "src/app.go", 42)
+	if FindingKey(f) != FindingKey(f) {
+		t.Fatal("FindingKey must be deterministic for identical input")
+	}
+}
+
+func TestFindingKey_DiffersByProvider(t *testing.T) {
+	a := mkFinding("openai", "sk-abc12...wxyz", "src/app.go", 42)
+	b := mkFinding("anthropic", "sk-abc12...wxyz", "src/app.go", 42)
+	if FindingKey(a) == FindingKey(b) {
+		t.Fatal("different providers must yield different keys")
+	}
+}
+
+func TestFindingKey_DiffersByMasked(t *testing.T) {
+	a := mkFinding("openai", "sk-aaaaa...wxyz", "src/app.go", 42)
+	b := mkFinding("openai", "sk-bbbbb...wxyz", "src/app.go", 42)
+	if FindingKey(a) == FindingKey(b) {
+		t.Fatal("different masked keys must yield different keys")
+	}
+}
+
+func TestFindingKey_DiffersBySource(t *testing.T) {
+	a := mkFinding("openai", "sk-abc12...wxyz", "src/app.go", 42)
+	b := mkFinding("openai", "sk-abc12...wxyz", "src/other.go", 42)
+	if FindingKey(a) == FindingKey(b) {
+		t.Fatal("different sources must yield different keys")
+	}
+}
+
+func TestFindingKey_DiffersByLine(t *testing.T) {
+	a := mkFinding("openai", "sk-abc12...wxyz", "src/app.go", 42)
+	b := mkFinding("openai", "sk-abc12...wxyz", "src/app.go", 43)
+	if FindingKey(a) == FindingKey(b) {
+		t.Fatal("different line numbers must yield different keys")
+	}
+}
+
+func TestDedup_PreservesOrder(t *testing.T) {
+	a := mkFinding("openai", "sk-aaa...0001", "file_a.go", 1)
+	b := mkFinding("openai", "sk-bbb...0002", "file_b.go", 2)
+	c := mkFinding("openai", "sk-ccc...0003", "file_c.go", 3)
+	in := []engine.Finding{a, b, a, c, b}
+
+	out, dropped := Dedup(in)
+
+	if dropped != 2 {
+		t.Fatalf("expected 2 dropped, got %d", dropped)
+	}
+	if len(out) != 3 {
+		t.Fatalf("expected 3 unique, got %d", len(out))
+	}
+	if out[0].KeyMasked != a.KeyMasked || out[1].KeyMasked != b.KeyMasked || out[2].KeyMasked != c.KeyMasked {
+		t.Fatalf("dedup did not preserve first-seen order: %+v", out)
+	}
+}
+
+func TestDedup_Empty(t *testing.T) {
+	out, dropped := Dedup(nil)
+	if dropped != 0 {
+		t.Fatalf("expected 0 dropped, got %d", dropped)
+	}
+	if len(out) != 0 {
+		t.Fatalf("expected empty output, got %d", len(out))
+	}
+}
+
+func TestDedup_IgnoresUnrelatedFields(t *testing.T) {
+	base := mkFinding("openai", "sk-abc12...wxyz", "src/app.go", 42)
+	base.DetectedAt = time.Now()
+	base.Confidence = "high"
+
+	twin := mkFinding("openai", "sk-abc12...wxyz", "src/app.go", 42)
+	twin.DetectedAt = base.DetectedAt.Add(24 * time.Hour)
+	twin.Confidence = "low"
+
+	out, dropped := Dedup([]engine.Finding{base, twin})
+	if dropped != 1 {
+		t.Fatalf("expected 1 dropped, got %d", dropped)
+	}
+	if len(out) != 1 {
+		t.Fatalf("expected 1 kept, got %d", len(out))
+	}
+	if out[0].Confidence != "high" {
+		t.Fatalf("expected first-seen to be kept, got Confidence=%s", out[0].Confidence)
+	}
+}