feat(07-03): dedup helper for imported findings
- FindingKey: stable SHA-256 over provider+masked+source+line
- Dedup: preserves first-seen order, returns drop count
- 8 unit tests covering stability, field sensitivity, order preservation
This commit is contained in:
42
pkg/importer/dedup.go
Normal file
42
pkg/importer/dedup.go
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
package importer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"crypto/sha256"
|
||||||
|
"encoding/hex"
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
"github.com/salvacybersec/keyhunter/pkg/engine"
|
||||||
|
)
|
||||||
|
|
||||||
|
// FindingKey returns a stable identity hash for a finding based on the
|
||||||
|
// provider name, masked key, source path, and line number. This is the
|
||||||
|
// dedup identity used by import pipelines so the same underlying secret
|
||||||
|
// is not inserted twice when re-importing the same scanner output.
|
||||||
|
//
|
||||||
|
// Fields outside this tuple (DetectedAt, Confidence, VerifyStatus, ...)
|
||||||
|
// intentionally do not contribute to the key: re-running the same import
|
||||||
|
// at a later time must collapse onto the original finding.
|
||||||
|
func FindingKey(f engine.Finding) string {
|
||||||
|
payload := fmt.Sprintf("%s\x00%s\x00%s\x00%d", f.ProviderName, f.KeyMasked, f.Source, f.LineNumber)
|
||||||
|
sum := sha256.Sum256([]byte(payload))
|
||||||
|
return hex.EncodeToString(sum[:])
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dedup removes duplicate findings from in-memory slices before insert.
|
||||||
|
// Order of first-seen findings is preserved. Returns the deduplicated
|
||||||
|
// slice and the number of duplicates dropped.
|
||||||
|
func Dedup(in []engine.Finding) ([]engine.Finding, int) {
|
||||||
|
seen := make(map[string]struct{}, len(in))
|
||||||
|
out := make([]engine.Finding, 0, len(in))
|
||||||
|
dropped := 0
|
||||||
|
for _, f := range in {
|
||||||
|
k := FindingKey(f)
|
||||||
|
if _, ok := seen[k]; ok {
|
||||||
|
dropped++
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[k] = struct{}{}
|
||||||
|
out = append(out, f)
|
||||||
|
}
|
||||||
|
return out, dropped
|
||||||
|
}
|
||||||
106
pkg/importer/dedup_test.go
Normal file
106
pkg/importer/dedup_test.go
Normal file
@@ -0,0 +1,106 @@
|
|||||||
|
package importer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/salvacybersec/keyhunter/pkg/engine"
|
||||||
|
)
|
||||||
|
|
||||||
|
func mkFinding(provider, masked, source string, line int) engine.Finding {
|
||||||
|
return engine.Finding{
|
||||||
|
ProviderName: provider,
|
||||||
|
KeyMasked: masked,
|
||||||
|
Source: source,
|
||||||
|
LineNumber: line,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFindingKey_Stable(t *testing.T) {
|
||||||
|
f := mkFinding("openai", "sk-abc12...wxyz", "src/app.go", 42)
|
||||||
|
if FindingKey(f) != FindingKey(f) {
|
||||||
|
t.Fatal("FindingKey must be deterministic for identical input")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFindingKey_DiffersByProvider(t *testing.T) {
|
||||||
|
a := mkFinding("openai", "sk-abc12...wxyz", "src/app.go", 42)
|
||||||
|
b := mkFinding("anthropic", "sk-abc12...wxyz", "src/app.go", 42)
|
||||||
|
if FindingKey(a) == FindingKey(b) {
|
||||||
|
t.Fatal("different providers must yield different keys")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFindingKey_DiffersByMasked(t *testing.T) {
|
||||||
|
a := mkFinding("openai", "sk-aaaaa...wxyz", "src/app.go", 42)
|
||||||
|
b := mkFinding("openai", "sk-bbbbb...wxyz", "src/app.go", 42)
|
||||||
|
if FindingKey(a) == FindingKey(b) {
|
||||||
|
t.Fatal("different masked keys must yield different keys")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFindingKey_DiffersBySource(t *testing.T) {
|
||||||
|
a := mkFinding("openai", "sk-abc12...wxyz", "src/app.go", 42)
|
||||||
|
b := mkFinding("openai", "sk-abc12...wxyz", "src/other.go", 42)
|
||||||
|
if FindingKey(a) == FindingKey(b) {
|
||||||
|
t.Fatal("different sources must yield different keys")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFindingKey_DiffersByLine(t *testing.T) {
|
||||||
|
a := mkFinding("openai", "sk-abc12...wxyz", "src/app.go", 42)
|
||||||
|
b := mkFinding("openai", "sk-abc12...wxyz", "src/app.go", 43)
|
||||||
|
if FindingKey(a) == FindingKey(b) {
|
||||||
|
t.Fatal("different line numbers must yield different keys")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDedup_PreservesOrder(t *testing.T) {
|
||||||
|
a := mkFinding("openai", "sk-aaa...0001", "file_a.go", 1)
|
||||||
|
b := mkFinding("openai", "sk-bbb...0002", "file_b.go", 2)
|
||||||
|
c := mkFinding("openai", "sk-ccc...0003", "file_c.go", 3)
|
||||||
|
in := []engine.Finding{a, b, a, c, b}
|
||||||
|
|
||||||
|
out, dropped := Dedup(in)
|
||||||
|
|
||||||
|
if dropped != 2 {
|
||||||
|
t.Fatalf("expected 2 dropped, got %d", dropped)
|
||||||
|
}
|
||||||
|
if len(out) != 3 {
|
||||||
|
t.Fatalf("expected 3 unique, got %d", len(out))
|
||||||
|
}
|
||||||
|
if out[0].KeyMasked != a.KeyMasked || out[1].KeyMasked != b.KeyMasked || out[2].KeyMasked != c.KeyMasked {
|
||||||
|
t.Fatalf("dedup did not preserve first-seen order: %+v", out)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDedup_Empty(t *testing.T) {
|
||||||
|
out, dropped := Dedup(nil)
|
||||||
|
if dropped != 0 {
|
||||||
|
t.Fatalf("expected 0 dropped, got %d", dropped)
|
||||||
|
}
|
||||||
|
if len(out) != 0 {
|
||||||
|
t.Fatalf("expected empty output, got %d", len(out))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDedup_IgnoresUnrelatedFields(t *testing.T) {
|
||||||
|
base := mkFinding("openai", "sk-abc12...wxyz", "src/app.go", 42)
|
||||||
|
base.DetectedAt = time.Now()
|
||||||
|
base.Confidence = "high"
|
||||||
|
|
||||||
|
twin := mkFinding("openai", "sk-abc12...wxyz", "src/app.go", 42)
|
||||||
|
twin.DetectedAt = base.DetectedAt.Add(24 * time.Hour)
|
||||||
|
twin.Confidence = "low"
|
||||||
|
|
||||||
|
out, dropped := Dedup([]engine.Finding{base, twin})
|
||||||
|
if dropped != 1 {
|
||||||
|
t.Fatalf("expected 1 dropped, got %d", dropped)
|
||||||
|
}
|
||||||
|
if len(out) != 1 {
|
||||||
|
t.Fatalf("expected 1 kept, got %d", len(out))
|
||||||
|
}
|
||||||
|
if out[0].Confidence != "high" {
|
||||||
|
t.Fatalf("expected first-seen to be kept, got Confidence=%s", out[0].Confidence)
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user