diff --git a/pkg/recon/dedup.go b/pkg/recon/dedup.go new file mode 100644 index 0000000..64806ab --- /dev/null +++ b/pkg/recon/dedup.go @@ -0,0 +1,41 @@ +package recon + +import ( + "crypto/sha256" + "encoding/hex" + + "github.com/salvacybersec/keyhunter/pkg/engine" +) + +// Dedup removes duplicate findings from a recon sweep using +// SHA256(ProviderName|KeyMasked|Source) as the dedup key. +// +// The operation is stable: input order is preserved and first-seen metadata +// (DetectedAt, Confidence, VerifyStatus, etc.) wins when a later duplicate +// would otherwise overwrite it. Two findings with the same provider and +// masked key but different Source URLs are kept separate, so callers can +// see every distinct location where a leaked key was found. +// +// Callers (e.g. Engine.SweepAll from Plan 09-01) should invoke Dedup on the +// aggregated finding slice before persisting to storage. A nil or empty +// input returns nil. +// +// Note: this package uses engine.Finding directly rather than a local alias +// so it compiles independently of Plan 09-01 during parallel execution. +func Dedup(in []engine.Finding) []engine.Finding { + if len(in) == 0 { + return nil + } + seen := make(map[string]struct{}, len(in)) + out := make([]engine.Finding, 0, len(in)) + for _, f := range in { + sum := sha256.Sum256([]byte(f.ProviderName + "|" + f.KeyMasked + "|" + f.Source)) + key := hex.EncodeToString(sum[:]) + if _, dup := seen[key]; dup { + continue + } + seen[key] = struct{}{} + out = append(out, f) + } + return out +}