From 45cc676f55ba96024307d4c8dc4f162321bdb792 Mon Sep 17 00:00:00 2001
From: salvacybersec
Date: Sun, 5 Apr 2026 12:18:26 +0300
Subject: [PATCH] feat(01-04): add shared Chunk type, Finding struct, Shannon entropy, and MaskKey

- pkg/types/chunk.go: shared Chunk struct breaking engine<->sources circular import
- pkg/engine/finding.go: Finding struct with MaskKey for pipeline output
- pkg/engine/entropy.go: Shannon entropy function using math.Log2
- pkg/engine/entropy_test.go: TDD tests for Shannon and MaskKey
---
 pkg/engine/entropy.go      | 23 +++++++++++++++++++++++
 pkg/engine/entropy_test.go | 31 +++++++++++++++++++++++++++++++
 pkg/engine/finding.go      | 26 ++++++++++++++++++++++++++
 pkg/types/chunk.go         | 10 ++++++++++
 4 files changed, 90 insertions(+)
 create mode 100644 pkg/engine/entropy.go
 create mode 100644 pkg/engine/entropy_test.go
 create mode 100644 pkg/engine/finding.go
 create mode 100644 pkg/types/chunk.go

diff --git a/pkg/engine/entropy.go b/pkg/engine/entropy.go
new file mode 100644
index 0000000..528b50f
--- /dev/null
+++ b/pkg/engine/entropy.go
@@ -0,0 +1,23 @@
+package engine
+
+import "math"
+
+// Shannon computes the Shannon entropy of a string in bits per character.
+// Returns 0.0 for empty strings.
+// A value >= 3.5 indicates high randomness, consistent with real API keys.
+func Shannon(s string) float64 {
+	if len(s) == 0 {
+		return 0.0
+	}
+	freq := make(map[rune]float64) // occurrences of each rune in s
+	for _, c := range s {
+		freq[c]++
+	}
+	n := float64(len([]rune(s))) // total runes (not bytes), matching the per-rune counts above
+	var entropy float64
+	for _, count := range freq {
+		p := count / n
+		entropy -= p * math.Log2(p) // H = -sum(p * log2(p)) over the distinct runes
+	}
+	return entropy
+}
diff --git a/pkg/engine/entropy_test.go b/pkg/engine/entropy_test.go
new file mode 100644
index 0000000..20f7a01
--- /dev/null
+++ b/pkg/engine/entropy_test.go
@@ -0,0 +1,31 @@
+package engine
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestShannonAllSame(t *testing.T) {
+	assert.InDelta(t, 0.0, Shannon("aaaaaaa"), 0.01) // one distinct symbol: p = 1, so entropy is 0
+}
+
+func TestShannonDistinct(t *testing.T) {
+	assert.InDelta(t, 3.0, Shannon("abcdefgh"), 0.1) // 8 equally frequent symbols: log2(8) = 3 bits
+}
+
+func TestShannonRealKey(t *testing.T) {
+	assert.GreaterOrEqual(t, Shannon("sk-proj-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqr"), 3.5)
+}
+
+func TestShannonEmpty(t *testing.T) {
+	assert.Equal(t, 0.0, Shannon(""))
+}
+
+func TestMaskKeyNormal(t *testing.T) {
+	assert.Equal(t, "sk-proj-...1234", MaskKey("sk-proj-abc1234")) // 15 chars: first 8 and last 4 are kept
+}
+
+func TestMaskKeyShort(t *testing.T) {
+	assert.Equal(t, "****", MaskKey("abc")) // fewer than 12 chars: fully masked
+}
diff --git a/pkg/engine/finding.go b/pkg/engine/finding.go
new file mode 100644
index 0000000..c2861d3
--- /dev/null
+++ b/pkg/engine/finding.go
@@ -0,0 +1,26 @@
+package engine
+
+import "time"
+
+// Finding represents a detected API key from the scanning pipeline.
+// KeyValue holds the plaintext key -- the storage layer encrypts it before persisting.
+type Finding struct {
+	ProviderName string // presumably the provider whose pattern matched -- confirm against the detector
+	KeyValue     string // full plaintext key
+	KeyMasked    string // masked form, first8...last4 (the format MaskKey produces)
+	Confidence   string // one of "high", "medium", "low"
+	Source       string // file path or description
+	SourceType   string // one of "file", "dir", "git", "stdin", "url"
+	LineNumber   int    // NOTE(review): 0- or 1-based is not stated here -- confirm
+	Offset       int64  // presumably a byte offset within Source -- confirm
+	DetectedAt   time.Time
+}
+
+// MaskKey returns a masked representation: first 8 chars + "..." + last 4 chars.
+// Returns "****" if the key is shorter than 12 characters; characters are counted as runes, not bytes.
+func MaskKey(key string) string {
+	if r := []rune(key); len(r) >= 12 { // slice by rune so a multi-byte character is never cut in half
+		return string(r[:8]) + "..." + string(r[len(r)-4:])
+	}
+	return "****"
+}
diff --git a/pkg/types/chunk.go b/pkg/types/chunk.go
new file mode 100644
index 0000000..9f10f78
--- /dev/null
+++ b/pkg/types/chunk.go
@@ -0,0 +1,10 @@
+package types
+
+// Chunk is a segment of file content passed through the scanning pipeline.
+// Defined in pkg/types (not pkg/engine) so that pkg/engine/sources can use it
+// without creating a circular import with pkg/engine.
+type Chunk struct {
+	Data   []byte // raw bytes
+	Source string // file path, URL, or description
+	Offset int64  // byte offset of this chunk within the source
+}