From 46eec328d25eb7ed8a5dd99045c4549ef5f0e716 Mon Sep 17 00:00:00 2001
From: salvacybersec
Date: Sun, 5 Apr 2026 23:55:24 +0300
Subject: [PATCH] feat(07-01): Importer interface and TruffleHog v3 JSON adapter

- pkg/importer/importer.go: shared Importer interface (Name, Import)
- pkg/importer/trufflehog.go: TruffleHogImporter with v3 JSON decoding
  (newline-delimited objects or a single array), detector-name
  normalization (OpenAI/GithubV2/AWS -> canonical ids), SourceMetadata
  path+line extraction for Git/Filesystem/Github
- pkg/importer/testdata/trufflehog-sample.json: 3-record fixture
- pkg/importer/trufflehog_test.go: Name, Import, NormalizeName, EmptyArray,
  EmptyInput, NDJSON, InvalidJSON tests
---
 pkg/importer/importer.go                     |  24 +++
 pkg/importer/testdata/trufflehog-sample.json |  57 ++++++
 pkg/importer/trufflehog.go                   | 197 +++++++++++++++++++
 pkg/importer/trufflehog_test.go              | 157 +++++++++++++++
 4 files changed, 435 insertions(+)
 create mode 100644 pkg/importer/importer.go
 create mode 100644 pkg/importer/testdata/trufflehog-sample.json
 create mode 100644 pkg/importer/trufflehog.go
 create mode 100644 pkg/importer/trufflehog_test.go

diff --git a/pkg/importer/importer.go b/pkg/importer/importer.go
new file mode 100644
index 0000000..dad4817
--- /dev/null
+++ b/pkg/importer/importer.go
@@ -0,0 +1,24 @@
+// Package importer provides adapters that parse output from external secret
+// scanners (TruffleHog, Gitleaks, ...) and normalize them into KeyHunter's
+// engine.Finding model so they can be inserted into the unified storage layer.
+package importer
+
+import (
+	"io"
+
+	"github.com/salvacybersec/keyhunter/pkg/engine"
+)
+
+// Importer parses output from an external secret scanner and returns
+// normalized engine.Finding records. Implementations must be stateless
+// and safe for reuse across calls.
+type Importer interface {
+	// Name returns the short identifier of the source format
+	// (e.g. "trufflehog", "gitleaks"). Used by the CLI --format flag.
+	Name() string
+
+	// Import reads scanner output from r and returns the normalized findings.
+	// Implementations should return a wrapped error on malformed input and an
+	// empty slice with nil error on empty input.
+	Import(r io.Reader) ([]engine.Finding, error)
+}
diff --git a/pkg/importer/testdata/trufflehog-sample.json b/pkg/importer/testdata/trufflehog-sample.json
new file mode 100644
index 0000000..2aa0ac0
--- /dev/null
+++ b/pkg/importer/testdata/trufflehog-sample.json
@@ -0,0 +1,57 @@
+[
+  {
+    "SourceID": 1,
+    "SourceName": "git-scan",
+    "SourceMetadata": {
+      "Data": {
+        "Git": {
+          "commit": "deadbeef1234",
+          "file": "src/config.py",
+          "repository": "https://github.com/example/repo",
+          "line": 42
+        }
+      }
+    },
+    "DetectorName": "OpenAI",
+    "DetectorType": 17,
+    "Verified": true,
+    "Raw": "sk-proj-abcdef1234567890abcdef",
+    "Redacted": "sk-proj-abcd...cdef",
+    "ExtraData": {}
+  },
+  {
+    "SourceID": 2,
+    "SourceName": "fs-scan",
+    "SourceMetadata": {
+      "Data": {
+        "Filesystem": {
+          "file": "/tmp/leaked.env"
+        }
+      }
+    },
+    "DetectorName": "AnthropicV2",
+    "DetectorType": 92,
+    "Verified": false,
+    "Raw": "sk-ant-api03-xxxxxxxxxxxxxxxx",
+    "Redacted": "sk-ant-api03-xxxx",
+    "ExtraData": {}
+  },
+  {
+    "SourceID": 3,
+    "SourceName": "github-scan",
+    "SourceMetadata": {
+      "Data": {
+        "Github": {
+          "link": "https://github.com/foo/bar/blob/main/a.yml",
+          "repository": "https://github.com/foo/bar"
+        }
+      }
+    },
+    "DetectorName": "AWS",
+    "DetectorType": 2,
+    "Verified": true,
+    "Raw": "AKIAIOSFODNN7EXAMPLE",
+    "Redacted": "AKIA****EXAMPLE",
+    "ExtraData": {}
+  }
+]
diff --git a/pkg/importer/trufflehog.go b/pkg/importer/trufflehog.go
new file mode 100644
index 0000000..125853a
--- /dev/null
+++ b/pkg/importer/trufflehog.go
@@ -0,0 +1,197 @@
+package importer
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"io"
+	"regexp"
+	"strings"
+	"time"
+
+	"github.com/salvacybersec/keyhunter/pkg/engine"
+)
+
+// TruffleHogImporter parses TruffleHog v3 JSON output
+// (`trufflehog ... --json`) into engine.Finding records.
+//
+// Each TruffleHog record describes one detected secret: detector metadata,
+// verification status, and a nested SourceMetadata object whose shape depends
+// on the scan source (git, filesystem, github, ...). See 07-CONTEXT.md for
+// the field decisions.
+type TruffleHogImporter struct{}
+
+// trufflehogRecord mirrors the v3 JSON schema. Variable-shape fields
+// (SourceMetadata, ExtraData) are kept as raw JSON so decoding does not
+// fail on unknown shapes; DetectorType is decoded as its numeric ID.
+type trufflehogRecord struct {
+	SourceID       int             `json:"SourceID"`
+	SourceName     string          `json:"SourceName"`
+	SourceMetadata json.RawMessage `json:"SourceMetadata"`
+	DetectorName   string          `json:"DetectorName"`
+	DetectorType   int             `json:"DetectorType"`
+	Verified       bool            `json:"Verified"`
+	Raw            string          `json:"Raw"`
+	Redacted       string          `json:"Redacted"`
+	ExtraData      json.RawMessage `json:"ExtraData"`
+}
+
+// tfhSourceMetadata captures the subset of SourceMetadata.Data we extract a
+// source path / line number from. All sub-objects are pointers so we can tell
+// "not present" from "empty".
+type tfhSourceMetadata struct {
+	Data struct {
+		Git *struct {
+			File       string `json:"file"`
+			Repository string `json:"repository"`
+			Commit     string `json:"commit"`
+			Line       int    `json:"line"`
+		} `json:"Git"`
+		Filesystem *struct {
+			File string `json:"file"`
+		} `json:"Filesystem"`
+		Github *struct {
+			File       string `json:"file"`
+			Link       string `json:"link"`
+			Repository string `json:"repository"`
+		} `json:"Github"`
+	} `json:"Data"`
+}
+
+// tfhVersionSuffix strips trailing version markers from detector names
+// ("GithubV2" -> "Github", "AnthropicV2" -> "Anthropic").
+var tfhVersionSuffix = regexp.MustCompile(`v\d+$`)
+
+// tfhAliases maps known lowercase TruffleHog detector names to KeyHunter's
+// canonical provider identifiers. Entries that are the same on both sides
+// are listed explicitly so intent is clear.
+var tfhAliases = map[string]string{
+	"aws":         "aws",
+	"gcp":         "gcp",
+	"openai":      "openai",
+	"anthropic":   "anthropic",
+	"huggingface": "huggingface",
+	"github":      "github",
+}
+
+// Name implements Importer.
+func (TruffleHogImporter) Name() string { return "trufflehog" }
+
+// Import decodes TruffleHog v3 JSON from r and returns the findings in the
+// same order they appear in the input. Two layouts are accepted: the
+// newline-delimited objects that `trufflehog ... --json` prints, and a single
+// JSON array of those objects. Empty input yields an empty slice and a nil
+// error, as the Importer contract requires. Records with an empty Raw value
+// are skipped silently because they carry no usable key material.
+func (TruffleHogImporter) Import(r io.Reader) ([]engine.Finding, error) {
+	findings := []engine.Finding{}
+	now := time.Now()
+	dec := json.NewDecoder(r)
+
+	for {
+		// Read one top-level JSON value at a time so both layouts go through
+		// the same loop; a clean EOF means the input is exhausted.
+		var raw json.RawMessage
+		err := dec.Decode(&raw)
+		if err == io.EOF {
+			return findings, nil
+		}
+		if err != nil {
+			return nil, fmt.Errorf("decoding trufflehog json: %w", err)
+		}
+
+		// An array holds many records; any other value must be one record.
+		var records []trufflehogRecord
+		if bytes.HasPrefix(bytes.TrimSpace(raw), []byte("[")) {
+			err = json.Unmarshal(raw, &records)
+		} else {
+			records = make([]trufflehogRecord, 1)
+			err = json.Unmarshal(raw, &records[0])
+		}
+		if err != nil {
+			return nil, fmt.Errorf("decoding trufflehog json: %w", err)
+		}
+
+		for _, rec := range records {
+			if rec.Raw == "" {
+				continue
+			}
+
+			source, line := extractSourcePath(rec.SourceMetadata)
+			if source == "" {
+				source = rec.SourceName
+			}
+
+			confidence := "medium"
+			verifyStatus := "unverified"
+			if rec.Verified {
+				confidence = "high"
+				verifyStatus = "live"
+			}
+
+			findings = append(findings, engine.Finding{
+				ProviderName: normalizeTruffleHogName(rec.DetectorName),
+				KeyValue:     rec.Raw,
+				KeyMasked:    engine.MaskKey(rec.Raw),
+				Confidence:   confidence,
+				Source:       source,
+				SourceType:   "import:trufflehog",
+				LineNumber:   line,
+				DetectedAt:   now,
+				Verified:     rec.Verified,
+				VerifyStatus: verifyStatus,
+			})
+		}
+	}
+}
+
+// normalizeTruffleHogName converts a TruffleHog detector name
+// ("OpenAI", "GithubV2", "AWS") to the lowercase KeyHunter provider id.
+// Unknown detectors fall through as their lowercased, de-versioned form.
+func normalizeTruffleHogName(detector string) string {
+	lowered := strings.ToLower(strings.TrimSpace(detector))
+	lowered = tfhVersionSuffix.ReplaceAllString(lowered, "")
+	if alias, ok := tfhAliases[lowered]; ok {
+		return alias
+	}
+	return lowered
+}
+
+// extractSourcePath walks SourceMetadata.Data in priority order and returns
+// the first non-empty location string together with the Git line number when
+// one is available. Any unmarshal error is non-fatal and yields ("", 0).
+func extractSourcePath(meta json.RawMessage) (string, int) {
+	if len(meta) == 0 {
+		return "", 0
+	}
+	var sm tfhSourceMetadata
+	if err := json.Unmarshal(meta, &sm); err != nil {
+		return "", 0
+	}
+
+	line := 0
+	if sm.Data.Git != nil {
+		line = sm.Data.Git.Line
+		if sm.Data.Git.File != "" {
+			return sm.Data.Git.File, line
+		}
+	}
+	if sm.Data.Filesystem != nil && sm.Data.Filesystem.File != "" {
+		return sm.Data.Filesystem.File, line
+	}
+	if sm.Data.Github != nil {
+		if sm.Data.Github.File != "" {
+			return sm.Data.Github.File, line
+		}
+		if sm.Data.Github.Link != "" {
+			return sm.Data.Github.Link, line
+		}
+		if sm.Data.Github.Repository != "" {
+			return sm.Data.Github.Repository, line
+		}
+	}
+	if sm.Data.Git != nil && sm.Data.Git.Repository != "" {
+		return sm.Data.Git.Repository, line
+	}
+	return "", line
+}
diff --git a/pkg/importer/trufflehog_test.go b/pkg/importer/trufflehog_test.go
new file mode 100644
index 0000000..b75f1e5
--- /dev/null
+++ b/pkg/importer/trufflehog_test.go
@@ -0,0 +1,157 @@
+package importer
+
+import (
+	"os"
+	"strings"
+	"testing"
+)
+
+func TestTruffleHogImporter_Name(t *testing.T) {
+	var imp TruffleHogImporter
+	if got := imp.Name(); got != "trufflehog" {
+		t.Fatalf("Name() = %q, want %q", got, "trufflehog")
+	}
+}
+
+func TestTruffleHogImporter_Import(t *testing.T) {
+	f, err := os.Open("testdata/trufflehog-sample.json")
+	if err != nil {
+		t.Fatalf("open fixture: %v", err)
+	}
+	defer f.Close()
+
+	var imp TruffleHogImporter
+	findings, err := imp.Import(f)
+	if err != nil {
+		t.Fatalf("Import returned error: %v", err)
+	}
+	if len(findings) != 3 {
+		t.Fatalf("expected 3 findings, got %d", len(findings))
+	}
+
+	// Record 1: OpenAI / Git / verified.
+	f0 := findings[0]
+	if f0.ProviderName != "openai" {
+		t.Errorf("findings[0].ProviderName = %q, want openai", f0.ProviderName)
+	}
+	if f0.Confidence != "high" {
+		t.Errorf("findings[0].Confidence = %q, want high", f0.Confidence)
+	}
+	if !f0.Verified {
+		t.Error("findings[0].Verified = false, want true")
+	}
+	if f0.VerifyStatus != "live" {
+		t.Errorf("findings[0].VerifyStatus = %q, want live", f0.VerifyStatus)
+	}
+	if f0.Source != "src/config.py" {
+		t.Errorf("findings[0].Source = %q, want src/config.py", f0.Source)
+	}
+	if f0.LineNumber != 42 {
+		t.Errorf("findings[0].LineNumber = %d, want 42", f0.LineNumber)
+	}
+	if f0.SourceType != "import:trufflehog" {
+		t.Errorf("findings[0].SourceType = %q, want import:trufflehog", f0.SourceType)
+	}
+	if f0.KeyValue != "sk-proj-abcdef1234567890abcdef" {
+		t.Errorf("findings[0].KeyValue unexpected: %q", f0.KeyValue)
+	}
+	if f0.KeyMasked == "" || f0.KeyMasked == f0.KeyValue {
+		t.Errorf("findings[0].KeyMasked not populated: %q", f0.KeyMasked)
+	}
+
+	// Record 2: AnthropicV2 / Filesystem / unverified.
+	f1 := findings[1]
+	if f1.ProviderName != "anthropic" {
+		t.Errorf("findings[1].ProviderName = %q, want anthropic", f1.ProviderName)
+	}
+	if f1.Confidence != "medium" {
+		t.Errorf("findings[1].Confidence = %q, want medium", f1.Confidence)
+	}
+	if f1.Verified {
+		t.Error("findings[1].Verified = true, want false")
+	}
+	if f1.VerifyStatus != "unverified" {
+		t.Errorf("findings[1].VerifyStatus = %q, want unverified", f1.VerifyStatus)
+	}
+	if f1.Source != "/tmp/leaked.env" {
+		t.Errorf("findings[1].Source = %q, want /tmp/leaked.env", f1.Source)
+	}
+
+	// Record 3: AWS / Github link.
+	f2 := findings[2]
+	if f2.ProviderName != "aws" {
+		t.Errorf("findings[2].ProviderName = %q, want aws", f2.ProviderName)
+	}
+	if !f2.Verified {
+		t.Error("findings[2].Verified = false, want true")
+	}
+	if f2.Source != "https://github.com/foo/bar/blob/main/a.yml" {
+		t.Errorf("findings[2].Source = %q, want github link", f2.Source)
+	}
+}
+
+func TestTruffleHogImporter_NormalizeName(t *testing.T) {
+	cases := []struct {
+		in, want string
+	}{
+		{"OpenAI", "openai"},
+		{"GithubV2", "github"},
+		{"AnthropicV2", "anthropic"},
+		{"AWS", "aws"},
+		{"GCP", "gcp"},
+		{"HuggingFace", "huggingface"},
+		{"UnknownDetector", "unknowndetector"},
+	}
+	for _, c := range cases {
+		if got := normalizeTruffleHogName(c.in); got != c.want {
+			t.Errorf("normalizeTruffleHogName(%q) = %q, want %q", c.in, got, c.want)
+		}
+	}
+}
+
+func TestTruffleHogImporter_EmptyArray(t *testing.T) {
+	var imp TruffleHogImporter
+	findings, err := imp.Import(strings.NewReader("[]"))
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if len(findings) != 0 {
+		t.Fatalf("expected 0 findings, got %d", len(findings))
+	}
+}
+
+// Empty input must yield no findings and no error (Importer contract).
+func TestTruffleHogImporter_EmptyInput(t *testing.T) {
+	var imp TruffleHogImporter
+	findings, err := imp.Import(strings.NewReader(""))
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if len(findings) != 0 {
+		t.Fatalf("expected 0 findings, got %d", len(findings))
+	}
+}
+
+// `trufflehog --json` prints one object per line rather than a single array.
+func TestTruffleHogImporter_NDJSON(t *testing.T) {
+	input := `{"DetectorName":"OpenAI","Verified":true,"Raw":"sk-proj-abcdef1234567890abcdef"}` + "\n" +
+		`{"DetectorName":"AWS","Verified":false,"Raw":"AKIAIOSFODNN7EXAMPLE"}` + "\n"
+	var imp TruffleHogImporter
+	findings, err := imp.Import(strings.NewReader(input))
+	if err != nil {
+		t.Fatalf("Import returned error: %v", err)
+	}
+	if len(findings) != 2 {
+		t.Fatalf("expected 2 findings, got %d", len(findings))
+	}
+	if findings[0].ProviderName != "openai" || findings[1].ProviderName != "aws" {
+		t.Errorf("unexpected providers: %q, %q", findings[0].ProviderName, findings[1].ProviderName)
+	}
+}
+
+func TestTruffleHogImporter_InvalidJSON(t *testing.T) {
+	var imp TruffleHogImporter
+	if _, err := imp.Import(strings.NewReader("not json")); err == nil {
+		t.Fatal("expected error for invalid JSON, got nil")
+	}
+}