From 83640ac200edb2c919a933a3f3984aa8cd5b36ec Mon Sep 17 00:00:00 2001 From: salvacybersec Date: Sun, 5 Apr 2026 23:55:36 +0300 Subject: [PATCH] feat(07-02): add Gitleaks JSON + CSV importers - GitleaksImporter parses native JSON array output to []engine.Finding - GitleaksCSVImporter parses CSV with header-based column resolution - normalizeGitleaksRuleID strips suffixes (-api-key, -access-token, ...) - Shared buildGitleaksFinding helper keeps JSON/CSV paths in lockstep - Test fixtures + 8 tests covering happy path, empty, invalid, symlink fallback --- pkg/importer/gitleaks.go | 153 ++++++++++++++++++++ pkg/importer/gitleaks_test.go | 159 +++++++++++++++++++++ pkg/importer/testdata/gitleaks-sample.csv | 4 + pkg/importer/testdata/gitleaks-sample.json | 62 ++++++++ 4 files changed, 378 insertions(+) create mode 100644 pkg/importer/gitleaks.go create mode 100644 pkg/importer/gitleaks_test.go create mode 100644 pkg/importer/testdata/gitleaks-sample.csv create mode 100644 pkg/importer/testdata/gitleaks-sample.json diff --git a/pkg/importer/gitleaks.go b/pkg/importer/gitleaks.go new file mode 100644 index 0000000..3b637ab --- /dev/null +++ b/pkg/importer/gitleaks.go @@ -0,0 +1,153 @@ +package importer + +import ( + "encoding/csv" + "encoding/json" + "fmt" + "io" + "strconv" + "strings" + "time" + + "github.com/salvacybersec/keyhunter/pkg/engine" +) + +// GitleaksImporter parses Gitleaks native JSON output (an array of finding +// records) and normalizes each record into an engine.Finding. +type GitleaksImporter struct{} + +// GitleaksCSVImporter parses Gitleaks CSV output with a mandatory header row. +// Columns are resolved by header name so Gitleaks version drift in column +// order does not break ingestion. +type GitleaksCSVImporter struct{} + +// gitleaksRecord mirrors the JSON object emitted by `gitleaks detect -f json`. +type gitleaksRecord struct { + Description string `json:"Description"` + StartLine int `json:"StartLine"` + EndLine int `json:"EndLine"` + StartColumn int `json:"StartColumn"` + EndColumn int `json:"EndColumn"` + Match string `json:"Match"` + Secret string `json:"Secret"` + File string `json:"File"` + SymlinkFile string `json:"SymlinkFile"` + Commit string `json:"Commit"` + Entropy float64 `json:"Entropy"` + Author string `json:"Author"` + Email string `json:"Email"` + Date string `json:"Date"` + Message string `json:"Message"` + Tags []string `json:"Tags"` + RuleID string `json:"RuleID"` + Fingerprint string `json:"Fingerprint"` +} + +// Name returns the importer identifier used by the CLI --format flag. +func (GitleaksImporter) Name() string { return "gitleaks" } + +// Import decodes a Gitleaks JSON array from r and returns the normalized +// findings. An empty array returns (nil, nil). +func (GitleaksImporter) Import(r io.Reader) ([]engine.Finding, error) { + var records []gitleaksRecord + dec := json.NewDecoder(r) + if err := dec.Decode(&records); err != nil { + return nil, fmt.Errorf("gitleaks: decode json: %w", err) + } + if len(records) == 0 { + return nil, nil + } + findings := make([]engine.Finding, 0, len(records)) + for _, rec := range records { + findings = append(findings, buildGitleaksFinding(rec.RuleID, rec.Secret, rec.File, rec.SymlinkFile, rec.StartLine)) + } + return findings, nil +} + +// Name returns the importer identifier used by the CLI --format flag. +func (GitleaksCSVImporter) Name() string { return "gitleaks-csv" } + +// Import decodes Gitleaks CSV output with a mandatory header row. Columns are +// resolved by header name; missing optional fields default to zero values. +// A header-only input returns (nil, nil). +func (GitleaksCSVImporter) Import(r io.Reader) ([]engine.Finding, error) { + reader := csv.NewReader(r) + reader.FieldsPerRecord = -1 // tolerate ragged rows + + header, err := reader.Read() + if err == io.EOF { + return nil, nil + } + if err != nil { + return nil, fmt.Errorf("gitleaks-csv: read header: %w", err) + } + + index := make(map[string]int, len(header)) + for i, col := range header { + index[strings.TrimSpace(col)] = i + } + + get := func(row []string, name string) string { + i, ok := index[name] + if !ok || i >= len(row) { + return "" + } + return row[i] + } + + var findings []engine.Finding + for { + row, err := reader.Read() + if err == io.EOF { + break + } + if err != nil { + return nil, fmt.Errorf("gitleaks-csv: read row: %w", err) + } + startLine, _ := strconv.Atoi(get(row, "StartLine")) + findings = append(findings, buildGitleaksFinding( + get(row, "RuleID"), + get(row, "Secret"), + get(row, "File"), + get(row, "SymlinkFile"), + startLine, + )) + } + return findings, nil +} + +// normalizeGitleaksRuleID maps a Gitleaks rule identifier to a short +// KeyHunter-style provider name. It lowercases the input and strips common +// trailing tokens ("-api-key", "-access-token", ...). Unknown patterns are +// returned lowercased but otherwise unchanged. +func normalizeGitleaksRuleID(id string) string { + id = strings.ToLower(strings.TrimSpace(id)) + suffixes := []string{"-api-key", "-access-token", "-secret-key", "-secret", "-token", "-key"} + for _, s := range suffixes { + if strings.HasSuffix(id, s) { + return strings.TrimSuffix(id, s) + } + } + return id +} + +// buildGitleaksFinding assembles an engine.Finding from fields common to both +// the JSON and CSV Gitleaks code paths so the two importers stay in lockstep. +func buildGitleaksFinding(ruleID, secret, file, symlink string, startLine int) engine.Finding { + source := file + if source == "" { + source = symlink + } + return engine.Finding{ + ProviderName: normalizeGitleaksRuleID(ruleID), + KeyValue: secret, + KeyMasked: engine.MaskKey(secret), + Confidence: "medium", + Source: source, + SourceType: "import:gitleaks", + LineNumber: startLine, + DetectedAt: time.Now(), + Verified: false, + VerifyStatus: "unverified", + } +} diff --git a/pkg/importer/gitleaks_test.go b/pkg/importer/gitleaks_test.go new file mode 100644 index 0000000..f05bb91 --- /dev/null +++ b/pkg/importer/gitleaks_test.go @@ -0,0 +1,159 @@ +package importer + +import ( + "bytes" + "os" + "strings" + "testing" +) + +func loadFixture(t *testing.T, name string) []byte { + t.Helper() + data, err := os.ReadFile("testdata/" + name) + if err != nil { + t.Fatalf("read fixture %s: %v", name, err) + } + return data +} + +func TestGitleaksImporter_Name(t *testing.T) { + if (GitleaksImporter{}).Name() != "gitleaks" { + t.Errorf("GitleaksImporter.Name() = %q, want %q", (GitleaksImporter{}).Name(), "gitleaks") + } + if (GitleaksCSVImporter{}).Name() != "gitleaks-csv" { + t.Errorf("GitleaksCSVImporter.Name() = %q, want %q", (GitleaksCSVImporter{}).Name(), "gitleaks-csv") + } +} + +func TestGitleaksImporter_JSON(t *testing.T) { + data := loadFixture(t, "gitleaks-sample.json") + findings, err := (GitleaksImporter{}).Import(bytes.NewReader(data)) + if err != nil { + t.Fatalf("Import: %v", err) + } + if len(findings) != 3 { + t.Fatalf("expected 3 findings, got %d", len(findings)) + } + if findings[0].ProviderName != "openai" { + t.Errorf("findings[0].ProviderName = %q, want %q", findings[0].ProviderName, "openai") + } + if findings[0].KeyValue != "sk-proj-1234567890abcdef1234" { + t.Errorf("findings[0].KeyValue mismatch: %q", findings[0].KeyValue) + } + if findings[0].Source != "config/app.yml" { + t.Errorf("findings[0].Source = %q", findings[0].Source) + } + if findings[0].LineNumber != 12 { + t.Errorf("findings[0].LineNumber = %d, want 12", findings[0].LineNumber) + } + if findings[0].SourceType != "import:gitleaks" { + t.Errorf("findings[0].SourceType = %q", findings[0].SourceType) + } + if findings[0].Confidence != "medium" { + t.Errorf("findings[0].Confidence = %q, want medium", findings[0].Confidence) + } + if findings[0].VerifyStatus != "unverified" { + t.Errorf("findings[0].VerifyStatus = %q, want unverified", findings[0].VerifyStatus) + } + if findings[0].Verified { + t.Errorf("findings[0].Verified should be false") + } + if findings[0].KeyMasked == "" { + t.Errorf("findings[0].KeyMasked should be set") + } + if findings[1].ProviderName != "aws" { + t.Errorf("findings[1].ProviderName = %q, want aws", findings[1].ProviderName) + } + if findings[1].LineNumber != 55 { + t.Errorf("findings[1].LineNumber = %d, want 55", findings[1].LineNumber) + } + if findings[2].ProviderName != "generic" { + t.Errorf("findings[2].ProviderName = %q, want generic", findings[2].ProviderName) + } +} + +func TestGitleaksImporter_CSV(t *testing.T) { + data := loadFixture(t, "gitleaks-sample.csv") + findings, err := (GitleaksCSVImporter{}).Import(bytes.NewReader(data)) + if err != nil { + t.Fatalf("Import: %v", err) + } + if len(findings) != 3 { + t.Fatalf("expected 3 findings, got %d", len(findings)) + } + if findings[0].ProviderName != "openai" { + t.Errorf("findings[0].ProviderName = %q, want openai", findings[0].ProviderName) + } + if findings[0].KeyValue != "sk-proj-1234567890abcdef1234" { + t.Errorf("findings[0].KeyValue = %q", findings[0].KeyValue) + } + if findings[0].Source != "config/app.yml" { + t.Errorf("findings[0].Source = %q", findings[0].Source) + } + if findings[0].LineNumber != 12 { + t.Errorf("findings[0].LineNumber = %d, want 12", findings[0].LineNumber) + } + if findings[1].ProviderName != "aws" { + t.Errorf("findings[1].ProviderName = %q, want aws", findings[1].ProviderName) + } + if findings[2].ProviderName != "generic" { + t.Errorf("findings[2].ProviderName = %q, want generic", findings[2].ProviderName) + } +} + +func TestGitleaksImporter_NormalizeRuleID(t *testing.T) { + cases := []struct{ in, out string }{ + {"openai-api-key", "openai"}, + {"aws-access-token", "aws"}, + {"anthropic-api-key", "anthropic"}, + {"generic-api-key", "generic"}, + {"github-pat", "github-pat"}, + {"Some-Secret", "some"}, + {"AWS-Access-Token", "aws"}, + } + for _, c := range cases { + got := normalizeGitleaksRuleID(c.in) + if got != c.out { + t.Errorf("normalizeGitleaksRuleID(%q) = %q, want %q", c.in, got, c.out) + } + } +} + +func TestGitleaksImporter_EmptyArray(t *testing.T) { + findings, err := (GitleaksImporter{}).Import(strings.NewReader("[]")) + if err != nil { + t.Fatalf("Import: %v", err) + } + if len(findings) != 0 { + t.Errorf("expected 0 findings, got %d", len(findings)) + } +} + +func TestGitleaksImporter_EmptyCSV(t *testing.T) { + header := "RuleID,Commit,File,SymlinkFile,Secret,Match,StartLine,EndLine,StartColumn,EndColumn,Author,Message,Date,Email,Fingerprint,Tags\n" + findings, err := (GitleaksCSVImporter{}).Import(strings.NewReader(header)) + if err != nil { + t.Fatalf("Import: %v", err) + } + if len(findings) != 0 { + t.Errorf("expected 0 findings, got %d", len(findings)) + } +} + +func TestGitleaksImporter_InvalidJSON(t *testing.T) { + _, err := (GitleaksImporter{}).Import(strings.NewReader("{not json")) + if err == nil { + t.Errorf("expected error for invalid JSON") + } +} + +func TestGitleaksImporter_SymlinkFallback(t *testing.T) { + jsonInput := `[{"RuleID":"openai-api-key","Secret":"sk-proj-1234567890abcdef1234","File":"","SymlinkFile":"link/config.yml","StartLine":1}]` + findings, err := (GitleaksImporter{}).Import(strings.NewReader(jsonInput)) + if err != nil { + t.Fatalf("Import: %v", err) + } + if len(findings) != 1 || findings[0].Source != "link/config.yml" { + t.Errorf("expected symlink fallback source, got %+v", findings) + } +} diff --git a/pkg/importer/testdata/gitleaks-sample.csv b/pkg/importer/testdata/gitleaks-sample.csv new file mode 100644 index 0000000..d61eaf3 --- /dev/null +++ b/pkg/importer/testdata/gitleaks-sample.csv @@ -0,0 +1,4 @@ +RuleID,Commit,File,SymlinkFile,Secret,Match,StartLine,EndLine,StartColumn,EndColumn,Author,Message,Date,Email,Fingerprint,Tags +openai-api-key,abc123,config/app.yml,,sk-proj-1234567890abcdef1234,key: sk-proj-1234567890abcdef1234,12,12,10,60,dev,add config,2026-04-01T12:00:00Z,dev@example.com,abc123:config/app.yml:openai-api-key:12,"key,openai" +aws-access-token,def456,terraform/main.tf,,AKIAIOSFODNN7EXAMPLE,access_key = AKIAIOSFODNN7EXAMPLE,55,55,20,40,ops,tf update,2026-04-02T09:30:00Z,ops@example.com,def456:terraform/main.tf:aws-access-token:55,"key,aws" +generic-api-key,ghi789,scripts/deploy.sh,,xoxp-abcdefghijklmnopqrstuvwxyz,TOKEN=xoxp-abcdefghijklmnopqrstuvwxyz,3,3,8,50,dev,deploy script,2026-04-03T15:45:00Z,dev@example.com,ghi789:scripts/deploy.sh:generic-api-key:3,"key,generic" diff --git a/pkg/importer/testdata/gitleaks-sample.json b/pkg/importer/testdata/gitleaks-sample.json new file mode 100644 index 0000000..0275143 --- /dev/null +++ b/pkg/importer/testdata/gitleaks-sample.json @@ -0,0 +1,62 @@ +[ + { + "Description": "OpenAI API Key", + "StartLine": 12, + "EndLine": 12, + "StartColumn": 10, + "EndColumn": 60, + "Match": "key: sk-proj-1234567890abcdef1234", + "Secret": "sk-proj-1234567890abcdef1234", + "File": "config/app.yml", + "SymlinkFile": "", + "Commit": "abc123", + "Entropy": 4.5, + "Author": "dev", + "Email": "dev@example.com", + "Date": "2026-04-01T12:00:00Z", + "Message": "add config", + "Tags": ["key", "openai"], + "RuleID": "openai-api-key", + "Fingerprint": "abc123:config/app.yml:openai-api-key:12" + }, + { + "Description": "AWS Access Token", + "StartLine": 55, + "EndLine": 55, + "StartColumn": 20, + "EndColumn": 40, + "Match": "access_key = AKIAIOSFODNN7EXAMPLE", + "Secret": "AKIAIOSFODNN7EXAMPLE", + "File": "terraform/main.tf", + "SymlinkFile": "", + "Commit": "def456", + "Entropy": 4.2, + "Author": "ops", + "Email": "ops@example.com", + "Date": "2026-04-02T09:30:00Z", + "Message": "tf update", + "Tags": ["key", "aws"], + "RuleID": "aws-access-token", + "Fingerprint": "def456:terraform/main.tf:aws-access-token:55" + }, + { + "Description": "Generic API Key", + "StartLine": 3, + "EndLine": 3, + "StartColumn": 8, + "EndColumn": 50, + "Match": "TOKEN=xoxp-abcdefghijklmnopqrstuvwxyz", + "Secret": "xoxp-abcdefghijklmnopqrstuvwxyz", + "File": "scripts/deploy.sh", + "SymlinkFile": "", + "Commit": "ghi789", + "Entropy": 3.8, + "Author": "dev", + "Email": "dev@example.com", + "Date": "2026-04-03T15:45:00Z", + "Message": "deploy script", + "Tags": ["key", "generic"], + "RuleID": "generic-api-key", + "Fingerprint": "ghi789:scripts/deploy.sh:generic-api-key:3" + } +]