feat(07-02): add Gitleaks JSON + CSV importers

- GitleaksImporter parses native JSON array output to []engine.Finding
- GitleaksCSVImporter parses CSV with header-based column resolution
- normalizeGitleaksRuleID strips suffixes (-api-key, -access-token, ...)
- Shared buildGitleaksFinding helper keeps JSON/CSV paths in lockstep
- Test fixtures + 8 tests covering happy path, empty, invalid, symlink fallback
This commit is contained in:
salvacybersec
2026-04-05 23:55:36 +03:00
parent 46eec328d2
commit 83640ac200
4 changed files with 378 additions and 0 deletions

153
pkg/importer/gitleaks.go Normal file
View File

@@ -0,0 +1,153 @@
package importer
import (
"encoding/csv"
"encoding/json"
"fmt"
"io"
"strconv"
"strings"
"time"
"github.com/salvacybersec/keyhunter/pkg/engine"
)
// GitleaksImporter parses Gitleaks native JSON output (an array of finding
// records) and normalizes each record into an engine.Finding.
//
// The type carries no state and its methods use value receivers, so the zero
// value GitleaksImporter{} is ready to use. Its Name method reports
// "gitleaks".
type GitleaksImporter struct{}
// GitleaksCSVImporter parses Gitleaks CSV output with a mandatory header row.
// Columns are resolved by header name so Gitleaks version drift in column
// order does not break ingestion.
//
// The type carries no state and its methods use value receivers, so the zero
// value GitleaksCSVImporter{} is ready to use. Its Name method reports
// "gitleaks-csv".
type GitleaksCSVImporter struct{}
// gitleaksRecord mirrors the JSON object emitted by `gitleaks detect -f json`.
//
// Every field is decoded by its capitalized JSON key. Only RuleID, Secret,
// File, SymlinkFile and StartLine are currently consumed by
// GitleaksImporter.Import; the remaining fields are decoded but not used when
// building an engine.Finding.
type gitleaksRecord struct {
Description string `json:"Description"`
// StartLine is copied into the finding's LineNumber.
StartLine int `json:"StartLine"`
EndLine int `json:"EndLine"`
StartColumn int `json:"StartColumn"`
EndColumn int `json:"EndColumn"`
Match string `json:"Match"`
// Secret is imported as the finding's raw key value and is also masked
// for display.
Secret string `json:"Secret"`
// File is the preferred source path; SymlinkFile is used only when File
// is empty.
File string `json:"File"`
SymlinkFile string `json:"SymlinkFile"`
Commit string `json:"Commit"`
Entropy float64 `json:"Entropy"`
Author string `json:"Author"`
Email string `json:"Email"`
Date string `json:"Date"`
Message string `json:"Message"`
Tags []string `json:"Tags"`
// RuleID is normalized into the finding's provider name.
RuleID string `json:"RuleID"`
Fingerprint string `json:"Fingerprint"`
}
// Name reports the identifier that selects this importer through the CLI
// --format flag.
func (GitleaksImporter) Name() string {
	const formatName = "gitleaks"
	return formatName
}
// Import reads one Gitleaks JSON array from r and converts every record into
// an engine.Finding, preserving input order. A decode failure is returned
// wrapped with a "gitleaks: decode json" prefix and no findings; an input
// that decodes to zero records yields (nil, nil).
func (GitleaksImporter) Import(r io.Reader) ([]engine.Finding, error) {
	var parsed []gitleaksRecord
	if err := json.NewDecoder(r).Decode(&parsed); err != nil {
		return nil, fmt.Errorf("gitleaks: decode json: %w", err)
	}

	total := len(parsed)
	if total == 0 {
		return nil, nil
	}

	// Fill by index so the (fairly wide) record struct is not copied on
	// every iteration.
	out := make([]engine.Finding, total)
	for i := range parsed {
		rec := &parsed[i]
		out[i] = buildGitleaksFinding(rec.RuleID, rec.Secret, rec.File, rec.SymlinkFile, rec.StartLine)
	}
	return out, nil
}
// Name reports the identifier that selects this importer through the CLI
// --format flag.
func (GitleaksCSVImporter) Name() string {
	const formatName = "gitleaks-csv"
	return formatName
}
// Import decodes Gitleaks CSV output with a mandatory header row. Columns are
// resolved by header name, so column order does not matter; a column that is
// absent from the header, or missing from a short row, reads as "".
//
// Header names are trimmed of surrounding whitespace, and a leading UTF-8
// byte-order mark is removed from the first header cell. encoding/csv does
// not strip a BOM itself, so without this the first column (RuleID in the
// standard Gitleaks layout) of a BOM-prefixed export would never match and
// every finding would silently lose its provider name.
//
// A completely empty input or a header-only input returns (nil, nil). A CSV
// syntax error is returned wrapped with a "gitleaks-csv:" prefix and no
// findings.
func (GitleaksCSVImporter) Import(r io.Reader) ([]engine.Finding, error) {
	reader := csv.NewReader(r)
	reader.FieldsPerRecord = -1 // tolerate ragged rows

	header, err := reader.Read()
	if err == io.EOF {
		return nil, nil
	}
	if err != nil {
		return nil, fmt.Errorf("gitleaks-csv: read header: %w", err)
	}

	// Map each header name to its column position.
	index := make(map[string]int, len(header))
	for i, col := range header {
		if i == 0 {
			// U+FEFF is not whitespace, so TrimSpace alone would keep it.
			col = strings.TrimPrefix(col, "\ufeff")
		}
		index[strings.TrimSpace(col)] = i
	}

	// get returns the named column of row, or "" when the column is unknown
	// or the row is too short to contain it.
	get := func(row []string, name string) string {
		i, ok := index[name]
		if !ok || i >= len(row) {
			return ""
		}
		return row[i]
	}

	var findings []engine.Finding
	for {
		row, err := reader.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, fmt.Errorf("gitleaks-csv: read row: %w", err)
		}

		// Best effort: a missing or non-numeric StartLine leaves the line
		// number at 0 instead of rejecting the whole import.
		startLine, _ := strconv.Atoi(get(row, "StartLine"))

		findings = append(findings, buildGitleaksFinding(
			get(row, "RuleID"),
			get(row, "Secret"),
			get(row, "File"),
			get(row, "SymlinkFile"),
			startLine,
		))
	}
	return findings, nil
}
// normalizeGitleaksRuleID converts a Gitleaks rule identifier into a short
// KeyHunter-style provider name. The input is trimmed and lowercased, and then
// at most one well-known trailing token ("-api-key", "-access-token",
// "-secret-key", "-secret", "-token", "-key") is removed. Identifiers that end
// in none of these tokens are returned trimmed and lowercased.
func normalizeGitleaksRuleID(id string) string {
	normalized := strings.TrimSpace(id)
	normalized = strings.ToLower(normalized)

	// Longer, more specific tokens are listed first so that, for example,
	// "-api-key" is preferred over the shorter "-key".
	tails := [...]string{"-api-key", "-access-token", "-secret-key", "-secret", "-token", "-key"}
	for i := 0; i < len(tails); i++ {
		cut := len(normalized) - len(tails[i])
		if cut >= 0 && normalized[cut:] == tails[i] {
			return normalized[:cut]
		}
	}
	return normalized
}
// buildGitleaksFinding turns the fields shared by the JSON and CSV Gitleaks
// formats into an engine.Finding, so both importers produce identical results
// for identical data.
//
// The source path is file, or symlink when file is empty. Imported findings
// are always stamped with "medium" confidence, the "import:gitleaks" source
// type, the current time as detection time, and an unverified status.
func buildGitleaksFinding(ruleID, secret, file, symlink string, startLine int) engine.Finding {
	origin := symlink
	if file != "" {
		origin = file
	}

	provider := normalizeGitleaksRuleID(ruleID)
	masked := engine.MaskKey(secret)

	return engine.Finding{
		ProviderName: provider,
		KeyValue:     secret,
		KeyMasked:    masked,
		Confidence:   "medium",
		Source:       origin,
		SourceType:   "import:gitleaks",
		LineNumber:   startLine,
		DetectedAt:   time.Now(),
		Verified:     false,
		VerifyStatus: "unverified",
	}
}

View File

@@ -0,0 +1,159 @@
package importer
import (
"bytes"
"os"
"strings"
"testing"
)
// loadFixture returns the contents of testdata/<name>, failing the calling
// test immediately when the file cannot be read.
func loadFixture(t *testing.T, name string) []byte {
	t.Helper()

	contents, readErr := os.ReadFile("testdata/" + name)
	if readErr == nil {
		return contents
	}

	t.Fatalf("read fixture %s: %v", name, readErr)
	return nil // unreachable: Fatalf stops the test
}
// TestGitleaksImporter_Name pins the --format identifiers of both importers.
func TestGitleaksImporter_Name(t *testing.T) {
	if got := (GitleaksImporter{}).Name(); got != "gitleaks" {
		t.Errorf("GitleaksImporter.Name() = %q, want %q", got, "gitleaks")
	}
	if got := (GitleaksCSVImporter{}).Name(); got != "gitleaks-csv" {
		t.Errorf("GitleaksCSVImporter.Name() = %q, want %q", got, "gitleaks-csv")
	}
}
// TestGitleaksImporter_JSON imports the JSON fixture and checks every
// normalized field of the first finding plus the key fields of the other two.
func TestGitleaksImporter_JSON(t *testing.T) {
	fixture := loadFixture(t, "gitleaks-sample.json")
	findings, err := (GitleaksImporter{}).Import(bytes.NewReader(fixture))
	if err != nil {
		t.Fatalf("Import: %v", err)
	}
	if n := len(findings); n != 3 {
		t.Fatalf("expected 3 findings, got %d", n)
	}
	first, second, third := findings[0], findings[1], findings[2]

	// First record: full field-by-field verification.
	if got := first.ProviderName; got != "openai" {
		t.Errorf("findings[0].ProviderName = %q, want %q", got, "openai")
	}
	if got := first.KeyValue; got != "sk-proj-1234567890abcdef1234" {
		t.Errorf("findings[0].KeyValue mismatch: %q", got)
	}
	if got := first.Source; got != "config/app.yml" {
		t.Errorf("findings[0].Source = %q", got)
	}
	if got := first.LineNumber; got != 12 {
		t.Errorf("findings[0].LineNumber = %d, want 12", got)
	}
	if got := first.SourceType; got != "import:gitleaks" {
		t.Errorf("findings[0].SourceType = %q", got)
	}
	if got := first.Confidence; got != "medium" {
		t.Errorf("findings[0].Confidence = %q, want medium", got)
	}
	if got := first.VerifyStatus; got != "unverified" {
		t.Errorf("findings[0].VerifyStatus = %q, want unverified", got)
	}
	if first.Verified {
		t.Errorf("findings[0].Verified should be false")
	}
	if first.KeyMasked == "" {
		t.Errorf("findings[0].KeyMasked should be set")
	}

	// Remaining records: spot checks only.
	if got := second.ProviderName; got != "aws" {
		t.Errorf("findings[1].ProviderName = %q, want aws", got)
	}
	if got := second.LineNumber; got != 55 {
		t.Errorf("findings[1].LineNumber = %d, want 55", got)
	}
	if got := third.ProviderName; got != "generic" {
		t.Errorf("findings[2].ProviderName = %q, want generic", got)
	}
}
// TestGitleaksImporter_CSV imports the CSV fixture and checks that it yields
// the same provider names, secret, source and line number as the JSON path.
func TestGitleaksImporter_CSV(t *testing.T) {
	fixture := loadFixture(t, "gitleaks-sample.csv")
	findings, err := (GitleaksCSVImporter{}).Import(bytes.NewReader(fixture))
	if err != nil {
		t.Fatalf("Import: %v", err)
	}
	if n := len(findings); n != 3 {
		t.Fatalf("expected 3 findings, got %d", n)
	}
	first, second, third := findings[0], findings[1], findings[2]

	if got := first.ProviderName; got != "openai" {
		t.Errorf("findings[0].ProviderName = %q, want openai", got)
	}
	if got := first.KeyValue; got != "sk-proj-1234567890abcdef1234" {
		t.Errorf("findings[0].KeyValue = %q", got)
	}
	if got := first.Source; got != "config/app.yml" {
		t.Errorf("findings[0].Source = %q", got)
	}
	if got := first.LineNumber; got != 12 {
		t.Errorf("findings[0].LineNumber = %d, want 12", got)
	}
	if got := second.ProviderName; got != "aws" {
		t.Errorf("findings[1].ProviderName = %q, want aws", got)
	}
	if got := third.ProviderName; got != "generic" {
		t.Errorf("findings[2].ProviderName = %q, want generic", got)
	}
}
// TestGitleaksImporter_NormalizeRuleID runs a table of rule IDs through
// normalizeGitleaksRuleID, covering suffix stripping, pass-through of
// unrecognized IDs, and case folding.
func TestGitleaksImporter_NormalizeRuleID(t *testing.T) {
	table := []struct {
		ruleID   string
		provider string
	}{
		{ruleID: "openai-api-key", provider: "openai"},
		{ruleID: "aws-access-token", provider: "aws"},
		{ruleID: "anthropic-api-key", provider: "anthropic"},
		{ruleID: "generic-api-key", provider: "generic"},
		{ruleID: "github-pat", provider: "github-pat"},
		{ruleID: "Some-Secret", provider: "some"},
		{ruleID: "AWS-Access-Token", provider: "aws"},
	}
	for i := range table {
		entry := table[i]
		if got := normalizeGitleaksRuleID(entry.ruleID); got != entry.provider {
			t.Errorf("normalizeGitleaksRuleID(%q) = %q, want %q", entry.ruleID, got, entry.provider)
		}
	}
}
// TestGitleaksImporter_EmptyArray checks that an empty JSON array imports
// without error and produces no findings.
func TestGitleaksImporter_EmptyArray(t *testing.T) {
	input := strings.NewReader("[]")
	findings, err := (GitleaksImporter{}).Import(input)
	if err != nil {
		t.Fatalf("Import: %v", err)
	}
	if n := len(findings); n != 0 {
		t.Errorf("expected 0 findings, got %d", n)
	}
}
// TestGitleaksImporter_EmptyCSV checks that a CSV consisting of only the
// header row imports without error and produces no findings.
func TestGitleaksImporter_EmptyCSV(t *testing.T) {
	const headerOnly = "RuleID,Commit,File,SymlinkFile,Secret,Match,StartLine,EndLine,StartColumn,EndColumn,Author,Message,Date,Email,Fingerprint,Tags\n"

	findings, err := (GitleaksCSVImporter{}).Import(strings.NewReader(headerOnly))
	if err != nil {
		t.Fatalf("Import: %v", err)
	}
	if n := len(findings); n != 0 {
		t.Errorf("expected 0 findings, got %d", n)
	}
}
// TestGitleaksImporter_InvalidJSON checks that malformed JSON is reported as
// an error rather than silently yielding no findings.
func TestGitleaksImporter_InvalidJSON(t *testing.T) {
	malformed := strings.NewReader("{not json")
	if _, err := (GitleaksImporter{}).Import(malformed); err == nil {
		t.Errorf("expected error for invalid JSON")
	}
}
// TestGitleaksImporter_SymlinkFallback checks that a record with an empty
// File takes its source path from SymlinkFile instead.
func TestGitleaksImporter_SymlinkFallback(t *testing.T) {
	const record = `[{"RuleID":"openai-api-key","Secret":"sk-proj-1234567890abcdef1234","File":"","SymlinkFile":"link/config.yml","StartLine":1}]`

	findings, err := (GitleaksImporter{}).Import(strings.NewReader(record))
	if err != nil {
		t.Fatalf("Import: %v", err)
	}
	if len(findings) == 1 && findings[0].Source == "link/config.yml" {
		return
	}
	t.Errorf("expected symlink fallback source, got %+v", findings)
}

View File

@@ -0,0 +1,4 @@
RuleID,Commit,File,SymlinkFile,Secret,Match,StartLine,EndLine,StartColumn,EndColumn,Author,Message,Date,Email,Fingerprint,Tags
openai-api-key,abc123,config/app.yml,,sk-proj-1234567890abcdef1234,key: sk-proj-1234567890abcdef1234,12,12,10,60,dev,add config,2026-04-01T12:00:00Z,dev@example.com,abc123:config/app.yml:openai-api-key:12,"key,openai"
aws-access-token,def456,terraform/main.tf,,AKIAIOSFODNN7EXAMPLE,access_key = AKIAIOSFODNN7EXAMPLE,55,55,20,40,ops,tf update,2026-04-02T09:30:00Z,ops@example.com,def456:terraform/main.tf:aws-access-token:55,"key,aws"
generic-api-key,ghi789,scripts/deploy.sh,,xoxp-abcdefghijklmnopqrstuvwxyz,TOKEN=xoxp-abcdefghijklmnopqrstuvwxyz,3,3,8,50,dev,deploy script,2026-04-03T15:45:00Z,dev@example.com,ghi789:scripts/deploy.sh:generic-api-key:3,"key,generic"
1 RuleID Commit File SymlinkFile Secret Match StartLine EndLine StartColumn EndColumn Author Message Date Email Fingerprint Tags
2 openai-api-key abc123 config/app.yml sk-proj-1234567890abcdef1234 key: sk-proj-1234567890abcdef1234 12 12 10 60 dev add config 2026-04-01T12:00:00Z dev@example.com abc123:config/app.yml:openai-api-key:12 key,openai
3 aws-access-token def456 terraform/main.tf AKIAIOSFODNN7EXAMPLE access_key = AKIAIOSFODNN7EXAMPLE 55 55 20 40 ops tf update 2026-04-02T09:30:00Z ops@example.com def456:terraform/main.tf:aws-access-token:55 key,aws
4 generic-api-key ghi789 scripts/deploy.sh xoxp-abcdefghijklmnopqrstuvwxyz TOKEN=xoxp-abcdefghijklmnopqrstuvwxyz 3 3 8 50 dev deploy script 2026-04-03T15:45:00Z dev@example.com ghi789:scripts/deploy.sh:generic-api-key:3 key,generic

View File

@@ -0,0 +1,62 @@
[
{
"Description": "OpenAI API Key",
"StartLine": 12,
"EndLine": 12,
"StartColumn": 10,
"EndColumn": 60,
"Match": "key: sk-proj-1234567890abcdef1234",
"Secret": "sk-proj-1234567890abcdef1234",
"File": "config/app.yml",
"SymlinkFile": "",
"Commit": "abc123",
"Entropy": 4.5,
"Author": "dev",
"Email": "dev@example.com",
"Date": "2026-04-01T12:00:00Z",
"Message": "add config",
"Tags": ["key", "openai"],
"RuleID": "openai-api-key",
"Fingerprint": "abc123:config/app.yml:openai-api-key:12"
},
{
"Description": "AWS Access Token",
"StartLine": 55,
"EndLine": 55,
"StartColumn": 20,
"EndColumn": 40,
"Match": "access_key = AKIAIOSFODNN7EXAMPLE",
"Secret": "AKIAIOSFODNN7EXAMPLE",
"File": "terraform/main.tf",
"SymlinkFile": "",
"Commit": "def456",
"Entropy": 4.2,
"Author": "ops",
"Email": "ops@example.com",
"Date": "2026-04-02T09:30:00Z",
"Message": "tf update",
"Tags": ["key", "aws"],
"RuleID": "aws-access-token",
"Fingerprint": "def456:terraform/main.tf:aws-access-token:55"
},
{
"Description": "Generic API Key",
"StartLine": 3,
"EndLine": 3,
"StartColumn": 8,
"EndColumn": 50,
"Match": "TOKEN=xoxp-abcdefghijklmnopqrstuvwxyz",
"Secret": "xoxp-abcdefghijklmnopqrstuvwxyz",
"File": "scripts/deploy.sh",
"SymlinkFile": "",
"Commit": "ghi789",
"Entropy": 3.8,
"Author": "dev",
"Email": "dev@example.com",
"Date": "2026-04-03T15:45:00Z",
"Message": "deploy script",
"Tags": ["key", "generic"],
"RuleID": "generic-api-key",
"Fingerprint": "ghi789:scripts/deploy.sh:generic-api-key:3"
}
]