feat(07-04): wire keyhunter import command with dedup and DB persist

- Replace import stub with cmd/import.go dispatching to pkg/importer
  (trufflehog, gitleaks, gitleaks-csv) via --format flag
- Reuse openDBWithKey helper so encryption + path resolution match scan/keys
- engineToStorage converts engine.Finding -> storage.Finding (Source -> SourcePath)
- Add pkg/storage.FindingExistsByKey for idempotent cross-import dedup
  keyed on (provider, masked key, source path, line number)
- cmd/import_test.go: selector table, field conversion, end-to-end trufflehog
  import with re-run duplicate assertion, unknown-format + missing-file errors
- pkg/storage queries_test: FindingExistsByKey hit and four miss cases

Delivers IMP-01/02/03 end-to-end.
This commit is contained in:
salvacybersec
2026-04-05 23:59:39 +03:00
parent b3db22ac93
commit 9dbb0b87d4
4 changed files with 349 additions and 0 deletions

132
cmd/import.go Normal file
View File

@@ -0,0 +1,132 @@
package cmd
import (
"fmt"
"os"
"time"
"github.com/spf13/cobra"
"github.com/salvacybersec/keyhunter/pkg/engine"
"github.com/salvacybersec/keyhunter/pkg/importer"
"github.com/salvacybersec/keyhunter/pkg/storage"
)
// importFormat holds the value of the --format flag for `keyhunter import`.
// It is bound to the flag in init and read by runImport. It is a package-level
// var so tests can assign it directly and reset it between runs.
var importFormat string
// importCmd is the cobra command behind `keyhunter import <file>`. It wires the
// pkg/importer adapters to the SQLite storage layer so users can consolidate
// external scanner output (TruffleHog, Gitleaks) into the unified KeyHunter
// database. It requires exactly one positional argument (the path of the
// scanner output file) and delegates all work to runImport.
// Delivers IMP-01/02/03 end-to-end.
var importCmd = &cobra.Command{
Use: "import <file>",
Short: "Import findings from TruffleHog or Gitleaks output",
Long: `Import scan output from external secret scanners into the KeyHunter database.
Supported formats:
trufflehog TruffleHog v3 JSON (trufflehog ... --json)
gitleaks Gitleaks native JSON (gitleaks detect -f json)
gitleaks-csv Gitleaks CSV (gitleaks detect -f csv)
Imports are idempotent: repeated invocations against the same file deduplicate
against both in-file duplicates and findings already persisted in the database.`,
// Exactly one argument: the file to import.
Args: cobra.ExactArgs(1),
RunE: runImport,
}
// init binds the --format flag to importFormat and marks it as required on
// importCmd.
func init() {
	flags := importCmd.Flags()
	flags.StringVar(&importFormat, "format", "", "input format: trufflehog | gitleaks | gitleaks-csv (required)")

	// The error is deliberately discarded: the flag was registered on the
	// line above, so marking it required is not expected to fail.
	_ = importCmd.MarkFlagRequired("format")
}
// runImport is the RunE handler for importCmd. It is a standalone function so
// tests can call it directly without going through cobra's argument parsing.
//
// Steps, in order:
//  1. resolve the importer named by the package-level importFormat;
//  2. open args[0] and parse it with that importer;
//  3. drop in-file duplicates via importer.Dedup;
//  4. open the database through openDBWithKey;
//  5. for every remaining finding, skip it if FindingExistsByKey reports a
//     match on (provider, masked key, source, line number), otherwise save it;
//  6. print a one-line summary to cmd's output writer.
//
// The first error encountered aborts the run and is returned; findings saved
// before that point are not rolled back by this function.
func runImport(cmd *cobra.Command, args []string) error {
	srcPath := args[0]

	imp, err := selectImporter(importFormat)
	if err != nil {
		return err
	}

	in, err := os.Open(srcPath)
	if err != nil {
		return fmt.Errorf("opening %s: %w", srcPath, err)
	}
	defer in.Close()

	parsed, err := imp.Import(in)
	if err != nil {
		return fmt.Errorf("parsing %s output: %w", imp.Name(), err)
	}
	unique, inFileDupes := importer.Dedup(parsed)

	db, encKey, err := openDBWithKey()
	if err != nil {
		return err
	}
	defer db.Close()

	var saved, alreadyStored int
	for _, item := range unique {
		seen, err := db.FindingExistsByKey(item.ProviderName, item.KeyMasked, item.Source, item.LineNumber)
		switch {
		case err != nil:
			return fmt.Errorf("checking existing findings: %w", err)
		case seen:
			// Already persisted by an earlier import; count it and move on.
			alreadyStored++
			continue
		}

		if _, err := db.SaveFinding(engineToStorage(item), encKey); err != nil {
			return fmt.Errorf("saving finding: %w", err)
		}
		saved++
	}

	// The reported total is the raw parsed count, before any deduplication.
	fmt.Fprintf(cmd.OutOrStdout(), "Imported %d findings (%d new, %d duplicates)\n",
		len(parsed), saved, inFileDupes+alreadyStored)
	return nil
}
// selectImporter maps a --format value to the matching importer.Importer.
// Recognised values are "trufflehog", "gitleaks" and "gitleaks-csv"; anything
// else (including the empty string) yields a nil importer and an error that
// lists the accepted formats.
func selectImporter(format string) (importer.Importer, error) {
	var chosen importer.Importer

	switch format {
	case "trufflehog":
		chosen = importer.TruffleHogImporter{}
	case "gitleaks":
		chosen = importer.GitleaksImporter{}
	case "gitleaks-csv":
		chosen = importer.GitleaksCSVImporter{}
	default:
		return nil, fmt.Errorf("unknown format %q (want trufflehog | gitleaks | gitleaks-csv)", format)
	}

	return chosen, nil
}
// engineToStorage converts an engine.Finding (the shape importers produce)
// into a storage.Finding suitable for db.SaveFinding. Fields are copied
// one-to-one; the only rename is engine's Source becoming storage's
// SourcePath.
//
// NOTE(review): DetectedAt is defaulted to time.Now() on the local copy when
// it is zero, but it is not copied into the returned storage.Finding, so the
// default currently has no visible effect on the result. Confirm which
// storage field (if any) is meant to carry the timestamp.
func engineToStorage(f engine.Finding) storage.Finding {
	if f.DetectedAt.IsZero() {
		f.DetectedAt = time.Now()
	}

	var out storage.Finding

	// Identity of the finding.
	out.ProviderName = f.ProviderName
	out.KeyValue = f.KeyValue
	out.KeyMasked = f.KeyMasked
	out.Confidence = f.Confidence

	// Where it was found.
	out.SourcePath = f.Source
	out.SourceType = f.SourceType
	out.LineNumber = f.LineNumber

	// Verification results.
	out.Verified = f.Verified
	out.VerifyStatus = f.VerifyStatus
	out.VerifyHTTPCode = f.VerifyHTTPCode
	out.VerifyMetadata = f.VerifyMetadata

	return out
}

156
cmd/import_test.go Normal file
View File

@@ -0,0 +1,156 @@
package cmd
import (
"bytes"
"os"
"path/filepath"
"testing"
"github.com/salvacybersec/keyhunter/pkg/engine"
"github.com/salvacybersec/keyhunter/pkg/importer"
"github.com/salvacybersec/keyhunter/pkg/storage"
"github.com/spf13/viper"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestSelectImporter checks that every supported --format value resolves to
// the importer with the matching Name() and concrete type, and that unknown
// or empty formats are rejected with an error.
func TestSelectImporter(t *testing.T) {
	type selectorCase struct {
		format  string
		want    string
		wantErr bool
	}

	table := []selectorCase{
		{format: "trufflehog", want: "trufflehog"},
		{format: "gitleaks", want: "gitleaks"},
		{format: "gitleaks-csv", want: "gitleaks-csv"},
		{format: "bogus", wantErr: true},
		{format: "", wantErr: true},
	}

	for _, tc := range table {
		t.Run(tc.format, func(t *testing.T) {
			got, err := selectImporter(tc.format)
			if tc.wantErr {
				assert.Error(t, err)
				return
			}
			require.NoError(t, err)
			assert.Equal(t, tc.want, got.Name())

			// Beyond the reported name, make sure the concrete type behind
			// the interface is the expected adapter.
			var ok bool
			switch tc.format {
			case "trufflehog":
				_, ok = got.(importer.TruffleHogImporter)
				assert.True(t, ok, "expected TruffleHogImporter")
			case "gitleaks":
				_, ok = got.(importer.GitleaksImporter)
				assert.True(t, ok, "expected GitleaksImporter")
			case "gitleaks-csv":
				_, ok = got.(importer.GitleaksCSVImporter)
				assert.True(t, ok, "expected GitleaksCSVImporter")
			}
		})
	}
}
// TestEngineToStorage verifies that engineToStorage carries every populated
// engine.Finding field over to storage.Finding, including the
// Source -> SourcePath rename.
func TestEngineToStorage(t *testing.T) {
	var in engine.Finding
	in.ProviderName = "openai"
	in.KeyValue = "sk-abcdefghijklmnop"
	in.KeyMasked = "sk-abcde...mnop"
	in.Confidence = "high"
	in.Source = "a.yml"
	in.SourceType = "import:trufflehog"
	in.LineNumber = 5
	in.Verified = true
	in.VerifyStatus = "live"
	in.VerifyHTTPCode = 200
	in.VerifyMetadata = map[string]string{"org": "acme"}

	got := engineToStorage(in)

	// Identity fields.
	assert.Equal(t, "openai", got.ProviderName)
	assert.Equal(t, "sk-abcdefghijklmnop", got.KeyValue)
	assert.Equal(t, "sk-abcde...mnop", got.KeyMasked)
	assert.Equal(t, "high", got.Confidence)

	// Location fields.
	assert.Equal(t, "a.yml", got.SourcePath, "engine.Source -> storage.SourcePath")
	assert.Equal(t, "import:trufflehog", got.SourceType)
	assert.Equal(t, 5, got.LineNumber)

	// Verification fields.
	assert.True(t, got.Verified)
	assert.Equal(t, "live", got.VerifyStatus)
	assert.Equal(t, 200, got.VerifyHTTPCode)
	assert.Equal(t, map[string]string{"org": "acme"}, got.VerifyMetadata)
}
// seedImportDB points the command at a throwaway database for one test and
// returns its path. It resets viper, sets "database.path" to a file inside
// t.TempDir(), and sets KEYHUNTER_PASSPHRASE to "test-pass" so that
// runImport -> openDBWithKey resolves to that database. On test cleanup viper
// is reset again and importFormat is cleared.
func seedImportDB(t *testing.T) string {
	t.Helper()

	path := filepath.Join(t.TempDir(), "import.db")

	viper.Reset()
	viper.Set("database.path", path)
	t.Setenv("KEYHUNTER_PASSPHRASE", "test-pass")

	restore := func() {
		viper.Reset()
		importFormat = ""
	}
	t.Cleanup(restore)

	return path
}
// TestRunImport_TruffleHogEndToEnd runs the trufflehog import twice against
// the sample file shipped with pkg/importer. The first run must report all
// three findings as new and leave them readable from the database; the second
// run must report all three as duplicates.
func TestRunImport_TruffleHogEndToEnd(t *testing.T) {
	dbPath := seedImportDB(t)

	// Use the canonical testdata shipped by pkg/importer.
	importFormat = "trufflehog"
	samplePath, err := filepath.Abs(filepath.Join("..", "pkg", "importer", "testdata", "trufflehog-sample.json"))
	require.NoError(t, err)
	_, err = os.Stat(samplePath)
	require.NoError(t, err, "testdata trufflehog-sample.json must exist")

	var out bytes.Buffer
	importCmd.SetOut(&out)
	importCmd.SetErr(&out)
	// importCmd is package-level state shared with every other test in this
	// package; restore its default writers so later tests do not keep writing
	// into this test's buffer.
	t.Cleanup(func() {
		importCmd.SetOut(nil)
		importCmd.SetErr(nil)
	})

	// First import: all findings are new.
	err = runImport(importCmd, []string{samplePath})
	require.NoError(t, err)
	first := out.String()
	assert.Contains(t, first, "Imported 3 findings")
	assert.Contains(t, first, "3 new")
	assert.Contains(t, first, "0 duplicates")

	// Confirm findings landed in the database. The handle is scoped to this
	// closure so it is closed before the second import runs, and also when a
	// require assertion aborts the test part-way through.
	func() {
		db, err := storage.Open(dbPath)
		require.NoError(t, err)
		defer func() { assert.NoError(t, db.Close()) }()

		encKey, err := loadOrCreateEncKey(db, "test-pass")
		require.NoError(t, err)
		stored, err := db.ListFindings(encKey)
		require.NoError(t, err)
		assert.GreaterOrEqual(t, len(stored), 3)
	}()

	// Second import of the same file: everything should now be a duplicate.
	out.Reset()
	err = runImport(importCmd, []string{samplePath})
	require.NoError(t, err)
	second := out.String()
	assert.Contains(t, second, "Imported 3 findings")
	assert.Contains(t, second, "0 new")
	assert.Contains(t, second, "3 duplicates")
}
// TestRunImport_UnknownFormat checks that runImport fails with an
// "unknown format" error when importFormat names no supported importer.
func TestRunImport_UnknownFormat(t *testing.T) {
	seedImportDB(t)
	importFormat = "bogus"

	args := []string{"/nonexistent"}
	gotErr := runImport(importCmd, args)

	require.Error(t, gotErr)
	assert.Contains(t, gotErr.Error(), "unknown format")
}
// TestRunImport_MissingFile checks that runImport reports an "opening" error
// when the input path does not exist.
func TestRunImport_MissingFile(t *testing.T) {
	seedImportDB(t)
	importFormat = "trufflehog"

	missing := filepath.Join(t.TempDir(), "does-not-exist.json")
	gotErr := runImport(importCmd, []string{missing})

	require.Error(t, gotErr)
	assert.Contains(t, gotErr.Error(), "opening")
}