feat(07-04): wire keyhunter import command with dedup and DB persist
- Replace import stub with cmd/import.go dispatching to pkg/importer (trufflehog, gitleaks, gitleaks-csv) via --format flag
- Reuse openDBWithKey helper so encryption + path resolution match scan/keys
- engineToStorage converts engine.Finding -> storage.Finding (Source -> SourcePath)
- Add pkg/storage.FindingExistsByKey for idempotent cross-import dedup keyed on (provider, masked key, source path, line number)
- cmd/import_test.go: selector table, field conversion, end-to-end trufflehog import with re-run duplicate assertion, unknown-format + missing-file errors
- pkg/storage queries_test: FindingExistsByKey hit and four miss cases

Delivers IMP-01/02/03 end-to-end.
This commit is contained in:
132
cmd/import.go
Normal file
132
cmd/import.go
Normal file
@@ -0,0 +1,132 @@
|
|||||||
|
package cmd
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/spf13/cobra"
|
||||||
|
|
||||||
|
"github.com/salvacybersec/keyhunter/pkg/engine"
|
||||||
|
"github.com/salvacybersec/keyhunter/pkg/importer"
|
||||||
|
"github.com/salvacybersec/keyhunter/pkg/storage"
|
||||||
|
)
|
||||||
|
|
||||||
|
// importFormat holds the --format flag value for `keyhunter import`. It is
|
||||||
|
// a package-level var so tests can reset it between runs.
|
||||||
|
var importFormat string
|
||||||
|
|
||||||
|
// importCmd wires the pkg/importer adapters to the SQLite storage layer so
|
||||||
|
// users can consolidate external scanner output (TruffleHog, Gitleaks) into
|
||||||
|
// the unified KeyHunter database. Delivers IMP-01/02/03 end-to-end.
|
||||||
|
var importCmd = &cobra.Command{
|
||||||
|
Use: "import <file>",
|
||||||
|
Short: "Import findings from TruffleHog or Gitleaks output",
|
||||||
|
Long: `Import scan output from external secret scanners into the KeyHunter database.
|
||||||
|
|
||||||
|
Supported formats:
|
||||||
|
trufflehog TruffleHog v3 JSON (trufflehog ... --json)
|
||||||
|
gitleaks Gitleaks native JSON (gitleaks detect -f json)
|
||||||
|
gitleaks-csv Gitleaks CSV (gitleaks detect -f csv)
|
||||||
|
|
||||||
|
Imports are idempotent: repeated invocations against the same file deduplicate
|
||||||
|
against both in-file duplicates and findings already persisted in the database.`,
|
||||||
|
Args: cobra.ExactArgs(1),
|
||||||
|
RunE: runImport,
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
importCmd.Flags().StringVar(&importFormat, "format", "", "input format: trufflehog | gitleaks | gitleaks-csv (required)")
|
||||||
|
_ = importCmd.MarkFlagRequired("format")
|
||||||
|
}
|
||||||
|
|
||||||
|
// runImport is the RunE for importCmd. Extracted so tests can exercise it
|
||||||
|
// without going through cobra's full argument parsing stack.
|
||||||
|
func runImport(cmd *cobra.Command, args []string) error {
|
||||||
|
path := args[0]
|
||||||
|
|
||||||
|
imp, err := selectImporter(importFormat)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
f, err := os.Open(path)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("opening %s: %w", path, err)
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
findings, err := imp.Import(f)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("parsing %s output: %w", imp.Name(), err)
|
||||||
|
}
|
||||||
|
|
||||||
|
total := len(findings)
|
||||||
|
unique, inFileDupes := importer.Dedup(findings)
|
||||||
|
|
||||||
|
db, encKey, err := openDBWithKey()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer db.Close()
|
||||||
|
|
||||||
|
newCount := 0
|
||||||
|
dbDupes := 0
|
||||||
|
for _, finding := range unique {
|
||||||
|
exists, err := db.FindingExistsByKey(finding.ProviderName, finding.KeyMasked, finding.Source, finding.LineNumber)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("checking existing findings: %w", err)
|
||||||
|
}
|
||||||
|
if exists {
|
||||||
|
dbDupes++
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sf := engineToStorage(finding)
|
||||||
|
if _, err := db.SaveFinding(sf, encKey); err != nil {
|
||||||
|
return fmt.Errorf("saving finding: %w", err)
|
||||||
|
}
|
||||||
|
newCount++
|
||||||
|
}
|
||||||
|
|
||||||
|
totalDupes := inFileDupes + dbDupes
|
||||||
|
fmt.Fprintf(cmd.OutOrStdout(), "Imported %d findings (%d new, %d duplicates)\n", total, newCount, totalDupes)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// selectImporter resolves the --format flag to a concrete Importer.
|
||||||
|
func selectImporter(format string) (importer.Importer, error) {
|
||||||
|
switch format {
|
||||||
|
case "trufflehog":
|
||||||
|
return importer.TruffleHogImporter{}, nil
|
||||||
|
case "gitleaks":
|
||||||
|
return importer.GitleaksImporter{}, nil
|
||||||
|
case "gitleaks-csv":
|
||||||
|
return importer.GitleaksCSVImporter{}, nil
|
||||||
|
default:
|
||||||
|
return nil, fmt.Errorf("unknown format %q (want trufflehog | gitleaks | gitleaks-csv)", format)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// engineToStorage converts an engine.Finding (importer output shape) into a
|
||||||
|
// storage.Finding suitable for db.SaveFinding. The field name difference
|
||||||
|
// between engine.Source and storage.SourcePath is the main reason this
|
||||||
|
// conversion exists. DetectedAt is defaulted to now if unset so imported
|
||||||
|
// records always carry a timestamp.
|
||||||
|
func engineToStorage(f engine.Finding) storage.Finding {
|
||||||
|
if f.DetectedAt.IsZero() {
|
||||||
|
f.DetectedAt = time.Now()
|
||||||
|
}
|
||||||
|
return storage.Finding{
|
||||||
|
ProviderName: f.ProviderName,
|
||||||
|
KeyValue: f.KeyValue,
|
||||||
|
KeyMasked: f.KeyMasked,
|
||||||
|
Confidence: f.Confidence,
|
||||||
|
SourcePath: f.Source,
|
||||||
|
SourceType: f.SourceType,
|
||||||
|
LineNumber: f.LineNumber,
|
||||||
|
Verified: f.Verified,
|
||||||
|
VerifyStatus: f.VerifyStatus,
|
||||||
|
VerifyHTTPCode: f.VerifyHTTPCode,
|
||||||
|
VerifyMetadata: f.VerifyMetadata,
|
||||||
|
}
|
||||||
|
}
|
||||||
156
cmd/import_test.go
Normal file
156
cmd/import_test.go
Normal file
@@ -0,0 +1,156 @@
|
|||||||
|
package cmd
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/salvacybersec/keyhunter/pkg/engine"
|
||||||
|
"github.com/salvacybersec/keyhunter/pkg/importer"
|
||||||
|
"github.com/salvacybersec/keyhunter/pkg/storage"
|
||||||
|
"github.com/spf13/viper"
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestSelectImporter(t *testing.T) {
|
||||||
|
cases := []struct {
|
||||||
|
format string
|
||||||
|
want string
|
||||||
|
wantErr bool
|
||||||
|
}{
|
||||||
|
{"trufflehog", "trufflehog", false},
|
||||||
|
{"gitleaks", "gitleaks", false},
|
||||||
|
{"gitleaks-csv", "gitleaks-csv", false},
|
||||||
|
{"bogus", "", true},
|
||||||
|
{"", "", true},
|
||||||
|
}
|
||||||
|
for _, tc := range cases {
|
||||||
|
t.Run(tc.format, func(t *testing.T) {
|
||||||
|
imp, err := selectImporter(tc.format)
|
||||||
|
if tc.wantErr {
|
||||||
|
assert.Error(t, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
require.NoError(t, err)
|
||||||
|
assert.Equal(t, tc.want, imp.Name())
|
||||||
|
// Check concrete type matches expectation via interface dispatch.
|
||||||
|
switch tc.format {
|
||||||
|
case "trufflehog":
|
||||||
|
_, ok := imp.(importer.TruffleHogImporter)
|
||||||
|
assert.True(t, ok, "expected TruffleHogImporter")
|
||||||
|
case "gitleaks":
|
||||||
|
_, ok := imp.(importer.GitleaksImporter)
|
||||||
|
assert.True(t, ok, "expected GitleaksImporter")
|
||||||
|
case "gitleaks-csv":
|
||||||
|
_, ok := imp.(importer.GitleaksCSVImporter)
|
||||||
|
assert.True(t, ok, "expected GitleaksCSVImporter")
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEngineToStorage(t *testing.T) {
|
||||||
|
ef := engine.Finding{
|
||||||
|
ProviderName: "openai",
|
||||||
|
KeyValue: "sk-abcdefghijklmnop",
|
||||||
|
KeyMasked: "sk-abcde...mnop",
|
||||||
|
Confidence: "high",
|
||||||
|
Source: "a.yml",
|
||||||
|
SourceType: "import:trufflehog",
|
||||||
|
LineNumber: 5,
|
||||||
|
Verified: true,
|
||||||
|
VerifyStatus: "live",
|
||||||
|
VerifyHTTPCode: 200,
|
||||||
|
VerifyMetadata: map[string]string{"org": "acme"},
|
||||||
|
}
|
||||||
|
sf := engineToStorage(ef)
|
||||||
|
assert.Equal(t, "openai", sf.ProviderName)
|
||||||
|
assert.Equal(t, "sk-abcdefghijklmnop", sf.KeyValue)
|
||||||
|
assert.Equal(t, "sk-abcde...mnop", sf.KeyMasked)
|
||||||
|
assert.Equal(t, "high", sf.Confidence)
|
||||||
|
assert.Equal(t, "a.yml", sf.SourcePath, "engine.Source -> storage.SourcePath")
|
||||||
|
assert.Equal(t, "import:trufflehog", sf.SourceType)
|
||||||
|
assert.Equal(t, 5, sf.LineNumber)
|
||||||
|
assert.True(t, sf.Verified)
|
||||||
|
assert.Equal(t, "live", sf.VerifyStatus)
|
||||||
|
assert.Equal(t, 200, sf.VerifyHTTPCode)
|
||||||
|
assert.Equal(t, map[string]string{"org": "acme"}, sf.VerifyMetadata)
|
||||||
|
}
|
||||||
|
|
||||||
|
// seedImportDB spins up a temp SQLite database wired through viper +
|
||||||
|
// KEYHUNTER_PASSPHRASE so runImport -> openDBWithKey resolves to it.
|
||||||
|
func seedImportDB(t *testing.T) string {
|
||||||
|
t.Helper()
|
||||||
|
dir := t.TempDir()
|
||||||
|
dbPath := filepath.Join(dir, "import.db")
|
||||||
|
|
||||||
|
viper.Reset()
|
||||||
|
viper.Set("database.path", dbPath)
|
||||||
|
t.Setenv("KEYHUNTER_PASSPHRASE", "test-pass")
|
||||||
|
|
||||||
|
t.Cleanup(func() {
|
||||||
|
viper.Reset()
|
||||||
|
importFormat = ""
|
||||||
|
})
|
||||||
|
return dbPath
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunImport_TruffleHogEndToEnd(t *testing.T) {
|
||||||
|
dbPath := seedImportDB(t)
|
||||||
|
|
||||||
|
// Use the canonical testdata shipped by pkg/importer.
|
||||||
|
importFormat = "trufflehog"
|
||||||
|
samplePath, err := filepath.Abs(filepath.Join("..", "pkg", "importer", "testdata", "trufflehog-sample.json"))
|
||||||
|
require.NoError(t, err)
|
||||||
|
_, err = os.Stat(samplePath)
|
||||||
|
require.NoError(t, err, "testdata trufflehog-sample.json must exist")
|
||||||
|
|
||||||
|
var out bytes.Buffer
|
||||||
|
importCmd.SetOut(&out)
|
||||||
|
importCmd.SetErr(&out)
|
||||||
|
|
||||||
|
// First import: all findings are new.
|
||||||
|
err = runImport(importCmd, []string{samplePath})
|
||||||
|
require.NoError(t, err)
|
||||||
|
first := out.String()
|
||||||
|
assert.Contains(t, first, "Imported 3 findings")
|
||||||
|
assert.Contains(t, first, "3 new")
|
||||||
|
assert.Contains(t, first, "0 duplicates")
|
||||||
|
|
||||||
|
// Confirm findings landed in the database.
|
||||||
|
db, err := storage.Open(dbPath)
|
||||||
|
require.NoError(t, err)
|
||||||
|
encKey, err := loadOrCreateEncKey(db, "test-pass")
|
||||||
|
require.NoError(t, err)
|
||||||
|
stored, err := db.ListFindings(encKey)
|
||||||
|
require.NoError(t, err)
|
||||||
|
assert.GreaterOrEqual(t, len(stored), 3)
|
||||||
|
require.NoError(t, db.Close())
|
||||||
|
|
||||||
|
// Second import of the same file: everything should now be a duplicate.
|
||||||
|
out.Reset()
|
||||||
|
err = runImport(importCmd, []string{samplePath})
|
||||||
|
require.NoError(t, err)
|
||||||
|
second := out.String()
|
||||||
|
assert.Contains(t, second, "Imported 3 findings")
|
||||||
|
assert.Contains(t, second, "0 new")
|
||||||
|
assert.Contains(t, second, "3 duplicates")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunImport_UnknownFormat(t *testing.T) {
|
||||||
|
_ = seedImportDB(t)
|
||||||
|
importFormat = "bogus"
|
||||||
|
err := runImport(importCmd, []string{"/nonexistent"})
|
||||||
|
require.Error(t, err)
|
||||||
|
assert.Contains(t, err.Error(), "unknown format")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRunImport_MissingFile(t *testing.T) {
|
||||||
|
_ = seedImportDB(t)
|
||||||
|
importFormat = "trufflehog"
|
||||||
|
err := runImport(importCmd, []string{filepath.Join(t.TempDir(), "does-not-exist.json")})
|
||||||
|
require.Error(t, err)
|
||||||
|
assert.Contains(t, err.Error(), "opening")
|
||||||
|
}
|
||||||
@@ -84,6 +84,28 @@ func (db *DB) GetFinding(id int64, encKey []byte) (*Finding, error) {
|
|||||||
return &f, nil
|
return &f, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// FindingExistsByKey reports whether a finding with the same provider name,
|
||||||
|
// masked key, source path, and line number already exists in the database.
|
||||||
|
// This is the identity tuple used by the import pipeline to make repeated
|
||||||
|
// imports of the same scanner output idempotent without decrypting stored
|
||||||
|
// key values.
|
||||||
|
func (db *DB) FindingExistsByKey(provider, masked, sourcePath string, line int) (bool, error) {
|
||||||
|
row := db.sql.QueryRow(
|
||||||
|
`SELECT 1 FROM findings
|
||||||
|
WHERE provider_name = ? AND key_masked = ? AND source_path = ? AND line_number = ?
|
||||||
|
LIMIT 1`,
|
||||||
|
provider, masked, sourcePath, line,
|
||||||
|
)
|
||||||
|
var one int
|
||||||
|
if err := row.Scan(&one); err != nil {
|
||||||
|
if err == sql.ErrNoRows {
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
return false, fmt.Errorf("querying finding existence: %w", err)
|
||||||
|
}
|
||||||
|
return true, nil
|
||||||
|
}
|
||||||
|
|
||||||
// DeleteFinding removes the finding with the given id.
|
// DeleteFinding removes the finding with the given id.
|
||||||
// Returns the number of rows affected (0 if no such id). A missing id is not
|
// Returns the number of rows affected (0 if no such id). A missing id is not
|
||||||
// an error — the caller decides whether to surface it.
|
// an error — the caller decides whether to surface it.
|
||||||
|
|||||||
@@ -147,3 +147,42 @@ func TestDeleteFinding_Miss(t *testing.T) {
|
|||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
assert.Equal(t, int64(0), n)
|
assert.Equal(t, int64(0), n)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestFindingExistsByKey(t *testing.T) {
|
||||||
|
db, encKey, _ := seedQueryFindings(t)
|
||||||
|
|
||||||
|
// Insert a finding with a deterministic masked key we can query against.
|
||||||
|
masked := "sk-exact...1234"
|
||||||
|
_, err := db.SaveFinding(storage.Finding{
|
||||||
|
ProviderName: "openai",
|
||||||
|
KeyValue: "sk-exact-key-value-1234",
|
||||||
|
KeyMasked: masked,
|
||||||
|
Confidence: "high",
|
||||||
|
SourcePath: "/tmp/exact.env",
|
||||||
|
SourceType: "import:trufflehog",
|
||||||
|
LineNumber: 42,
|
||||||
|
}, encKey)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
// Exact tuple hits.
|
||||||
|
exists, err := db.FindingExistsByKey("openai", masked, "/tmp/exact.env", 42)
|
||||||
|
require.NoError(t, err)
|
||||||
|
assert.True(t, exists, "exact tuple should be found")
|
||||||
|
|
||||||
|
// Any differing field misses.
|
||||||
|
miss1, err := db.FindingExistsByKey("anthropic", masked, "/tmp/exact.env", 42)
|
||||||
|
require.NoError(t, err)
|
||||||
|
assert.False(t, miss1)
|
||||||
|
|
||||||
|
miss2, err := db.FindingExistsByKey("openai", "sk-other...9999", "/tmp/exact.env", 42)
|
||||||
|
require.NoError(t, err)
|
||||||
|
assert.False(t, miss2)
|
||||||
|
|
||||||
|
miss3, err := db.FindingExistsByKey("openai", masked, "/tmp/other.env", 42)
|
||||||
|
require.NoError(t, err)
|
||||||
|
assert.False(t, miss3)
|
||||||
|
|
||||||
|
miss4, err := db.FindingExistsByKey("openai", masked, "/tmp/exact.env", 7)
|
||||||
|
require.NoError(t, err)
|
||||||
|
assert.False(t, miss4)
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user