diff --git a/cmd/import.go b/cmd/import.go
new file mode 100644
index 0000000..7574667
--- /dev/null
+++ b/cmd/import.go
@@ -0,0 +1,134 @@
+package cmd
+
+import (
+	"fmt"
+	"os"
+	"time"
+
+	"github.com/spf13/cobra"
+
+	"github.com/salvacybersec/keyhunter/pkg/engine"
+	"github.com/salvacybersec/keyhunter/pkg/importer"
+	"github.com/salvacybersec/keyhunter/pkg/storage"
+)
+
+// importFormat holds the --format flag value for `keyhunter import`. It is
+// a package-level var so tests can reset it between runs.
+var importFormat string
+
+// importCmd wires the pkg/importer adapters to the SQLite storage layer so
+// users can consolidate external scanner output (TruffleHog, Gitleaks) into
+// the unified KeyHunter database. Delivers IMP-01/02/03 end-to-end.
+var importCmd = &cobra.Command{
+	Use:   "import <file>",
+	Short: "Import findings from TruffleHog or Gitleaks output",
+	Long: `Import scan output from external secret scanners into the KeyHunter database.
+
+Supported formats:
+  trufflehog    TruffleHog v3 JSON (trufflehog ... --json)
+  gitleaks      Gitleaks native JSON (gitleaks detect -f json)
+  gitleaks-csv  Gitleaks CSV (gitleaks detect -f csv)
+
+Imports are idempotent: repeated invocations against the same file deduplicate
+against both in-file duplicates and findings already persisted in the database.`,
+	Args: cobra.ExactArgs(1),
+	RunE: runImport,
+}
+
+func init() {
+	// NOTE(review): without this registration the command is unreachable from
+	// the CLI; tests passed only because they invoke runImport directly.
+	rootCmd.AddCommand(importCmd)
+	importCmd.Flags().StringVar(&importFormat, "format", "", "input format: trufflehog | gitleaks | gitleaks-csv (required)")
+	_ = importCmd.MarkFlagRequired("format")
+}
+
+// runImport is the RunE for importCmd. Extracted so tests can exercise it
+// without going through cobra's full argument parsing stack.
+func runImport(cmd *cobra.Command, args []string) error {
+	path := args[0]
+
+	imp, err := selectImporter(importFormat)
+	if err != nil {
+		return err
+	}
+
+	f, err := os.Open(path)
+	if err != nil {
+		return fmt.Errorf("opening %s: %w", path, err)
+	}
+	defer f.Close()
+
+	findings, err := imp.Import(f)
+	if err != nil {
+		return fmt.Errorf("parsing %s output: %w", imp.Name(), err)
+	}
+
+	total := len(findings)
+	unique, inFileDupes := importer.Dedup(findings)
+
+	db, encKey, err := openDBWithKey()
+	if err != nil {
+		return err
+	}
+	defer db.Close()
+
+	newCount := 0
+	dbDupes := 0
+	for _, finding := range unique {
+		exists, err := db.FindingExistsByKey(finding.ProviderName, finding.KeyMasked, finding.Source, finding.LineNumber)
+		if err != nil {
+			return fmt.Errorf("checking existing findings: %w", err)
+		}
+		if exists {
+			dbDupes++
+			continue
+		}
+		sf := engineToStorage(finding)
+		if _, err := db.SaveFinding(sf, encKey); err != nil {
+			return fmt.Errorf("saving finding: %w", err)
+		}
+		newCount++
+	}
+
+	totalDupes := inFileDupes + dbDupes
+	fmt.Fprintf(cmd.OutOrStdout(), "Imported %d findings (%d new, %d duplicates)\n", total, newCount, totalDupes)
+	return nil
+}
+
+// selectImporter resolves the --format flag to a concrete Importer.
+func selectImporter(format string) (importer.Importer, error) {
+	switch format {
+	case "trufflehog":
+		return importer.TruffleHogImporter{}, nil
+	case "gitleaks":
+		return importer.GitleaksImporter{}, nil
+	case "gitleaks-csv":
+		return importer.GitleaksCSVImporter{}, nil
+	default:
+		return nil, fmt.Errorf("unknown format %q (want trufflehog | gitleaks | gitleaks-csv)", format)
+	}
+}
+
+// engineToStorage converts an engine.Finding (importer output shape) into a
+// storage.Finding suitable for db.SaveFinding. The field name difference
+// between engine.Source and storage.SourcePath is the main reason this
+// conversion exists. DetectedAt is defaulted to now if unset so imported
+// records always carry a timestamp.
+func engineToStorage(f engine.Finding) storage.Finding {
+	if f.DetectedAt.IsZero() {
+		f.DetectedAt = time.Now()
+	}
+	return storage.Finding{
+		ProviderName:   f.ProviderName,
+		KeyValue:       f.KeyValue,
+		KeyMasked:      f.KeyMasked,
+		Confidence:     f.Confidence,
+		SourcePath:     f.Source,
+		SourceType:     f.SourceType,
+		LineNumber:     f.LineNumber,
+		DetectedAt:     f.DetectedAt,
+		Verified:       f.Verified,
+		VerifyStatus:   f.VerifyStatus,
+		VerifyHTTPCode: f.VerifyHTTPCode,
+		VerifyMetadata: f.VerifyMetadata,
+	}
+}
diff --git a/cmd/import_test.go b/cmd/import_test.go
new file mode 100644
index 0000000..589f6ca
--- /dev/null
+++ b/cmd/import_test.go
@@ -0,0 +1,156 @@
+package cmd
+
+import (
+	"bytes"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/salvacybersec/keyhunter/pkg/engine"
+	"github.com/salvacybersec/keyhunter/pkg/importer"
+	"github.com/salvacybersec/keyhunter/pkg/storage"
+	"github.com/spf13/viper"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestSelectImporter(t *testing.T) {
+	cases := []struct {
+		format  string
+		want    string
+		wantErr bool
+	}{
+		{"trufflehog", "trufflehog", false},
+		{"gitleaks", "gitleaks", false},
+		{"gitleaks-csv", "gitleaks-csv", false},
+		{"bogus", "", true},
+		{"", "", true},
+	}
+	for _, tc := range cases {
+		t.Run(tc.format, func(t *testing.T) {
+			imp, err := selectImporter(tc.format)
+			if tc.wantErr {
+				assert.Error(t, err)
+				return
+			}
+			require.NoError(t, err)
+			assert.Equal(t, tc.want, imp.Name())
+			// Check concrete type matches expectation via interface dispatch.
+			switch tc.format {
+			case "trufflehog":
+				_, ok := imp.(importer.TruffleHogImporter)
+				assert.True(t, ok, "expected TruffleHogImporter")
+			case "gitleaks":
+				_, ok := imp.(importer.GitleaksImporter)
+				assert.True(t, ok, "expected GitleaksImporter")
+			case "gitleaks-csv":
+				_, ok := imp.(importer.GitleaksCSVImporter)
+				assert.True(t, ok, "expected GitleaksCSVImporter")
+			}
+		})
+	}
+}
+
+func TestEngineToStorage(t *testing.T) {
+	ef := engine.Finding{
+		ProviderName:   "openai",
+		KeyValue:       "sk-abcdefghijklmnop",
+		KeyMasked:      "sk-abcde...mnop",
+		Confidence:     "high",
+		Source:         "a.yml",
+		SourceType:     "import:trufflehog",
+		LineNumber:     5,
+		Verified:       true,
+		VerifyStatus:   "live",
+		VerifyHTTPCode: 200,
+		VerifyMetadata: map[string]string{"org": "acme"},
+	}
+	sf := engineToStorage(ef)
+	assert.Equal(t, "openai", sf.ProviderName)
+	assert.Equal(t, "sk-abcdefghijklmnop", sf.KeyValue)
+	assert.Equal(t, "sk-abcde...mnop", sf.KeyMasked)
+	assert.Equal(t, "high", sf.Confidence)
+	assert.Equal(t, "a.yml", sf.SourcePath, "engine.Source -> storage.SourcePath")
+	assert.Equal(t, "import:trufflehog", sf.SourceType)
+	assert.Equal(t, 5, sf.LineNumber)
+	assert.True(t, sf.Verified)
+	assert.Equal(t, "live", sf.VerifyStatus)
+	assert.Equal(t, 200, sf.VerifyHTTPCode)
+	assert.Equal(t, map[string]string{"org": "acme"}, sf.VerifyMetadata)
+}
+
+// seedImportDB spins up a temp SQLite database wired through viper +
+// KEYHUNTER_PASSPHRASE so runImport -> openDBWithKey resolves to it.
+func seedImportDB(t *testing.T) string {
+	t.Helper()
+	dir := t.TempDir()
+	dbPath := filepath.Join(dir, "import.db")
+
+	viper.Reset()
+	viper.Set("database.path", dbPath)
+	t.Setenv("KEYHUNTER_PASSPHRASE", "test-pass")
+
+	t.Cleanup(func() {
+		viper.Reset()
+		importFormat = ""
+	})
+	return dbPath
+}
+
+func TestRunImport_TruffleHogEndToEnd(t *testing.T) {
+	dbPath := seedImportDB(t)
+
+	// Use the canonical testdata shipped by pkg/importer.
+	importFormat = "trufflehog"
+	samplePath, err := filepath.Abs(filepath.Join("..", "pkg", "importer", "testdata", "trufflehog-sample.json"))
+	require.NoError(t, err)
+	_, err = os.Stat(samplePath)
+	require.NoError(t, err, "testdata trufflehog-sample.json must exist")
+
+	var out bytes.Buffer
+	importCmd.SetOut(&out)
+	importCmd.SetErr(&out)
+
+	// First import: all findings are new.
+	err = runImport(importCmd, []string{samplePath})
+	require.NoError(t, err)
+	first := out.String()
+	assert.Contains(t, first, "Imported 3 findings")
+	assert.Contains(t, first, "3 new")
+	assert.Contains(t, first, "0 duplicates")
+
+	// Confirm findings landed in the database.
+	db, err := storage.Open(dbPath)
+	require.NoError(t, err)
+	encKey, err := loadOrCreateEncKey(db, "test-pass")
+	require.NoError(t, err)
+	stored, err := db.ListFindings(encKey)
+	require.NoError(t, err)
+	assert.GreaterOrEqual(t, len(stored), 3)
+	require.NoError(t, db.Close())
+
+	// Second import of the same file: everything should now be a duplicate.
+	out.Reset()
+	err = runImport(importCmd, []string{samplePath})
+	require.NoError(t, err)
+	second := out.String()
+	assert.Contains(t, second, "Imported 3 findings")
+	assert.Contains(t, second, "0 new")
+	assert.Contains(t, second, "3 duplicates")
+}
+
+func TestRunImport_UnknownFormat(t *testing.T) {
+	_ = seedImportDB(t)
+	importFormat = "bogus"
+	err := runImport(importCmd, []string{"/nonexistent"})
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "unknown format")
+}
+
+func TestRunImport_MissingFile(t *testing.T) {
+	_ = seedImportDB(t)
+	importFormat = "trufflehog"
+	err := runImport(importCmd, []string{filepath.Join(t.TempDir(), "does-not-exist.json")})
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "opening")
+}
diff --git a/pkg/storage/queries.go b/pkg/storage/queries.go
index d4d85df..51ecd00 100644
--- a/pkg/storage/queries.go
+++ b/pkg/storage/queries.go
@@ -84,6 +84,28 @@ func (db *DB) GetFinding(id int64, encKey []byte) (*Finding, error) {
 	return &f, nil
 }
 
+// FindingExistsByKey reports whether a finding with the same provider name,
+// masked key, source path, and line number already exists in the database.
+// This is the identity tuple used by the import pipeline to make repeated
+// imports of the same scanner output idempotent without decrypting stored
+// key values.
+func (db *DB) FindingExistsByKey(provider, masked, sourcePath string, line int) (bool, error) {
+	row := db.sql.QueryRow(
+		`SELECT 1 FROM findings
+		 WHERE provider_name = ? AND key_masked = ? AND source_path = ? AND line_number = ?
+		 LIMIT 1`,
+		provider, masked, sourcePath, line,
+	)
+	var one int
+	if err := row.Scan(&one); err != nil {
+		if err == sql.ErrNoRows {
+			return false, nil
+		}
+		return false, fmt.Errorf("querying finding existence: %w", err)
+	}
+	return true, nil
+}
+
 // DeleteFinding removes the finding with the given id.
 // Returns the number of rows affected (0 if no such id). A missing id is not
 // an error — the caller decides whether to surface it.
diff --git a/pkg/storage/queries_test.go b/pkg/storage/queries_test.go
index d8ae8ad..8bc5ffc 100644
--- a/pkg/storage/queries_test.go
+++ b/pkg/storage/queries_test.go
@@ -147,3 +147,42 @@ func TestDeleteFinding_Miss(t *testing.T) {
 	require.NoError(t, err)
 	assert.Equal(t, int64(0), n)
 }
+
+func TestFindingExistsByKey(t *testing.T) {
+	db, encKey, _ := seedQueryFindings(t)
+
+	// Insert a finding with a deterministic masked key we can query against.
+	masked := "sk-exact...1234"
+	_, err := db.SaveFinding(storage.Finding{
+		ProviderName: "openai",
+		KeyValue:     "sk-exact-key-value-1234",
+		KeyMasked:    masked,
+		Confidence:   "high",
+		SourcePath:   "/tmp/exact.env",
+		SourceType:   "import:trufflehog",
+		LineNumber:   42,
+	}, encKey)
+	require.NoError(t, err)
+
+	// Exact tuple hits.
+	exists, err := db.FindingExistsByKey("openai", masked, "/tmp/exact.env", 42)
+	require.NoError(t, err)
+	assert.True(t, exists, "exact tuple should be found")
+
+	// Any differing field misses.
+	miss1, err := db.FindingExistsByKey("anthropic", masked, "/tmp/exact.env", 42)
+	require.NoError(t, err)
+	assert.False(t, miss1)
+
+	miss2, err := db.FindingExistsByKey("openai", "sk-other...9999", "/tmp/exact.env", 42)
+	require.NoError(t, err)
+	assert.False(t, miss2)
+
+	miss3, err := db.FindingExistsByKey("openai", masked, "/tmp/other.env", 42)
+	require.NoError(t, err)
+	assert.False(t, miss3)
+
+	miss4, err := db.FindingExistsByKey("openai", masked, "/tmp/exact.env", 7)
+	require.NoError(t, err)
+	assert.False(t, miss4)
+}